Merge branch 'msr-bitmaps' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author	Radim Krčmář <rkrcmar@redhat.com>
Fri, 2 Feb 2018 17:26:58 +0000 (18:26 +0100)
committer	Radim Krčmář <rkrcmar@redhat.com>
Fri, 9 Feb 2018 20:35:35 +0000 (21:35 +0100)
This topic branch allocates separate MSR bitmaps for each VCPU.
This is required for the IBRS enablement to choose, on a per-VM
basis, whether to intercept the SPEC_CTRL and PRED_CMD MSRs;
the IBRS enablement comes in through the tip tree.
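
As a quick illustration (not part of this merge; the IBRS wiring itself
lands via the tip tree), a per-VCPU bitmap is what makes a per-guest
interception decision possible at all. In the sketch below,
spec_ctrl_used is a hypothetical per-vCPU flag standing in for the real
IBRS logic, and MSR_IA32_SPEC_CTRL is the SPEC_CTRL MSR's define from the
tip-tree changes; vmx_disable_intercept_for_msr() and
vmx->vmcs01.msr_bitmap are taken from this merge:

	/* Hypothetical: pass SPEC_CTRL through for this vCPU only once
	 * its guest has written the MSR, leaving every other vCPU's
	 * bitmap -- and thus its interception policy -- untouched.
	 */
	static void example_pass_through_spec_ctrl(struct vcpu_vmx *vmx)
	{
		if (vmx->spec_ctrl_used)	/* assumed per-vCPU flag */
			vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
						      MSR_IA32_SPEC_CTRL,
						      MSR_TYPE_RW);
	}

With the old globally shared bitmaps, clearing that bit would have
disabled the intercept for every VM on the host at once.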

arch/x86/kvm/vmx.c

diff --combined arch/x86/kvm/vmx.c
index bb5b4888505bdccc4a505aa1adcde95d9edd62f8,896af99a8606f3efc1f12f55515528120adcbcdb..9973a301364e0e0c8aa7d8be23f0c0d8e69ac3c7
@@@ -111,6 -111,14 +111,14 @@@ static u64 __read_mostly host_xss
  static bool __read_mostly enable_pml = 1;
  module_param_named(pml, enable_pml, bool, S_IRUGO);
  
+ #define MSR_TYPE_R    1
+ #define MSR_TYPE_W    2
+ #define MSR_TYPE_RW   3
+ #define MSR_BITMAP_MODE_X2APIC                1
+ #define MSR_BITMAP_MODE_X2APIC_APICV  2
+ #define MSR_BITMAP_MODE_LM            4
  #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
  
  /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
@@@ -209,6 -217,7 +217,7 @@@ struct loaded_vmcs 
        int soft_vnmi_blocked;
        ktime_t entry_time;
        s64 vnmi_blocked_time;
+       unsigned long *msr_bitmap;
        struct list_head loaded_vmcss_on_cpu_link;
  };
  
@@@ -408,12 -417,6 +417,12 @@@ struct __packed vmcs12 
   */
  #define VMCS12_SIZE 0x1000
  
 +/*
 + * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
 + * supported VMCS12 field encoding.
 + */
 +#define VMCS12_MAX_FIELD_INDEX 0x17
 +
  /*
   * The nested_vmx structure is part of vcpu_vmx, and holds information we need
   * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@@ -437,7 -440,6 +446,7 @@@ struct nested_vmx 
         * data hold by vmcs12
         */
        bool sync_shadow_vmcs;
 +      bool dirty_vmcs12;
  
        bool change_vmcs01_virtual_x2apic_mode;
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool pi_pending;
        u16 posted_intr_nv;
  
-       unsigned long *msr_bitmap;
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
  
@@@ -580,6 -580,7 +587,7 @@@ struct vcpu_vmx 
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
        u8                    fail;
+       u8                    msr_bitmap_mode;
        u32                   exit_intr_info;
        u32                   idt_vectoring_info;
        ulong                 rflags;
  
        u32 host_pkru;
  
 +      unsigned long host_debugctlmsr;
 +
        /*
         * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
         * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
@@@ -688,24 -687,67 +696,24 @@@ static struct pi_desc *vcpu_to_pi_desc(
        return &(to_vmx(vcpu)->pi_desc);
  }
  
 +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
  #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
 -#define FIELD(number, name)   [number] = VMCS12_OFFSET(name)
 -#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
 -                              [number##_HIGH] = VMCS12_OFFSET(name)+4
 +#define FIELD(number, name)   [ROL16(number, 6)] = VMCS12_OFFSET(name)
 +#define FIELD64(number, name)                                         \
 +      FIELD(number, name),                                            \
 +      [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
  
  
 -static unsigned long shadow_read_only_fields[] = {
 -      /*
 -       * We do NOT shadow fields that are modified when L0
 -       * traps and emulates any vmx instruction (e.g. VMPTRLD,
 -       * VMXON...) executed by L1.
 -       * For example, VM_INSTRUCTION_ERROR is read
 -       * by L1 if a vmx instruction fails (part of the error path).
 -       * Note the code assumes this logic. If for some reason
 -       * we start shadowing these fields then we need to
 -       * force a shadow sync when L0 emulates vmx instructions
 -       * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
 -       * by nested_vmx_failValid)
 -       */
 -      VM_EXIT_REASON,
 -      VM_EXIT_INTR_INFO,
 -      VM_EXIT_INSTRUCTION_LEN,
 -      IDT_VECTORING_INFO_FIELD,
 -      IDT_VECTORING_ERROR_CODE,
 -      VM_EXIT_INTR_ERROR_CODE,
 -      EXIT_QUALIFICATION,
 -      GUEST_LINEAR_ADDRESS,
 -      GUEST_PHYSICAL_ADDRESS
 +static u16 shadow_read_only_fields[] = {
 +#define SHADOW_FIELD_RO(x) x,
 +#include "vmx_shadow_fields.h"
  };
  static int max_shadow_read_only_fields =
        ARRAY_SIZE(shadow_read_only_fields);
  
 -static unsigned long shadow_read_write_fields[] = {
 -      TPR_THRESHOLD,
 -      GUEST_RIP,
 -      GUEST_RSP,
 -      GUEST_CR0,
 -      GUEST_CR3,
 -      GUEST_CR4,
 -      GUEST_INTERRUPTIBILITY_INFO,
 -      GUEST_RFLAGS,
 -      GUEST_CS_SELECTOR,
 -      GUEST_CS_AR_BYTES,
 -      GUEST_CS_LIMIT,
 -      GUEST_CS_BASE,
 -      GUEST_ES_BASE,
 -      GUEST_BNDCFGS,
 -      CR0_GUEST_HOST_MASK,
 -      CR0_READ_SHADOW,
 -      CR4_READ_SHADOW,
 -      TSC_OFFSET,
 -      EXCEPTION_BITMAP,
 -      CPU_BASED_VM_EXEC_CONTROL,
 -      VM_ENTRY_EXCEPTION_ERROR_CODE,
 -      VM_ENTRY_INTR_INFO_FIELD,
 -      VM_ENTRY_INSTRUCTION_LEN,
 -      VM_ENTRY_EXCEPTION_ERROR_CODE,
 -      HOST_FS_BASE,
 -      HOST_GS_BASE,
 -      HOST_FS_SELECTOR,
 -      HOST_GS_SELECTOR
 +static u16 shadow_read_write_fields[] = {
 +#define SHADOW_FIELD_RW(x) x,
 +#include "vmx_shadow_fields.h"
  };
  static int max_shadow_read_write_fields =
        ARRAY_SIZE(shadow_read_write_fields);
@@@ -856,13 -898,9 +864,13 @@@ static const unsigned short vmcs_field_
  
  static inline short vmcs_field_to_offset(unsigned long field)
  {
 -      BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
 +      unsigned index;
 +
 +      if (field >> 15)
 +              return -ENOENT;
  
 -      if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
 +      index = ROL16(field, 6);
 +      if (index >= ARRAY_SIZE(vmcs_field_to_offset_table))
                return -ENOENT;
  
        /*
         * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
         * generic mechanism.
         */
        asm("lfence");
  
 -      if (vmcs_field_to_offset_table[field] == 0)
 +      if (vmcs_field_to_offset_table[index] == 0)
                return -ENOENT;
  
 -      return vmcs_field_to_offset_table[field];
 +      return vmcs_field_to_offset_table[index];
  }
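
A worked example of the ROL16 compression used above, under the SDM's
field-encoding layout (bit 0 access type, bits 9:1 index, bits 11:10
type, bits 14:13 width): rotating left by 6 moves the sparse high bits
into the low positions, giving a dense table index. The check below is a
standalone sketch, not kernel code:

	#include <stdint.h>
	#include <assert.h>

	#define ROL16(val, n) ((uint16_t)(((uint16_t)(val) << (n)) | \
				((uint16_t)(val) >> (16 - (n)))))

	int main(void)
	{
		/* VM_EXIT_REASON is encoded as 0x4402: index 1, type 1,
		 * width 2 (u32).  ROL16(0x4402, 6)
		 *   = ((0x4402 << 6) | (0x4402 >> 10)) & 0xffff
		 *   = 0x0080 | 0x0011 = 0x0091.
		 */
		assert(ROL16(0x4402, 6) == 0x0091);
		return 0;
	}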
  
  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
@@@ -897,6 -935,7 +905,7 @@@ static bool vmx_get_nmi_mask(struct kvm
  static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
  static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
                                            u16 error_code);
+ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
  
  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@@ -914,12 -953,8 +923,6 @@@ static DEFINE_PER_CPU(struct list_head
  static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
  
  enum {
-       VMX_MSR_BITMAP_LEGACY,
-       VMX_MSR_BITMAP_LONGMODE,
-       VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
-       VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
-       VMX_MSR_BITMAP_LEGACY_X2APIC,
-       VMX_MSR_BITMAP_LONGMODE_X2APIC,
 -      VMX_IO_BITMAP_A,
 -      VMX_IO_BITMAP_B,
        VMX_VMREAD_BITMAP,
        VMX_VMWRITE_BITMAP,
        VMX_BITMAP_NR
  };
  
  static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
  
- #define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
- #define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
- #define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
- #define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
- #define vmx_msr_bitmap_legacy_x2apic         (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
- #define vmx_msr_bitmap_longmode_x2apic       (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
 -#define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
 -#define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
  #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
  #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
  
@@@ -2291,7 -2322,6 +2288,7 @@@ static void vmx_vcpu_load(struct kvm_vc
  
        vmx_vcpu_pi_load(vcpu, cpu);
        vmx->host_pkru = read_pkru();
 +      vmx->host_debugctlmsr = get_debugctlmsr();
  }
  
  static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@@ -2531,36 -2561,6 +2528,6 @@@ static void move_msr_up(struct vcpu_vm
        vmx->guest_msrs[from] = tmp;
  }
  
- static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
- {
-       unsigned long *msr_bitmap;
-       if (is_guest_mode(vcpu))
-               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
-       else if (cpu_has_secondary_exec_ctrls() &&
-                (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
-                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-               if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
-                       if (is_long_mode(vcpu))
-                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
-                       else
-                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
-               } else {
-                       if (is_long_mode(vcpu))
-                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-                       else
-                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
-               }
-       } else {
-               if (is_long_mode(vcpu))
-                       msr_bitmap = vmx_msr_bitmap_longmode;
-               else
-                       msr_bitmap = vmx_msr_bitmap_legacy;
-       }
-       vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
- }
  /*
   * Set up the vmcs to automatically save and restore system
   * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@@@ -2601,7 -2601,7 +2568,7 @@@ static void setup_msrs(struct vcpu_vmx 
        vmx->save_nmsrs = save_nmsrs;
  
        if (cpu_has_vmx_msr_bitmap())
-               vmx_set_msr_bitmap(&vmx->vcpu);
+               vmx_update_msr_bitmap(&vmx->vcpu);
  }
  
  /*
@@@ -2879,7 -2879,7 +2846,7 @@@ static void nested_vmx_setup_ctls_msrs(
        rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
  
        /* highest index: VMX_PREEMPTION_TIMER_VALUE */
 -      vmx->nested.nested_vmx_vmcs_enum = 0x2e;
 +      vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
  }
  
  /*
@@@ -3215,7 -3215,6 +3182,7 @@@ static inline bool vmx_feature_control_
   */
  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  {
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct shared_msr_entry *msr;
  
        switch (msr_info->index) {
                msr_info->data = vmcs_readl(GUEST_GS_BASE);
                break;
        case MSR_KERNEL_GS_BASE:
 -              vmx_load_host_state(to_vmx(vcpu));
 -              msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
 +              vmx_load_host_state(vmx);
 +              msr_info->data = vmx->msr_guest_kernel_gs_base;
                break;
  #endif
        case MSR_EFER:
                break;
        case MSR_IA32_MCG_EXT_CTL:
                if (!msr_info->host_initiated &&
 -                  !(to_vmx(vcpu)->msr_ia32_feature_control &
 +                  !(vmx->msr_ia32_feature_control &
                      FEATURE_CONTROL_LMCE))
                        return 1;
                msr_info->data = vcpu->arch.mcg_ext_ctl;
                break;
        case MSR_IA32_FEATURE_CONTROL:
 -              msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
 +              msr_info->data = vmx->msr_ia32_feature_control;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
                /* Otherwise falls through */
        default:
 -              msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
 +              msr = find_msr_entry(vmx, msr_info->index);
                if (msr) {
                        msr_info->data = msr->data;
                        break;
@@@ -3599,7 -3598,7 +3566,7 @@@ static __init int setup_vmcs_config(str
  #endif
              CPU_BASED_CR3_LOAD_EXITING |
              CPU_BASED_CR3_STORE_EXITING |
 -            CPU_BASED_USE_IO_BITMAPS |
 +            CPU_BASED_UNCOND_IO_EXITING |
              CPU_BASED_MOV_DR_EXITING |
              CPU_BASED_USE_TSC_OFFSETING |
              CPU_BASED_INVLPG_EXITING |
                        SECONDARY_EXEC_ENABLE_EPT |
                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
                        SECONDARY_EXEC_PAUSE_LOOP_EXITING |
 +                      SECONDARY_EXEC_DESC |
                        SECONDARY_EXEC_RDTSCP |
                        SECONDARY_EXEC_ENABLE_INVPCID |
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@@ -3798,11 -3796,6 +3765,6 @@@ static struct vmcs *alloc_vmcs_cpu(int 
        return vmcs;
  }
  
- static struct vmcs *alloc_vmcs(void)
- {
-       return alloc_vmcs_cpu(raw_smp_processor_id());
- }
  static void free_vmcs(struct vmcs *vmcs)
  {
        free_pages((unsigned long)vmcs, vmcs_config.order);
@@@ -3818,20 -3811,36 +3780,36 @@@ static void free_loaded_vmcs(struct loa
        loaded_vmcs_clear(loaded_vmcs);
        free_vmcs(loaded_vmcs->vmcs);
        loaded_vmcs->vmcs = NULL;
+       if (loaded_vmcs->msr_bitmap)
+               free_page((unsigned long)loaded_vmcs->msr_bitmap);
        WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
  }
  
- static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx)
+ static struct vmcs *alloc_vmcs(void)
  {
-       struct loaded_vmcs *loaded_vmcs = &vmx->nested.vmcs02;
+       return alloc_vmcs_cpu(raw_smp_processor_id());
+ }
  
-       /*
-        * Just leak the VMCS02 if the WARN triggers. Better than
-        * a use-after-free.
-        */
-       if (WARN_ON(vmx->loaded_vmcs == loaded_vmcs))
-               return;
+ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+ {
+       loaded_vmcs->vmcs = alloc_vmcs();
+       if (!loaded_vmcs->vmcs)
+               return -ENOMEM;
+       loaded_vmcs->shadow_vmcs = NULL;
+       loaded_vmcs_init(loaded_vmcs);
+       if (cpu_has_vmx_msr_bitmap()) {
+               loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+               if (!loaded_vmcs->msr_bitmap)
+                       goto out_vmcs;
+               memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+       }
+       return 0;
+ out_vmcs:
        free_loaded_vmcs(loaded_vmcs);
+       return -ENOMEM;
  }
  
  static void free_kvm_area(void)
        }
  }
  
 -enum vmcs_field_type {
 -      VMCS_FIELD_TYPE_U16 = 0,
 -      VMCS_FIELD_TYPE_U64 = 1,
 -      VMCS_FIELD_TYPE_U32 = 2,
 -      VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
 +enum vmcs_field_width {
 +      VMCS_FIELD_WIDTH_U16 = 0,
 +      VMCS_FIELD_WIDTH_U64 = 1,
 +      VMCS_FIELD_WIDTH_U32 = 2,
 +      VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
  };
  
 -static inline int vmcs_field_type(unsigned long field)
 +static inline int vmcs_field_width(unsigned long field)
  {
        if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
 -              return VMCS_FIELD_TYPE_U32;
 +              return VMCS_FIELD_WIDTH_U32;
        return (field >> 13) & 0x3 ;
  }
  
@@@ -3867,66 -3876,43 +3845,66 @@@ static void init_vmcs_shadow_fields(voi
  {
        int i, j;
  
 -      /* No checks for read only fields yet */
 +      for (i = j = 0; i < max_shadow_read_only_fields; i++) {
 +              u16 field = shadow_read_only_fields[i];
 +              if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 +                  (i + 1 == max_shadow_read_only_fields ||
 +                   shadow_read_only_fields[i + 1] != field + 1))
 +                      pr_err("Missing field from shadow_read_only_field %x\n",
 +                             field + 1);
 +
 +              clear_bit(field, vmx_vmread_bitmap);
 +#ifdef CONFIG_X86_64
 +              if (field & 1)
 +                      continue;
 +#endif
 +              if (j < i)
 +                      shadow_read_only_fields[j] = field;
 +              j++;
 +      }
 +      max_shadow_read_only_fields = j;
  
        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
 -              switch (shadow_read_write_fields[i]) {
 -              case GUEST_BNDCFGS:
 -                      if (!kvm_mpx_supported())
 +              u16 field = shadow_read_write_fields[i];
 +              if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 +                  (i + 1 == max_shadow_read_write_fields ||
 +                   shadow_read_write_fields[i + 1] != field + 1))
 +                      pr_err("Missing field from shadow_read_write_field %x\n",
 +                             field + 1);
 +
 +              /*
 +               * PML and the preemption timer can be emulated, but the
 +               * processor cannot vmwrite to fields that don't exist
 +               * on bare metal.
 +               */
 +              switch (field) {
 +              case GUEST_PML_INDEX:
 +                      if (!cpu_has_vmx_pml())
 +                              continue;
 +                      break;
 +              case VMX_PREEMPTION_TIMER_VALUE:
 +                      if (!cpu_has_vmx_preemption_timer())
 +                              continue;
 +                      break;
 +              case GUEST_INTR_STATUS:
 +                      if (!cpu_has_vmx_apicv())
                                continue;
                        break;
                default:
                        break;
                }
  
 +              clear_bit(field, vmx_vmwrite_bitmap);
 +              clear_bit(field, vmx_vmread_bitmap);
 +#ifdef CONFIG_X86_64
 +              if (field & 1)
 +                      continue;
 +#endif
                if (j < i)
 -                      shadow_read_write_fields[j] =
 -                              shadow_read_write_fields[i];
 +                      shadow_read_write_fields[j] = field;
                j++;
        }
        max_shadow_read_write_fields = j;
 -
 -      /* shadowed fields guest access without vmexit */
 -      for (i = 0; i < max_shadow_read_write_fields; i++) {
 -              unsigned long field = shadow_read_write_fields[i];
 -
 -              clear_bit(field, vmx_vmwrite_bitmap);
 -              clear_bit(field, vmx_vmread_bitmap);
 -              if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
 -                      clear_bit(field + 1, vmx_vmwrite_bitmap);
 -                      clear_bit(field + 1, vmx_vmread_bitmap);
 -              }
 -      }
 -      for (i = 0; i < max_shadow_read_only_fields; i++) {
 -              unsigned long field = shadow_read_only_fields[i];
 -
 -              clear_bit(field, vmx_vmread_bitmap);
 -              if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
 -                      clear_bit(field + 1, vmx_vmread_bitmap);
 -      }
  }
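
A note on the "field & 1" skip above (an editorial gloss, not from the
patch description): odd encodings are the _HIGH halves of u64 fields,
e.g. GUEST_PHYSICAL_ADDRESS = 0x2400 and GUEST_PHYSICAL_ADDRESS_HIGH =
0x2401. Both bits are cleared in the vmread/vmwrite bitmaps, but on
64-bit hosts the odd entry is dropped from the table itself, since a
single 64-bit __vmcs_readl()/__vmcs_writel() of 0x2400 moves the whole
field and the copy loops further down never need to touch 0x2401.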
  
  static __init int alloc_kvm_area(void)
@@@ -4139,10 -4125,9 +4117,10 @@@ static void exit_lmode(struct kvm_vcpu 
  
  #endif
  
 -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
 +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
 +                              bool invalidate_gpa)
  {
 -      if (enable_ept) {
 +      if (enable_ept && (invalidate_gpa || !enable_vpid)) {
                if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                        return;
                ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
        }
  }
  
 -static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 +static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
  {
 -      __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
 +      __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
  }
  
  static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
  {
        if (enable_ept)
 -              vmx_flush_tlb(vcpu);
 +              vmx_flush_tlb(vcpu, true);
  }
  
  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@@ -4357,7 -4342,7 +4335,7 @@@ static void vmx_set_cr3(struct kvm_vcp
                ept_load_pdptrs(vcpu);
        }
  
 -      vmx_flush_tlb(vcpu);
 +      vmx_flush_tlb(vcpu, true);
        vmcs_writel(GUEST_CR3, guest_cr3);
  }
  
@@@ -4374,14 -4359,6 +4352,14 @@@ static int vmx_set_cr4(struct kvm_vcpu 
                (to_vmx(vcpu)->rmode.vm86_active ?
                 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
  
 +      if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
 +              vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 +                            SECONDARY_EXEC_DESC);
 +              hw_cr4 &= ~X86_CR4_UMIP;
 +      } else
 +              vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
 +                              SECONDARY_EXEC_DESC);
 +
        if (cr4 & X86_CR4_VMXE) {
                /*
                 * To use VMXON (and later other VMX instructions), a guest
@@@ -4924,10 -4901,8 +4902,8 @@@ static void free_vpid(int vpid
        spin_unlock(&vmx_vpid_lock);
  }
  
- #define MSR_TYPE_R    1
- #define MSR_TYPE_W    2
- static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-                                               u32 msr, int type)
+ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+                                                         u32 msr, int type)
  {
        int f = sizeof(unsigned long);
  
        }
  }
  
+ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+                                                        u32 msr, int type)
+ {
+       int f = sizeof(unsigned long);
+       if (!cpu_has_vmx_msr_bitmap())
+               return;
+       /*
+        * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+        * have the write-low and read-high bitmap offsets the wrong way round.
+        * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+        */
+       if (msr <= 0x1fff) {
+               if (type & MSR_TYPE_R)
+                       /* read-low */
+                       __set_bit(msr, msr_bitmap + 0x000 / f);
+               if (type & MSR_TYPE_W)
+                       /* write-low */
+                       __set_bit(msr, msr_bitmap + 0x800 / f);
+       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+               msr &= 0x1fff;
+               if (type & MSR_TYPE_R)
+                       /* read-high */
+                       __set_bit(msr, msr_bitmap + 0x400 / f);
+               if (type & MSR_TYPE_W)
+                       /* write-high */
+                       __set_bit(msr, msr_bitmap + 0xc00 / f);
+       }
+ }
+ static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+                                                     u32 msr, int type, bool value)
+ {
+       if (value)
+               vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+       else
+               vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+ }
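
The offset arithmetic above, worked through for one concrete MSR (an
editorial example, matching the layout in the comment): the 4 KiB bitmap
page holds four 1 KiB quarters of 8192 bits each: read-low at 0x000,
read-high at 0x400, write-low at 0x800, write-high at 0xc00, one bit per
MSR. Disabling the write intercept for MSR_KERNEL_GS_BASE (0xc0000102)
therefore masks the MSR down to 0x102 and clears

	byte 0xc00 + 0x102 / 8 = 0xc20, bit 0x102 % 8 = 2

which is exactly what __clear_bit(msr, msr_bitmap + 0xc00 / f) computes
once the bit offset is scaled by f = sizeof(unsigned long).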
  /*
   * If a msr is allowed by L0, we should check whether it is allowed by L1.
   * The corresponding bit will be cleared only if both of L0 and L1 allow it.
@@@ -4971,6 -4990,11 +4991,6 @@@ static void nested_vmx_disable_intercep
  {
        int f = sizeof(unsigned long);
  
 -      if (!cpu_has_vmx_msr_bitmap()) {
 -              WARN_ON(1);
 -              return;
 -      }
 -
        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
        }
  }
  
- static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
  {
-       if (!longmode_only)
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
-                                               msr, MSR_TYPE_R | MSR_TYPE_W);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
-                                               msr, MSR_TYPE_R | MSR_TYPE_W);
+       u8 mode = 0;
+       if (cpu_has_secondary_exec_ctrls() &&
+           (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+            SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+               mode |= MSR_BITMAP_MODE_X2APIC;
+               if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+                       mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+       }
+       if (is_long_mode(vcpu))
+               mode |= MSR_BITMAP_MODE_LM;
+       return mode;
  }
  
  #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
  
- static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only)
+ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+                                        u8 mode)
  {
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
-                                       msr, type);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
-                                       msr, type);
-       if (!apicv_only) {
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                               msr, type);
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                               msr, type);
+       int msr;
+       for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+               unsigned word = msr / BITS_PER_LONG;
+               msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+               msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+       }
+       if (mode & MSR_BITMAP_MODE_X2APIC) {
+               /*
+                * TPR reads and writes can be virtualized even if virtual interrupt
+                * delivery is not in use.
+                */
+               vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+               if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+                       vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+               }
        }
  }
  
+ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+       u8 mode = vmx_msr_bitmap_mode(vcpu);
+       u8 changed = mode ^ vmx->msr_bitmap_mode;
+       if (!changed)
+               return;
+       vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
+                                 !(mode & MSR_BITMAP_MODE_LM));
+       if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
+               vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+       vmx->msr_bitmap_mode = mode;
+ }
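
The XOR against the cached msr_bitmap_mode keeps the common transitions
cheap. For example (mode values per the defines above):

	old mode = MSR_BITMAP_MODE_X2APIC                    = 0x1
	new mode = MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_LM = 0x5
	changed  = old ^ new = MSR_BITMAP_MODE_LM              = 0x4

so entering long mode re-evaluates only the KERNEL_GS_BASE intercept and
skips the full x2APIC range rewrite in vmx_update_msr_bitmap_x2apic().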
  static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
  {
        return enable_apicv;
@@@ -5071,8 -5134,7 +5130,8 @@@ static void vmx_complete_nested_posted_
        max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
        if (max_irr != 256) {
                vapic_page = kmap(vmx->nested.virtual_apic_page);
 -              __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
 +              __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
 +                      vapic_page, &max_irr);
                kunmap(vmx->nested.virtual_apic_page);
  
                status = vmcs_read16(GUEST_INTR_STATUS);
@@@ -5132,15 -5194,14 +5191,15 @@@ static int vmx_deliver_nested_posted_in
  
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
 -              /* the PIR and ON have been set by L1. */
 -              kvm_vcpu_trigger_posted_interrupt(vcpu, true);
                /*
                 * If a posted intr is not recognized by hardware,
                 * we will accomplish it in the next vmentry.
                 */
                vmx->nested.pi_pending = true;
                kvm_make_request(KVM_REQ_EVENT, vcpu);
 +              /* the PIR and ON have been set by L1. */
 +              if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
 +                      kvm_vcpu_kick(vcpu);
                return 0;
        }
        return -1;
@@@ -5278,7 -5339,7 +5337,7 @@@ static void vmx_refresh_apicv_exec_ctrl
        }
  
        if (cpu_has_vmx_msr_bitmap())
-               vmx_set_msr_bitmap(vcpu);
+               vmx_update_msr_bitmap(vcpu);
  }
  
  static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@@ -5319,7 -5380,6 +5378,7 @@@ static void vmx_compute_secondary_exec_
        struct kvm_vcpu *vcpu = &vmx->vcpu;
  
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
 +
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
        if (vmx->vpid == 0)
                exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 +
 +      /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
 +       * in vmx_set_cr4.  */
 +      exec_control &= ~SECONDARY_EXEC_DESC;
 +
        /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
           (handle_vmptrld).
           We can NOT enable shadow_vmcs here because we don't have yet
@@@ -5462,12 -5517,16 +5521,12 @@@ static void vmx_vcpu_setup(struct vcpu_
  #endif
        int i;
  
 -      /* I/O */
 -      vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
 -      vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 -
        if (enable_shadow_vmcs) {
                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
                vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
        }
        if (cpu_has_vmx_msr_bitmap())
-               vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+               vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
  
        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
  
@@@ -6113,12 -6172,6 +6172,12 @@@ static int handle_set_cr4(struct kvm_vc
                return kvm_set_cr4(vcpu, val);
  }
  
 +static int handle_desc(struct kvm_vcpu *vcpu)
 +{
 +      WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
 +      return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 +}
 +
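
Taken together with the vmx_set_cr4() hunk above, the emulated-UMIP flow
is (an editorial summary of the code in this diff, not of the changelog):
the guest sets CR4.UMIP on a host without X86_FEATURE_UMIP, vmx_set_cr4()
enables SECONDARY_EXEC_DESC while keeping UMIP out of hw_cr4, and
descriptor-table instructions such as SGDT/SIDT/SLDT/STR then exit with
EXIT_REASON_GDTR_IDTR or EXIT_REASON_LDTR_TR and reach handle_desc(),
where the instruction emulator applies the UMIP permission check.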
  static int handle_cr(struct kvm_vcpu *vcpu)
  {
        unsigned long exit_qualification, val;
@@@ -6575,21 -6628,7 +6634,21 @@@ static int handle_ept_misconfig(struct 
        if (!is_guest_mode(vcpu) &&
            !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                trace_kvm_fast_mmio(gpa);
 -              return kvm_skip_emulated_instruction(vcpu);
 +              /*
 +               * Doing kvm_skip_emulated_instruction() relies on undefined
 +               * behavior: Intel's manual doesn't mandate
 +               * VM_EXIT_INSTRUCTION_LEN to be set in the VMCS when an EPT
 +               * misconfig occurs. While real hardware was observed to set
 +               * it, other hypervisors (namely Hyper-V) don't, so we could
 +               * end up advancing the IP by some random value. Disable fast
 +               * mmio when running nested and keep it for real hardware, in
 +               * the hope that VM_EXIT_INSTRUCTION_LEN is always set
 +               * correctly there.
 +               */
 +              if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
 +                      return kvm_skip_emulated_instruction(vcpu);
 +              else
 +                      return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
 +                                                     NULL, 0) == EMULATE_DONE;
        }
  
        ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@@ -6770,7 -6809,7 +6829,7 @@@ void vmx_enable_tdp(void
  
  static __init int hardware_setup(void)
  {
-       int r = -ENOMEM, i, msr;
+       int r = -ENOMEM, i;
  
        rdmsrl_safe(MSR_EFER, &host_efer);
  
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  
-       memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
-       memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 -      memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
 -
 -      memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
--
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
                goto out;
                !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
  
 -      if (!cpu_has_vmx_shadow_vmcs())
 -              enable_shadow_vmcs = 0;
 -      if (enable_shadow_vmcs)
 -              init_vmcs_shadow_fields();
 -
        if (!cpu_has_vmx_ept() ||
            !cpu_has_vmx_ept_4levels() ||
            !cpu_has_vmx_ept_mt_wb() ||
                kvm_tsc_scaling_ratio_frac_bits = 48;
        }
  
-       vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-       memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
-                       vmx_msr_bitmap_legacy, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
-                       vmx_msr_bitmap_longmode, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_legacy_x2apic,
-                       vmx_msr_bitmap_legacy, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_longmode_x2apic,
-                       vmx_msr_bitmap_longmode, PAGE_SIZE);
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
  
-       for (msr = 0x800; msr <= 0x8ff; msr++) {
-               if (msr == X2APIC_MSR(APIC_TMCCT))
-                       continue;
-               vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
-       }
-       /*
-        * TPR reads and writes can be virtualized even if virtual interrupt
-        * delivery is not in use.
-        */
-       vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W, false);
-       vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_EOI), MSR_TYPE_W, true);
-       vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, true);
        if (enable_ept)
                vmx_enable_tdp();
        else
                kvm_x86_ops->cancel_hv_timer = NULL;
        }
  
 +      if (!cpu_has_vmx_shadow_vmcs())
 +              enable_shadow_vmcs = 0;
 +      if (enable_shadow_vmcs)
 +              init_vmcs_shadow_fields();
 +
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
  
        kvm_mce_cap_supported |= MCG_LMCE_P;
@@@ -7171,20 -7181,11 +7197,11 @@@ static int enter_vmx_operation(struct k
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs *shadow_vmcs;
+       int r;
  
-       vmx->nested.vmcs02.vmcs = alloc_vmcs();
-       vmx->nested.vmcs02.shadow_vmcs = NULL;
-       if (!vmx->nested.vmcs02.vmcs)
+       r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+       if (r < 0)
                goto out_vmcs02;
-       loaded_vmcs_init(&vmx->nested.vmcs02);
-       if (cpu_has_vmx_msr_bitmap()) {
-               vmx->nested.msr_bitmap =
-                               (unsigned long *)__get_free_page(GFP_KERNEL);
-               if (!vmx->nested.msr_bitmap)
-                       goto out_msr_bitmap;
-               memset(vmx->nested.msr_bitmap, 0xff, PAGE_SIZE);
-       }
  
        vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
        if (!vmx->nested.cached_vmcs12)
@@@ -7212,10 -7213,7 +7229,7 @@@ out_shadow_vmcs
        kfree(vmx->nested.cached_vmcs12);
  
  out_cached_vmcs12:
-       free_page((unsigned long)vmx->nested.msr_bitmap);
- out_msr_bitmap:
-       vmx_nested_free_vmcs02(vmx);
+       free_loaded_vmcs(&vmx->nested.vmcs02);
  
  out_vmcs02:
        return -ENOMEM;
@@@ -7360,10 -7358,6 +7374,6 @@@ static void free_nested(struct vcpu_vm
        free_vpid(vmx->nested.vpid02);
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
-       if (vmx->nested.msr_bitmap) {
-               free_page((unsigned long)vmx->nested.msr_bitmap);
-               vmx->nested.msr_bitmap = NULL;
-       }
        if (enable_shadow_vmcs) {
                vmx_disable_shadow_vmcs(vmx);
                vmcs_clear(vmx->vmcs01.shadow_vmcs);
                vmx->nested.pi_desc = NULL;
        }
  
-       vmx_nested_free_vmcs02(vmx);
+       free_loaded_vmcs(&vmx->nested.vmcs02);
  }
  
  /* Emulate the VMXOFF instruction */
@@@ -7467,17 -7461,17 +7477,17 @@@ static inline int vmcs12_read_any(struc
  
        p = ((char *)(get_vmcs12(vcpu))) + offset;
  
 -      switch (vmcs_field_type(field)) {
 -      case VMCS_FIELD_TYPE_NATURAL_WIDTH:
 +      switch (vmcs_field_width(field)) {
 +      case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
                *ret = *((natural_width *)p);
                return 0;
 -      case VMCS_FIELD_TYPE_U16:
 +      case VMCS_FIELD_WIDTH_U16:
                *ret = *((u16 *)p);
                return 0;
 -      case VMCS_FIELD_TYPE_U32:
 +      case VMCS_FIELD_WIDTH_U32:
                *ret = *((u32 *)p);
                return 0;
 -      case VMCS_FIELD_TYPE_U64:
 +      case VMCS_FIELD_WIDTH_U64:
                *ret = *((u64 *)p);
                return 0;
        default:
@@@ -7494,17 -7488,17 +7504,17 @@@ static inline int vmcs12_write_any(stru
        if (offset < 0)
                return offset;
  
 -      switch (vmcs_field_type(field)) {
 -      case VMCS_FIELD_TYPE_U16:
 +      switch (vmcs_field_width(field)) {
 +      case VMCS_FIELD_WIDTH_U16:
                *(u16 *)p = field_value;
                return 0;
 -      case VMCS_FIELD_TYPE_U32:
 +      case VMCS_FIELD_WIDTH_U32:
                *(u32 *)p = field_value;
                return 0;
 -      case VMCS_FIELD_TYPE_U64:
 +      case VMCS_FIELD_WIDTH_U64:
                *(u64 *)p = field_value;
                return 0;
 -      case VMCS_FIELD_TYPE_NATURAL_WIDTH:
 +      case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
                *(natural_width *)p = field_value;
                return 0;
        default:
@@@ -7520,7 -7514,7 +7530,7 @@@ static void copy_shadow_to_vmcs12(struc
        unsigned long field;
        u64 field_value;
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
 -      const unsigned long *fields = shadow_read_write_fields;
 +      const u16 *fields = shadow_read_write_fields;
        const int num_fields = max_shadow_read_write_fields;
  
        preempt_disable();
  
        for (i = 0; i < num_fields; i++) {
                field = fields[i];
 -              switch (vmcs_field_type(field)) {
 -              case VMCS_FIELD_TYPE_U16:
 -                      field_value = vmcs_read16(field);
 -                      break;
 -              case VMCS_FIELD_TYPE_U32:
 -                      field_value = vmcs_read32(field);
 -                      break;
 -              case VMCS_FIELD_TYPE_U64:
 -                      field_value = vmcs_read64(field);
 -                      break;
 -              case VMCS_FIELD_TYPE_NATURAL_WIDTH:
 -                      field_value = vmcs_readl(field);
 -                      break;
 -              default:
 -                      WARN_ON(1);
 -                      continue;
 -              }
 +              field_value = __vmcs_readl(field);
                vmcs12_write_any(&vmx->vcpu, field, field_value);
        }
  
  
  static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
  {
 -      const unsigned long *fields[] = {
 +      const u16 *fields[] = {
                shadow_read_write_fields,
                shadow_read_only_fields
        };
                for (i = 0; i < max_fields[q]; i++) {
                        field = fields[q][i];
                        vmcs12_read_any(&vmx->vcpu, field, &field_value);
 -
 -                      switch (vmcs_field_type(field)) {
 -                      case VMCS_FIELD_TYPE_U16:
 -                              vmcs_write16(field, (u16)field_value);
 -                              break;
 -                      case VMCS_FIELD_TYPE_U32:
 -                              vmcs_write32(field, (u32)field_value);
 -                              break;
 -                      case VMCS_FIELD_TYPE_U64:
 -                              vmcs_write64(field, (u64)field_value);
 -                              break;
 -                      case VMCS_FIELD_TYPE_NATURAL_WIDTH:
 -                              vmcs_writel(field, (long)field_value);
 -                              break;
 -                      default:
 -                              WARN_ON(1);
 -                              break;
 -                      }
 +                      __vmcs_writel(field, field_value);
                }
        }
  
@@@ -7629,10 -7656,8 +7639,10 @@@ static int handle_vmwrite(struct kvm_vc
  {
        unsigned long field;
        gva_t gva;
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 +
        /* The value to write might be 32 or 64 bits, depending on L1's long
         * mode, and eventually we need to write that into a field of several
         * possible lengths. The code below first zero-extends the value to 64
                return kvm_skip_emulated_instruction(vcpu);
        }
  
 +      switch (field) {
 +#define SHADOW_FIELD_RW(x) case x:
 +#include "vmx_shadow_fields.h"
 +              /*
 +               * The fields that can be updated by L1 without a vmexit are
 +               * always updated in the vmcs02, the others go down the slow
 +               * path of prepare_vmcs02.
 +               */
 +              break;
 +      default:
 +              vmx->nested.dirty_vmcs12 = true;
 +              break;
 +      }
 +
        nested_vmx_succeed(vcpu);
        return kvm_skip_emulated_instruction(vcpu);
  }
@@@ -7703,7 -7714,6 +7713,7 @@@ static void set_current_vmptr(struct vc
                             __pa(vmx->vmcs01.shadow_vmcs));
                vmx->nested.sync_shadow_vmcs = true;
        }
 +      vmx->nested.dirty_vmcs12 = true;
  }
  
  /* Emulate the VMPTRLD instruction */
@@@ -7924,7 -7934,7 +7934,7 @@@ static int handle_invvpid(struct kvm_vc
                return kvm_skip_emulated_instruction(vcpu);
        }
  
 -      __vmx_flush_tlb(vcpu, vmx->nested.vpid02);
 +      __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
        nested_vmx_succeed(vcpu);
  
        return kvm_skip_emulated_instruction(vcpu);
@@@ -8118,8 -8128,6 +8128,8 @@@ static int (*const kvm_vmx_exit_handler
        [EXIT_REASON_XSETBV]                  = handle_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 +      [EXIT_REASON_GDTR_IDTR]               = handle_desc,
 +      [EXIT_REASON_LDTR_TR]                 = handle_desc,
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
@@@ -8865,7 -8873,7 +8875,7 @@@ static void vmx_set_virtual_x2apic_mode
        }
        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
  
-       vmx_set_msr_bitmap(vcpu);
+       vmx_update_msr_bitmap(vcpu);
  }
  
  static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@@@ -8929,23 -8937,36 +8939,23 @@@ static void vmx_set_rvi(int vector
  
  static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
  {
 -      if (!is_guest_mode(vcpu)) {
 -              vmx_set_rvi(max_irr);
 -              return;
 -      }
 -
 -      if (max_irr == -1)
 -              return;
 -
 -      /*
 -       * In guest mode.  If a vmexit is needed, vmx_check_nested_events
 -       * handles it.
 -       */
 -      if (nested_exit_on_intr(vcpu))
 -              return;
 -
        /*
 -       * Else, fall back to pre-APICv interrupt injection since L2
 -       * is run without virtual interrupt delivery.
 +       * When running L2, updating RVI is only relevant when vmcs12
 +       * virtual-interrupt-delivery is enabled. However, it can be
 +       * enabled only when L1 also intercepts external interrupts, and
 +       * in that case we should not update vmcs02's RVI but instead
 +       * intercept the interrupt. Therefore, do nothing when running L2.
         */
 -      if (!kvm_event_needs_reinjection(vcpu) &&
 -          vmx_interrupt_allowed(vcpu)) {
 -              kvm_queue_interrupt(vcpu, max_irr, false);
 -              vmx_inject_irq(vcpu);
 -      }
 +      if (!is_guest_mode(vcpu))
 +              vmx_set_rvi(max_irr);
  }
  
  static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int max_irr;
 +      bool max_irr_updated;
  
        WARN_ON(!vcpu->arch.apicv_active);
        if (pi_test_on(&vmx->pi_desc)) {
                 * But on x86 this is just a compiler barrier anyway.
                 */
                smp_mb__after_atomic();
 -              max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
 +              max_irr_updated =
 +                      kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
 +
 +              /*
 +               * If we are running L2 and L1 has a new pending interrupt
 +               * which can be injected, we should re-evaluate
 +               * what should be done with this new L1 interrupt.
 +               * If L1 intercepts external-interrupts, we should
 +               * exit from L2 to L1. Otherwise, interrupt should be
 +               * delivered directly to L2.
 +               */
 +              if (is_guest_mode(vcpu) && max_irr_updated) {
 +                      if (nested_exit_on_intr(vcpu))
 +                              kvm_vcpu_exiting_guest_mode(vcpu);
 +                      else
 +                              kvm_make_request(KVM_REQ_EVENT, vcpu);
 +              }
        } else {
                max_irr = kvm_lapic_find_highest_irr(vcpu);
        }
@@@ -9054,14 -9059,14 +9064,14 @@@ static void vmx_handle_external_intr(st
  #endif
                        "pushf\n\t"
                        __ASM_SIZE(push) " $%c[cs]\n\t"
 -                      "call *%[entry]\n\t"
 +                      CALL_NOSPEC
                        :
  #ifdef CONFIG_X86_64
                        [sp]"=&r"(tmp),
  #endif
                        ASM_CALL_CONSTRAINT
                        :
 -                      [entry]"r"(entry),
 +                      THUNK_TARGET(entry),
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
@@@ -9086,12 -9091,6 +9096,12 @@@ static bool vmx_xsaves_supported(void
                SECONDARY_EXEC_XSAVES;
  }
  
 +static bool vmx_umip_emulated(void)
 +{
 +      return vmcs_config.cpu_based_2nd_exec_ctrl &
 +              SECONDARY_EXEC_DESC;
 +}
 +
  static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
  {
        u32 exit_intr_info;
@@@ -9247,7 -9246,7 +9257,7 @@@ static void vmx_arm_hv_timer(struct kvm
  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 -      unsigned long debugctlmsr, cr3, cr4;
 +      unsigned long cr3, cr4;
  
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!enable_vnmi &&
                __write_pkru(vcpu->arch.pkru);
  
        atomic_switch_perf_msrs(vmx);
 -      debugctlmsr = get_debugctlmsr();
  
        vmx_arm_hv_timer(vcpu);
  
        vmexit_fill_RSB();
  
        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
 -      if (debugctlmsr)
 -              update_debugctlmsr(debugctlmsr);
 +      if (vmx->host_debugctlmsr)
 +              update_debugctlmsr(vmx->host_debugctlmsr);
  
  #ifndef CONFIG_X86_64
        /*
@@@ -9506,8 -9506,10 +9516,8 @@@ static void vmx_switch_vmcs(struct kvm_
  static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
  {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
 -       int r;
  
 -       r = vcpu_load(vcpu);
 -       BUG_ON(r);
 +       vcpu_load(vcpu);
         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
         free_nested(vmx);
         vcpu_put(vcpu);
@@@ -9532,6 -9534,7 +9542,7 @@@ static struct kvm_vcpu *vmx_create_vcpu
  {
        int err;
        struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       unsigned long *msr_bitmap;
        int cpu;
  
        if (!vmx)
        if (!vmx->guest_msrs)
                goto free_pml;
  
-       vmx->loaded_vmcs = &vmx->vmcs01;
-       vmx->loaded_vmcs->vmcs = alloc_vmcs();
-       vmx->loaded_vmcs->shadow_vmcs = NULL;
-       if (!vmx->loaded_vmcs->vmcs)
+       err = alloc_loaded_vmcs(&vmx->vmcs01);
+       if (err < 0)
                goto free_msrs;
-       loaded_vmcs_init(vmx->loaded_vmcs);
  
+       msr_bitmap = vmx->vmcs01.msr_bitmap;
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+       vmx->msr_bitmap_mode = 0;
+       vmx->loaded_vmcs = &vmx->vmcs01;
        cpu = get_cpu();
        vmx_vcpu_load(&vmx->vcpu, cpu);
        vmx->vcpu.cpu = cpu;
@@@ -9699,8 -9709,7 +9717,8 @@@ static void vmcs_set_secondary_exec_con
        u32 mask =
                SECONDARY_EXEC_SHADOW_VMCS |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 -              SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 +              SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 +              SECONDARY_EXEC_DESC;
  
        u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
  
@@@ -9866,8 -9875,8 +9884,8 @@@ static void vmx_inject_page_fault_neste
        }
  }
  
 -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 -                                             struct vmcs12 *vmcs12);
 +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 +                                               struct vmcs12 *vmcs12);
  
  static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                                        struct vmcs12 *vmcs12)
                        (unsigned long)(vmcs12->posted_intr_desc_addr &
                        (PAGE_SIZE - 1)));
        }
 -      if (cpu_has_vmx_msr_bitmap() &&
 -          nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
 -          nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
 -              ;
 -      else
 +      if (!nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
                vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
                                CPU_BASED_USE_MSR_BITMAPS);
  }
@@@ -10024,19 -10037,14 +10042,19 @@@ static int nested_vmx_check_tpr_shadow_
   * Merge L0's and L1's MSR bitmap, return false to indicate that
   * we do not use the hardware.
   */
 -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 -                                             struct vmcs12 *vmcs12)
 +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 +                                               struct vmcs12 *vmcs12)
  {
        int msr;
        struct page *page;
        unsigned long *msr_bitmap_l1;
-       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
  
 +      /* Nothing to do if the MSR bitmap is not in use.  */
 +      if (!cpu_has_vmx_msr_bitmap() ||
 +          !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 +              return false;
 +
        /* This shortcut is ok because we support only x2APIC MSRs so far. */
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
                return false;
        page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
        if (is_error_page(page))
                return false;
 -      msr_bitmap_l1 = (unsigned long *)kmap(page);
  
 -      memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
 +      msr_bitmap_l1 = (unsigned long *)kmap(page);
 +      if (nested_cpu_has_apic_reg_virt(vmcs12)) {
 +              /*
 +               * L0 need not intercept reads for MSRs between 0x800 and 0x8ff;
 +               * it just lets the processor take the value from the virtual-APIC
 +               * page, so take those 256 bits directly from the L1 bitmap.
 +               */
 +              for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 +                      unsigned word = msr / BITS_PER_LONG;
 +                      msr_bitmap_l0[word] = msr_bitmap_l1[word];
 +                      msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
 +              }
 +      } else {
 +              for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 +                      unsigned word = msr / BITS_PER_LONG;
 +                      msr_bitmap_l0[word] = ~0;
 +                      msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
 +              }
 +      }
  
 -      if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 -              if (nested_cpu_has_apic_reg_virt(vmcs12))
 -                      for (msr = 0x800; msr <= 0x8ff; msr++)
 -                              nested_vmx_disable_intercept_for_msr(
 -                                      msr_bitmap_l1, msr_bitmap_l0,
 -                                      msr, MSR_TYPE_R);
 +      nested_vmx_disable_intercept_for_msr(
 +              msr_bitmap_l1, msr_bitmap_l0,
 +              X2APIC_MSR(APIC_TASKPRI),
 +              MSR_TYPE_W);
  
 +      if (nested_cpu_has_vid(vmcs12)) {
                nested_vmx_disable_intercept_for_msr(
 -                              msr_bitmap_l1, msr_bitmap_l0,
 -                              APIC_BASE_MSR + (APIC_TASKPRI >> 4),
 -                              MSR_TYPE_R | MSR_TYPE_W);
 -
 -              if (nested_cpu_has_vid(vmcs12)) {
 -                      nested_vmx_disable_intercept_for_msr(
 -                              msr_bitmap_l1, msr_bitmap_l0,
 -                              APIC_BASE_MSR + (APIC_EOI >> 4),
 -                              MSR_TYPE_W);
 -                      nested_vmx_disable_intercept_for_msr(
 -                              msr_bitmap_l1, msr_bitmap_l0,
 -                              APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
 -                              MSR_TYPE_W);
 -              }
 +                      msr_bitmap_l1, msr_bitmap_l0,
 +                      X2APIC_MSR(APIC_EOI),
 +                      MSR_TYPE_W);
 +              nested_vmx_disable_intercept_for_msr(
 +                      msr_bitmap_l1, msr_bitmap_l0,
 +                      X2APIC_MSR(APIC_SELF_IPI),
 +                      MSR_TYPE_W);
        }
        kunmap(page);
        kvm_release_page_clean(page);
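
The merge rule implemented above, stated once (an editorial restatement):
under the "set bit == intercept" convention, the merged vmcs02 bitmap
leaves a bit clear only when both L0's policy for L2 and L1's own bitmap
leave it clear. For a single bit, in kernel idiom:

	/* Sketch, not part of this patch: L2 may access the MSR directly
	 * only if neither L0 nor L1 wants the access intercepted.
	 */
	static inline bool l2_accesses_msr_directly(unsigned long *l0_bitmap,
						    unsigned long *l1_bitmap,
						    unsigned int bit)
	{
		return !test_bit(bit, l0_bitmap) && !test_bit(bit, l1_bitmap);
	}

nested_vmx_disable_intercept_for_msr() is the per-MSR clearing side of
this rule; the word-wise loops above handle the x2APIC range in bulk.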
@@@ -10345,12 -10344,25 +10363,12 @@@ static int nested_vmx_load_cr3(struct k
        return 0;
  }
  
 -/*
 - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 - * guest in a way that will both be appropriate to L1's requests, and our
 - * needs. In addition to modifying the active vmcs (which is vmcs02), this
 - * function also has additional necessary side-effects, like setting various
 - * vcpu->arch fields.
 - * Returns 0 on success, 1 on failure. Invalid state exit qualification code
 - * is assigned to entry_failure_code on failure.
 - */
 -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 -                        bool from_vmentry, u32 *entry_failure_code)
 +static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 +                             bool from_vmentry)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 -      u32 exec_control, vmcs12_exec_ctrl;
  
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 -      vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
        vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
        vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
        vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
        vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
        vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
        vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
 -      vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
        vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
        vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
        vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
        vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
        vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
        vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
 -      vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
        vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
        vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
        vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
        vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
        vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
        vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
 -      vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
 -      vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
        vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
        vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
        vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
  
 +      vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
 +      vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
 +              vmcs12->guest_pending_dbg_exceptions);
 +      vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
 +      vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
 +
 +      if (nested_cpu_has_xsaves(vmcs12))
 +              vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
 +      vmcs_write64(VMCS_LINK_POINTER, -1ull);
 +
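 +      /* While L2 runs, posted interrupts use the nested notification vector. */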
 +      if (cpu_has_vmx_posted_intr())
 +              vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
 +
 +      /*
 +       * Whether page-faults are trapped is determined by a combination of
 +       * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
 +       * If enable_ept, L0 doesn't care about page faults and we should
 +       * set all of these to L1's desires. However, if !enable_ept, L0 does
 +       * care about (at least some) page faults, and because it is not easy
 +       * (if at all possible?) to merge L0 and L1's desires, we simply ask
 +       * to exit on each and every L2 page fault. This is done by setting
 +       * MASK=MATCH=0 and (see below) EB.PF=1.
 +       * Note that below we don't need special code to set EB.PF beyond the
 +       * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
 +       * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
 +       * !enable_ept, EB.PF is 1, so the "or" will always be 1.
 +       */
 +      vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
 +              enable_ept ? vmcs12->page_fault_error_code_mask : 0);
 +      vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
 +              enable_ept ? vmcs12->page_fault_error_code_match : 0);
 +
 +      /* All VMFUNCs are currently emulated through L0 vmexits.  */
 +      if (cpu_has_vmx_vmfunc())
 +              vmcs_write64(VM_FUNCTION_CONTROL, 0);
 +
 +      if (cpu_has_vmx_apicv()) {
 +              vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
 +              vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
 +              vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
 +              vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
 +      }
 +
 +      /*
 +       * Set host-state according to L0's settings (vmcs12 is irrelevant
 +       * here).  Some constant fields are set here by
 +       * vmx_set_constant_host_state().  Other fields differ per CPU and are
 +       * set later, when vmx_vcpu_load() and vmx_save_host_state() are called.
 +       */
 +      vmx_set_constant_host_state(vmx);
 +
 +      /*
 +       * Set the MSR load/store lists to match L0's settings.
 +       */
 +      vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 +      vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 +      vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 +      vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 +      vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 +
 +      set_cr4_guest_host_mask(vmx);
 +
 +      if (vmx_mpx_supported())
 +              vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
 +
 +      if (enable_vpid) {
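 +              /* L2 uses its own vpid (vpid02) when L1 exposes VPID to the guest. */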
 +              if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
 +                      vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
 +              else
 +                      vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 +      }
 +
 +      /*
 +       * Load the L2 PDPTRs from vmcs12; L1 may access them, so they are
 +       * also read back when vmcs12 is constructed on VM-exit.
 +       */
 +      if (enable_ept) {
 +              vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
 +              vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
 +              vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
 +              vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 +      }
++
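++      /* vmcs02 uses its own MSR bitmap, merged from L0's and L1's bitmaps. */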
++      if (cpu_has_vmx_msr_bitmap())
++              vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 +}
 +
 +/*
 + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 + * guest in a way that is appropriate both to L1's requests and to our own
 + * needs. In addition to modifying the active vmcs (which is vmcs02), this
 + * function also has necessary side effects, such as setting various
 + * vcpu->arch fields.
 + * Returns 0 on success, 1 on failure. On failure, *entry_failure_code is
 + * set to the exit qualification for the invalid-state VM-entry failure.
 + */
 +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 +                        bool from_vmentry, u32 *entry_failure_code)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +      u32 exec_control, vmcs12_exec_ctrl;
 +
 +      /*
 +       * First, the fields that are shadowed.  This must be kept in sync
 +       * with vmx_shadow_fields.h.
 +       */
 +
 +      vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
 +      vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
 +      vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
 +      vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
 +      vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
 +
 +      /*
 +       * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
 +       * HOST_FS_BASE, HOST_GS_BASE.
 +       */
 +
        if (from_vmentry &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
        } else {
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
        }
 -      vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 -      vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
 -              vmcs12->guest_pending_dbg_exceptions);
 -      vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
 -      vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
 -
 -      if (nested_cpu_has_xsaves(vmcs12))
 -              vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
 -      vmcs_write64(VMCS_LINK_POINTER, -1ull);
  
        exec_control = vmcs12->pin_based_vm_exec_control;
  
        if (nested_cpu_has_posted_intr(vmcs12)) {
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
                vmx->nested.pi_pending = false;
 -              vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
        } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
        }
        if (nested_cpu_has_preemption_timer(vmcs12))
                vmx_start_preemption_timer(vcpu);
  
 -      /*
 -       * Whether page-faults are trapped is determined by a combination of
 -       * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
 -       * If enable_ept, L0 doesn't care about page faults and we should
 -       * set all of these to L1's desires. However, if !enable_ept, L0 does
 -       * care about (at least some) page faults, and because it is not easy
 -       * (if at all possible?) to merge L0 and L1's desires, we simply ask
 -       * to exit on each and every L2 page fault. This is done by setting
 -       * MASK=MATCH=0 and (see below) EB.PF=1.
 -       * Note that below we don't need special code to set EB.PF beyond the
 -       * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
 -       * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
 -       * !enable_ept, EB.PF is 1, so the "or" will always be 1.
 -       */
 -      vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
 -              enable_ept ? vmcs12->page_fault_error_code_mask : 0);
 -      vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
 -              enable_ept ? vmcs12->page_fault_error_code_match : 0);
 -
        if (cpu_has_secondary_exec_ctrls()) {
                exec_control = vmx->secondary_exec_control;
  
                        exec_control |= vmcs12_exec_ctrl;
                }
  
 -              /* All VMFUNCs are currently emulated through L0 vmexits.  */
 -              if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
 -                      vmcs_write64(VM_FUNCTION_CONTROL, 0);
 -
 -              if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
 -                      vmcs_write64(EOI_EXIT_BITMAP0,
 -                              vmcs12->eoi_exit_bitmap0);
 -                      vmcs_write64(EOI_EXIT_BITMAP1,
 -                              vmcs12->eoi_exit_bitmap1);
 -                      vmcs_write64(EOI_EXIT_BITMAP2,
 -                              vmcs12->eoi_exit_bitmap2);
 -                      vmcs_write64(EOI_EXIT_BITMAP3,
 -                              vmcs12->eoi_exit_bitmap3);
 +              if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                        vmcs_write16(GUEST_INTR_STATUS,
                                vmcs12->guest_intr_status);
 -              }
  
                /*
                 * Write an illegal value to APIC_ACCESS_ADDR. Later,
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
  
 -
 -      /*
 -       * Set host-state according to L0's settings (vmcs12 is irrelevant here)
 -       * Some constant fields are set here by vmx_set_constant_host_state().
 -       * Other fields are different per CPU, and will be set later when
 -       * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
 -       */
 -      vmx_set_constant_host_state(vmx);
 -
 -      /*
 -       * Set the MSR load/store lists to match L0's settings.
 -       */
 -      vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 -      vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 -      vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 -      vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 -      vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 -
        /*
         * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
         * entry, but only if the current (host) sp changed from the value
        }
  
        /*
 -       * Merging of IO bitmap not currently supported.
 -       * Rather, exit every time.
 +       * A vmexit (to either the L1 hypervisor or L0 userspace) is always
 +       * needed for I/O port accesses.
         */
        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
        }
  
 -      set_cr4_guest_host_mask(vmx);
 -
 -      if (from_vmentry &&
 -          vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
 -              vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
 -
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vmcs_write64(TSC_OFFSET,
                        vcpu->arch.tsc_offset + vmcs12->tsc_offset);
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
  
 -      if (cpu_has_vmx_msr_bitmap())
 -              vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
 -
        if (enable_vpid) {
                /*
                 * There is no direct mapping between vpid02 and vpid12, the
                 * even if we spawn a lot of nested vCPUs.
                 */
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
 -                      vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
 -                              __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
 +                              __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
                        }
                } else {
 -                      vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 -                      vmx_flush_tlb(vcpu);
 +                      vmx_flush_tlb(vcpu, true);
                }
 -
        }
  
        if (enable_pml) {
        /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
        vmx_set_efer(vcpu, vcpu->arch.efer);
  
 +      if (vmx->nested.dirty_vmcs12) {
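 +              /* Rarely-updated fields need rewriting only when L1 changed vmcs12. */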
 +              prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
 +              vmx->nested.dirty_vmcs12 = false;
 +      }
 +
        /* Load the L2 CR3, backed by either EPT or shadow page tables. */
        if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
                                entry_failure_code))
        if (!enable_ept)
                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
  
 -      /*
 -       * L1 may access the L2's PDPTR, so save them to construct vmcs12
 -       */
 -      if (enable_ept) {
 -              vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
 -              vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
 -              vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
 -              vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 -      }
 -
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
        return 0;
@@@ -11087,6 -11064,7 +11108,6 @@@ static int vmx_check_nested_events(stru
                if (block_nested_events)
                        return -EBUSY;
                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
 -              vcpu->arch.exception.pending = false;
                return 0;
        }
  
@@@ -11367,8 -11345,11 +11388,8 @@@ static void load_vmcs12_host_state(stru
                 * L1's vpid. TODO: move to a more elaborate solution, giving
                 * each L2 its own vpid and exposing the vpid feature to L1.
                 */
 -              vmx_flush_tlb(vcpu);
 +              vmx_flush_tlb(vcpu, true);
        }
 -      /* Restore posted intr vector. */
 -      if (nested_cpu_has_posted_intr(vmcs12))
 -              vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
  
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
  
        if (cpu_has_vmx_msr_bitmap())
-               vmx_set_msr_bitmap(vcpu);
+               vmx_update_msr_bitmap(vcpu);
  
        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
                                vmcs12->vm_exit_msr_load_count))
@@@ -11629,21 -11610,6 +11650,21 @@@ static int vmx_check_intercept(struct k
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage)
  {
 +      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 +      struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 +
 +      /*
 +       * RDPID causes #UD if disabled through secondary execution controls.
 +       * Because it is marked as EmulateOnUD, we need to intercept it here.
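 +       * (The emulator reports RDPID through the rdtscp intercept code.)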
 +       */
 +      if (info->intercept == x86_intercept_rdtscp &&
 +          !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
 +              ctxt->exception.vector = UD_VECTOR;
 +              ctxt->exception.error_code_valid = false;
 +              return X86EMUL_PROPAGATE_FAULT;
 +      }
 +
 +      /* TODO: check more intercepts... */
        return X86EMUL_CONTINUE;
  }
  
@@@ -12157,7 -12123,6 +12178,7 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .handle_external_intr = vmx_handle_external_intr,
        .mpx_supported = vmx_mpx_supported,
        .xsaves_supported = vmx_xsaves_supported,
 +      .umip_emulated = vmx_umip_emulated,
  
        .check_nested_events = vmx_check_nested_events,