Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

[sfrench/cifs-2.6.git] / arch / x86 / kvm / mmu / paging_tmpl.h
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h

index db80f7ccaa4e3e4ccbcf1379c068255eaf537277..f5958071220c9aef44e1c011487b6cce47209fb3 100644 (file)
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -16,25 +16,21 @@
   */
  
  /*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
+ * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
+ * as well as guest EPT tables, so the code in this file is compiled thrice,
+ * once per guest PTE type.  The per-type defines are #undef'd at the end.
   */
  
  #if PTTYPE == 64
         #define pt_element_t u64
         #define guest_walker guest_walker64
         #define FNAME(name) paging##64_##name
-       #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
-       #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
-       #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
-       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #define PT_LEVEL_BITS 9
         #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
         #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
         #define PT_HAVE_ACCESSED_DIRTY(mmu) true
         #ifdef CONFIG_X86_64
         #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
-       #define CMPXCHG "cmpxchgq"
         #else
         #define PT_MAX_FULL_LEVELS 2
         #endif
@@ -42,36 +38,35 @@
         #define pt_element_t u32
         #define guest_walker guest_walker32
         #define FNAME(name) paging##32_##name
-       #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
-       #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
-       #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
-       #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-       #define PT_LEVEL_BITS PT32_LEVEL_BITS
+       #define PT_LEVEL_BITS 10
         #define PT_MAX_FULL_LEVELS 2
         #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
         #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
         #define PT_HAVE_ACCESSED_DIRTY(mmu) true
-       #define CMPXCHG "cmpxchgl"
+
+       #define PT32_DIR_PSE36_SIZE 4
+       #define PT32_DIR_PSE36_SHIFT 13
+       #define PT32_DIR_PSE36_MASK \
+               (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
  #elif PTTYPE == PTTYPE_EPT
         #define pt_element_t u64
         #define guest_walker guest_walkerEPT
         #define FNAME(name) ept_##name
-       #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
-       #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
-       #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
-       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #define PT_LEVEL_BITS 9
         #define PT_GUEST_DIRTY_SHIFT 9
         #define PT_GUEST_ACCESSED_SHIFT 8
         #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
-       #ifdef CONFIG_X86_64
-       #define CMPXCHG "cmpxchgq"
-       #endif
         #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
  #else
         #error Invalid PTTYPE value
  #endif
  
+/* Common logic, but per-type values.  These also need to be undefined. */
+#define PT_BASE_ADDR_MASK      ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
+#define PT_LVL_ADDR_MASK(lvl)  __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_LVL_OFFSET_MASK(lvl)        __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_INDEX(addr, lvl)    __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
+
  #define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
  #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
  
@@ -97,6 +92,15 @@ struct guest_walker {
         struct x86_exception fault;
  };
  
+#if PTTYPE == 32
+static inline gfn_t pse36_gfn_delta(u32 gpte) /* gfn contribution of the PSE-36 bits held in a 32-bit gpte */
+{
+       int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; /* moves gpte bit PT32_DIR_PSE36_SHIFT to gfn bit (32 - PAGE_SHIFT) */
+
+       return (gpte & PT32_DIR_PSE36_MASK) << shift; /* mask is built from 1ULL, so the shift is evaluated in 64 bits */
+}
+#endif /* PTTYPE == 32 */
+
  static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
  {
         return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
@@ -374,7 +378,7 @@ retry_walk:
                  * information to fix the exit_qualification or exit_info_1
                  * fields.
                  */
-               if (unlikely(real_gpa == UNMAPPED_GVA))
+               if (unlikely(real_gpa == INVALID_GPA))
                         return 0;
  
                 host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
@@ -421,11 +425,13 @@ retry_walk:
         gfn = gpte_to_gfn_lvl(pte, walker->level);
         gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
  
-       if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
+#if PTTYPE == 32
+       if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
                 gfn += pse36_gfn_delta(pte);
+#endif
  
         real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
-       if (real_gpa == UNMAPPED_GVA)
+       if (real_gpa == INVALID_GPA)
                 return 0;
  
         walker->gfn = real_gpa >> PAGE_SHIFT;
@@ -589,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
         if (sp->role.direct)
                 return __direct_pte_prefetch(vcpu, sp, sptep);
  
-       i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+       i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
         spte = sp->spt + i;
  
         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
@@ -642,14 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
                 gfn_t table_gfn;
  
                 clear_sp_write_flooding_count(it.sptep);
-               drop_large_spte(vcpu, it.sptep);
-
-               sp = NULL;
-               if (!is_shadow_present_pte(*it.sptep)) {
-                       table_gfn = gw->table_gfn[it.level - 2];
-                       access = gw->pt_access[it.level - 2];
-                       sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
-                                             it.level-1, false, access);
+
+               table_gfn = gw->table_gfn[it.level - 2];
+               access = gw->pt_access[it.level - 2];
+               sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
+                                         false, access);
+
+               if (sp != ERR_PTR(-EEXIST)) {
                         /*
                          * We must synchronize the pagetable before linking it
                          * because the guest doesn't need to flush tlb when
@@ -678,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
                 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
                         goto out_gpte_changed;
  
-               if (sp)
+               if (sp != ERR_PTR(-EEXIST))
                         link_shadow_page(vcpu, it.sptep, sp);
         }
  
@@ -702,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
  
                 validate_direct_spte(vcpu, it.sptep, direct_access);
  
-               drop_large_spte(vcpu, it.sptep);
+               sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
+                                         true, direct_access);
+               if (sp == ERR_PTR(-EEXIST))
+                       continue;
  
-               if (!is_shadow_present_pte(*it.sptep)) {
-                       sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
-                                             it.level - 1, true, direct_access);
-                       link_shadow_page(vcpu, it.sptep, sp);
-                       if (fault->huge_page_disallowed &&
-                           fault->req_level >= it.level)
-                               account_huge_nx_page(vcpu->kvm, sp);
-               }
+               link_shadow_page(vcpu, it.sptep, sp);
+               if (fault->huge_page_disallowed &&
+                   fault->req_level >= it.level)
+                       account_huge_nx_page(vcpu->kvm, sp);
         }
  
         if (WARN_ON_ONCE(it.level != fault->goal_level))
@@ -888,7 +892,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
         WARN_ON(sp->role.level != PG_LEVEL_4K);
  
         if (PTTYPE == 32)
-               offset = sp->role.quadrant << PT64_LEVEL_BITS;
+               offset = sp->role.quadrant << SPTE_LEVEL_BITS;
  
         return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
  }
@@ -929,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
                                 break;
  
                         pte_gpa = FNAME(get_level1_sp_gpa)(sp);
-                       pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+                       pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
  
                         mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
                         if (is_shadow_present_pte(old_spte))
@@ -958,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                                struct x86_exception *exception)
  {
         struct guest_walker walker;
-       gpa_t gpa = UNMAPPED_GVA;
+       gpa_t gpa = INVALID_GPA;
         int r;
  
  #ifndef CONFIG_X86_64
@@ -978,7 +982,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
  }
  
  /*
- * Using the cached information from sp->gfns is safe because:
+ * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
+ * safe because:
   * - The spte has a reference to the struct page, so the pfn for a given gfn
   *   can't change unless all sptes pointing to it are nuked first.
   *
@@ -1023,7 +1028,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  
         first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
  
-       for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+       for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
                 u64 *sptep, spte;
                 struct kvm_memory_slot *slot;
                 unsigned pte_access;
@@ -1053,12 +1058,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                 if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
                         continue;
  
-               if (gfn != sp->gfns[i]) {
+               /*
+                * Drop the SPTE if the new protections would result in a RWX=0
+                * SPTE or if the gfn is changing.  The RWX=0 case only affects
+                * EPT with execute-only support, i.e. EPT without an effective
+                * "present" bit, as all other paging modes will create a
+                * read-only SPTE if pte_access is zero.
+                */
+               if ((!pte_access && !shadow_present_mask) ||
+                   gfn != kvm_mmu_page_get_gfn(sp, i)) {
                         drop_spte(vcpu->kvm, &sp->spt[i]);
                         flush = true;
                         continue;
                 }
  
+               /* Update the shadowed access bits in case they changed. */
+               kvm_mmu_page_set_access(sp, i, pte_access);
+
                 sptep = &sp->spt[i];
                 spte = *sptep;
                 host_writable = spte & shadow_host_writable_mask;
@@ -1070,6 +1086,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                 flush |= mmu_spte_update(sptep, spte);
         }
  
+       /*
+        * Note, any flush is purely for KVM's correctness, e.g. when dropping
+        * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+        * unmap or dirty logging event doesn't fail to flush.  The guest is
+        * responsible for flushing the TLB to ensure any changes in protection
+        * bits are recognized, i.e. until the guest flushes or page faults on
+        * a relevant address, KVM is architecturally allowed to let vCPUs use
+        * cached translations with the old protection bits.
+        */
         return flush;
  }
  
@@ -1084,7 +1109,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  #undef PT_MAX_FULL_LEVELS
  #undef gpte_to_gfn
  #undef gpte_to_gfn_lvl
-#undef CMPXCHG
  #undef PT_GUEST_ACCESSED_MASK
  #undef PT_GUEST_DIRTY_MASK
  #undef PT_GUEST_DIRTY_SHIFT