1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5 #include "mmu_internal.h"
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
17 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
21 /* Arbitrarily returns true so that this may be used in if statements. */
22 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
26 lockdep_assert_held_read(&kvm->mmu_lock);
28 lockdep_assert_held_write(&kvm->mmu_lock);
33 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
36 * Invalidate all roots, which, besides the obvious, schedules all roots
37 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38 * ultimately frees all roots.
40 kvm_tdp_mmu_invalidate_all_roots(kvm);
41 kvm_tdp_mmu_zap_invalidated_roots(kvm);
43 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
44 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
47 * Ensure that all the outstanding RCU callbacks to free shadow pages
48 * can run before the VM is torn down. Putting the last reference to
49 * zapped roots will create new callbacks.
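 *
 * The callbacks in question are the tdp_mmu_free_sp_rcu_callback() instances
 * queued via call_rcu() when the last reference to a zapped root is put (see
 * kvm_tdp_mmu_put_root() below), so teardown waits for the outstanding
 * callbacks to finish, e.g. via an RCU barrier, before freeing VM state they
 * may still touch.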
54 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
56 free_page((unsigned long)sp->spt);
57 kmem_cache_free(mmu_page_header_cache, sp);
61 * This is called through call_rcu in order to free TDP page table memory
62 * safely with respect to other kernel threads that may be operating on
64 * By only accessing TDP MMU page table memory in an RCU read critical
65 * section, and freeing it after a grace period, lockless access to that
66 * memory won't use it after it is freed.
68 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
70 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
76 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
79 * Either read or write is okay, but mmu_lock must be held because
80 * writers are not required to take tdp_mmu_pages_lock.
82 lockdep_assert_held(&kvm->mmu_lock);
84 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 * The TDP MMU itself holds a reference to each root until the root is
89 * explicitly invalidated, i.e. the final reference should never be
90 * put for a valid root.
92 KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
94 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
95 list_del_rcu(&root->link);
96 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
97 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
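/*
 * Note the pairing: list_del_rcu() unlinks the root while lockless readers,
 * e.g. tdp_mmu_next_root(), may still be walking tdp_mmu_roots, and call_rcu()
 * defers freeing the root's shadow page until those readers are guaranteed to
 * have finished.
 */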
101 * Returns the next root after @prev_root (or the first root if @prev_root is
102 * NULL). A reference to the returned root is acquired, and the reference to
103 * @prev_root is released (the caller obviously must hold a reference to
104 * @prev_root if it's non-NULL).
106 * If @only_valid is true, invalid roots are skipped.
108 * Returns NULL if the end of tdp_mmu_roots was reached.
110 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
111 struct kvm_mmu_page *prev_root,
114 struct kvm_mmu_page *next_root;
117 * While the roots themselves are RCU-protected, fields such as
118 * role.invalid are protected by mmu_lock.
120 lockdep_assert_held(&kvm->mmu_lock);
125 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
127 typeof(*prev_root), link);
129 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
130 typeof(*next_root), link);
133 if ((!only_valid || !next_root->role.invalid) &&
134 kvm_tdp_mmu_get_root(next_root))
137 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
138 &next_root->link, typeof(*next_root), link);
144 kvm_tdp_mmu_put_root(kvm, prev_root);
150 * Note: this iterator gets and puts references to the roots it iterates over.
151 * This makes it safe to release the MMU lock and yield within the loop, but
152 * if exiting the loop early, the caller must drop the reference to the most
153 * recent root. (Unless keeping a live reference is desirable.)
155 * If shared is set, this function is operating under the MMU lock in read mode.
158 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
159 for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
161 _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
162 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
163 kvm_mmu_page_as_id(_root) != _as_id) { \
166 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
167 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
169 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
170 for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
172 _root = tdp_mmu_next_root(_kvm, _root, false)) \
173 if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
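/*
 * Typical usage of the yield-safe iterators, mirroring kvm_tdp_mmu_zap_leafs()
 * below (illustrative only):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 * The lockdep helper arbitrarily returns true so it can be folded into the
 * if-condition; roots that fail the filter (e.g. a different address space in
 * the _as_id variants) fall through to an empty branch and the caller's loop
 * body is skipped for them.
 */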
177 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
178 * the implication being that any flow that holds mmu_lock for read is
179 * inherently yield-friendly and should use the yield-safe variant above.
180 * Holding mmu_lock for write obviates the need for RCU protection as the list
181 * is guaranteed to be stable.
183 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
184 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
185 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
186 kvm_mmu_page_as_id(_root) != _as_id) { \
189 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
191 struct kvm_mmu_page *sp;
193 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
194 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
199 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
200 gfn_t gfn, union kvm_mmu_page_role role)
202 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
204 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
209 sp->tdp_mmu_page = true;
211 trace_kvm_mmu_get_page(sp, true);
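/*
 * Stashing the kvm_mmu_page pointer in the spt page's private data (see
 * set_page_private() above) is what lets sptep_to_sp() recover the owning
 * shadow page from a raw SPTE pointer, including during lockless walks.
 */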
214 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
215 struct tdp_iter *iter)
217 struct kvm_mmu_page *parent_sp;
218 union kvm_mmu_page_role role;
220 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
222 role = parent_sp->role;
225 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
228 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
230 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
231 struct kvm *kvm = vcpu->kvm;
232 struct kvm_mmu_page *root;
234 lockdep_assert_held_write(&kvm->mmu_lock);
237 * Check for an existing root before allocating a new one. Note, the
238 * role check prevents consuming an invalid root.
240 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
241 if (root->role.word == role.word &&
242 kvm_tdp_mmu_get_root(root))
246 root = tdp_mmu_alloc_sp(vcpu);
247 tdp_mmu_init_sp(root, NULL, 0, role);
250 * TDP MMU roots are kept until they are explicitly invalidated, either
251 * by a memslot update or by the destruction of the VM. Initialize the
252 * refcount to two; one reference for the vCPU, and one reference for
253 * the TDP MMU itself, which is held until the root is invalidated and
254 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
256 refcount_set(&root->tdp_mmu_root_count, 2);
258 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
259 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
260 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
263 return __pa(root->spt);
266 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
267 u64 old_spte, u64 new_spte, int level,
270 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
272 kvm_account_pgtable_pages((void *)sp->spt, +1);
273 atomic64_inc(&kvm->arch.tdp_mmu_pages);
276 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
278 kvm_account_pgtable_pages((void *)sp->spt, -1);
279 atomic64_dec(&kvm->arch.tdp_mmu_pages);
283 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
286 * @sp: the page to be removed
287 * @shared: This operation may not be running under the exclusive use of
288 * the MMU lock and the operation must synchronize with other
289 * threads that might be adding or removing pages.
291 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
294 tdp_unaccount_mmu_page(kvm, sp);
296 if (!sp->nx_huge_page_disallowed)
300 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
302 lockdep_assert_held_write(&kvm->mmu_lock);
304 sp->nx_huge_page_disallowed = false;
305 untrack_possible_nx_huge_page(kvm, sp);
308 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
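/*
 * When mmu_lock is held only for read (@shared), the per-VM
 * tdp_mmu_pages_lock taken above is what serializes updates to the possible
 * NX huge page list; when mmu_lock is held for write the lockdep assertion
 * suffices and the spinlock is not taken.
 */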
312 * handle_removed_pt() - handle a page table removed from the TDP structure
315 * @pt: the page removed from the paging structure
316 * @shared: This operation may not be running under the exclusive use
317 * of the MMU lock and the operation must synchronize with other
318 * threads that might be modifying SPTEs.
320 * Given a page table that has been removed from the TDP paging structure,
321 * iterates through the page table to clear SPTEs and free child page tables.
323 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
324 * protection. Since this thread removed it from the paging structure,
325 * this thread will be responsible for ensuring the page is freed. Hence the
326 * early rcu_dereferences in the function.
328 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
330 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
331 int level = sp->role.level;
332 gfn_t base_gfn = sp->gfn;
335 trace_kvm_mmu_prepare_zap_page(sp);
337 tdp_mmu_unlink_sp(kvm, sp, shared);
339 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
340 tdp_ptep_t sptep = pt + i;
341 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
346 * Set the SPTE to a nonpresent value that other
347 * threads will not overwrite. If the SPTE was
348 * already marked as removed then another thread
349 * handling a page fault could overwrite it, so
350 * retry until this thread's write transitions the SPTE from
351 * some other value to the removed SPTE value.
354 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
355 if (!is_removed_spte(old_spte))
361 * If the SPTE is not MMU-present, there is no backing
362 * page associated with the SPTE and so no side effects
363 * that need to be recorded, and exclusive ownership of
364 * mmu_lock ensures the SPTE can't be made present.
365 * Note, zapping MMIO SPTEs is also unnecessary as they
366 * are guarded by the memslots generation, not by being unreachable.
369 old_spte = kvm_tdp_mmu_read_spte(sptep);
370 if (!is_shadow_present_pte(old_spte))
374 * Use the common helper instead of a raw WRITE_ONCE as
375 * the SPTE needs to be updated atomically if it can be
376 * modified by a different vCPU outside of mmu_lock.
377 * Even though the parent SPTE is !PRESENT, the TLB
378 * hasn't yet been flushed, and both Intel and AMD
379 * document that A/D assists can use upper-level PxE
380 * entries that are cached in the TLB, i.e. the CPU can
381 * still access the page and mark it dirty.
383 * No retry is needed in the atomic update path as the
384 * sole concern is dropping a Dirty bit, i.e. no other
385 * task can zap/remove the SPTE as mmu_lock is held for
386 * write. Marking the SPTE as a removed SPTE is not
387 * strictly necessary for the same reason, but using
388 * the removed SPTE value keeps the shared/exclusive
389 * paths consistent and allows the handle_changed_spte()
390 * call below to hardcode the new value to REMOVED_SPTE.
392 * Note, even though dropping a Dirty bit is the only
393 * scenario where a non-atomic update could result in a
394 * functional bug, simply checking the Dirty bit isn't
395 * sufficient as a fast page fault could read the upper
396 * level SPTE before it is zapped, and then make this
397 * target SPTE writable, resume the guest, and set the
398 * Dirty bit between reading the SPTE above and writing it here.
401 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
402 REMOVED_SPTE, level);
404 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
405 old_spte, REMOVED_SPTE, level, shared);
408 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
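/*
 * Summary of the above: each child SPTE is frozen with REMOVED_SPTE (via the
 * atomic helper in the shared case, kvm_tdp_mmu_write_spte() otherwise), its
 * side effects are processed by handle_changed_spte(), and only then is the
 * page table page handed to RCU for freeing so lockless walkers stay safe.
 */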
412 * handle_changed_spte - handle bookkeeping associated with an SPTE change
414 * @as_id: the address space of the paging structure the SPTE was a part of
415 * @gfn: the base GFN that was mapped by the SPTE
416 * @old_spte: The value of the SPTE before the change
417 * @new_spte: The value of the SPTE after the change
418 * @level: the level of the PT the SPTE is part of in the paging structure
419 * @shared: This operation may not be running under the exclusive use of
420 * the MMU lock and the operation must synchronize with other
421 * threads that might be modifying SPTEs.
423 * Handle bookkeeping that might result from the modification of a SPTE. Note,
424 * dirty logging updates are handled in common code, not here (see make_spte()
425 * and fast_pf_fix_direct_spte()).
427 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
428 u64 old_spte, u64 new_spte, int level,
431 bool was_present = is_shadow_present_pte(old_spte);
432 bool is_present = is_shadow_present_pte(new_spte);
433 bool was_leaf = was_present && is_last_spte(old_spte, level);
434 bool is_leaf = is_present && is_last_spte(new_spte, level);
435 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
437 WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
438 WARN_ON_ONCE(level < PG_LEVEL_4K);
439 WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
442 * If this warning were to trigger it would indicate that there was a
443 * missing MMU notifier or a race with some notifier handler.
444 * A present, leaf SPTE should never be directly replaced with another
445 * present leaf SPTE pointing to a different PFN. A notifier handler
446 * should be zapping the SPTE before the main MM's page table is
447 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
448 * thread before replacement.
450 if (was_leaf && is_leaf && pfn_changed) {
451 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
452 "SPTE with another present leaf SPTE mapping a\n"
454 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
455 as_id, gfn, old_spte, new_spte, level);
458 * Crash the host to prevent error propagation and guest data corruption.
464 if (old_spte == new_spte)
467 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
470 check_spte_writable_invariants(new_spte);
473 * The only time a SPTE should be changed from a non-present to a
474 * non-present state is when an MMIO entry is installed/modified/
475 * removed. In that case, there is nothing to do here.
477 if (!was_present && !is_present) {
479 * If this change does not involve a MMIO SPTE or removed SPTE,
480 * it is unexpected. Log the change, though it should not
481 * impact the guest since both the former and current SPTEs are nonpresent.
484 if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
485 !is_mmio_spte(new_spte) &&
486 !is_removed_spte(new_spte)))
487 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
488 "should not be replaced with another,\n"
489 "different nonpresent SPTE, unless one or both\n"
490 "are MMIO SPTEs, or the new SPTE is\n"
491 "a temporary removed SPTE.\n"
492 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
493 as_id, gfn, old_spte, new_spte, level);
497 if (is_leaf != was_leaf)
498 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
500 if (was_leaf && is_dirty_spte(old_spte) &&
501 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
502 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
505 * Recursively handle child PTs if the change removed a subtree from
506 * the paging structure. Note the WARN on the PFN changing without the
507 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
508 * pages are kernel allocations and should never be migrated.
510 if (was_present && !was_leaf &&
511 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
512 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
514 if (was_leaf && is_accessed_spte(old_spte) &&
515 (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
516 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
520 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
521 * and handle the associated bookkeeping. Do not mark the page dirty
522 * in KVM's dirty bitmaps.
524 * If setting the SPTE fails because it has changed, iter->old_spte will be
525 * refreshed to the current value of the spte.
528 * @iter: a tdp_iter instance currently on the SPTE that should be set
529 * @new_spte: The value the SPTE should be set to
531 * * 0 - If the SPTE was set.
532 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
533 * no side-effects other than setting iter->old_spte to the last
534 * known value of the spte.
536 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
537 struct tdp_iter *iter,
540 u64 *sptep = rcu_dereference(iter->sptep);
543 * The caller is responsible for ensuring the old SPTE is not a REMOVED
544 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
545 * and pre-checking before inserting a new SPTE is advantageous as it
546 * avoids unnecessary work.
548 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
550 lockdep_assert_held_read(&kvm->mmu_lock);
553 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
554 * does not hold the mmu_lock. On failure, i.e. if a different logical
555 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
556 * the current value, so the caller operates on fresh data, e.g. if it
557 * retries tdp_mmu_set_spte_atomic().
559 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
562 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
563 new_spte, iter->level, true);
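/*
 * Callers typically retry on -EBUSY, e.g. (illustrative sketch mirroring the
 * write-protection and dirty-clearing loops later in this file):
 *
 *	if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *		goto retry;
 *
 * iter->old_spte was refreshed by the failed cmpxchg, so the retry can
 * re-evaluate the SPTE without re-reading it.
 */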
568 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
569 struct tdp_iter *iter)
574 * Freeze the SPTE by setting it to a special,
575 * non-present value. This will stop other threads from
576 * immediately installing a present entry in its place
577 * before the TLBs are flushed.
579 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
583 kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
586 * No other thread can overwrite the removed SPTE as they must either
587 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
588 * overwrite the special removed SPTE value. No bookkeeping is needed
589 * here since the SPTE is going from non-present to non-present. Use
590 * the raw write helper to avoid an unnecessary check on volatile bits.
592 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
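/*
 * Net effect: old_spte -> REMOVED_SPTE -> 0, with a remote TLB flush in
 * between, so no vCPU can install a new present entry while stale
 * translations for the old one might still be cached.
 */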
599 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
601 * @as_id: Address space ID, i.e. regular vs. SMM
602 * @sptep: Pointer to the SPTE
603 * @old_spte: The current value of the SPTE
604 * @new_spte: The new value that will be set for the SPTE
605 * @gfn: The base GFN that was (or will be) mapped by the SPTE
606 * @level: The level _containing_ the SPTE (its parent PT's level)
608 * Returns the old SPTE value, which _may_ be different than @old_spte if the
609 * SPTE had volatile bits.
611 static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
612 u64 old_spte, u64 new_spte, gfn_t gfn, int level)
614 lockdep_assert_held_write(&kvm->mmu_lock);
617 * No thread should be using this function to set SPTEs to or from the
618 * temporary removed SPTE value.
619 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
620 * should be used. If operating under the MMU lock in write mode, the
621 * use of the removed SPTE should not be necessary.
623 WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
625 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
627 handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
631 static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
634 WARN_ON_ONCE(iter->yielded);
635 iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
636 iter->old_spte, new_spte,
637 iter->gfn, iter->level);
640 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
641 for_each_tdp_pte(_iter, _root, _start, _end)
643 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
644 tdp_root_for_each_pte(_iter, _root, _start, _end) \
645 if (!is_shadow_present_pte(_iter.old_spte) || \
646 !is_last_spte(_iter.old_spte, _iter.level)) \
650 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
651 for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
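/*
 * Unlike the root-list iterators above, tdp_mmu_for_each_pte() starts from the
 * vCPU's current root (_mmu->root.hpa) and walks only the SPTEs covering
 * [_start, _end); it is used by the page fault handler and by the lockless
 * walkers at the bottom of this file.
 */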
654 * Yield if the MMU lock is contended or this thread needs to return control
657 * If this function should yield and flush is set, it will perform a remote
658 * TLB flush before yielding.
660 * If this function yields, iter->yielded is set and the caller must skip to
661 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
662 * over the paging structures to allow the iterator to continue its traversal
663 * from the paging structure root.
665 * Returns true if this function yielded.
667 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
668 struct tdp_iter *iter,
669 bool flush, bool shared)
671 WARN_ON_ONCE(iter->yielded);
673 /* Ensure forward progress has been made before yielding. */
674 if (iter->next_last_level_gfn == iter->yielded_gfn)
677 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
679 kvm_flush_remote_tlbs(kvm);
684 cond_resched_rwlock_read(&kvm->mmu_lock);
686 cond_resched_rwlock_write(&kvm->mmu_lock);
690 WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
692 iter->yielded = true;
695 return iter->yielded;
698 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
701 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
702 * a gpa range that would exceed the max gfn, and KVM does not create
703 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
704 * the slow emulation path every time.
706 return kvm_mmu_max_gfn() + 1;
709 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
710 bool shared, int zap_level)
712 struct tdp_iter iter;
714 gfn_t end = tdp_mmu_max_gfn_exclusive();
717 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
719 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
722 if (!is_shadow_present_pte(iter.old_spte))
725 if (iter.level > zap_level)
729 tdp_mmu_iter_set_spte(kvm, &iter, 0);
730 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
735 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
740 * The root must have an elevated refcount so that it's reachable via
741 * mmu_notifier callbacks, which allows this path to yield and drop
742 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
743 * must drop all references to relevant pages prior to completing the
744 * callback. Dropping mmu_lock with an unreachable root would result
745 * in zapping SPTEs after a relevant mmu_notifier callback completes
746 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
747 * dirty accessed bits to the SPTE's associated struct page.
749 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
751 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
756 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
757 * split the zap into two passes. On the first pass, zap at the 1gb
758 * level, and then zap top-level SPs on the second pass. "1gb" is not
759 * arbitrary, as KVM must be able to zap a 1gb shadow page without
760 * inducing a stall to allow in-place replacement with a 1gb hugepage.
762 * Because zapping a SP recurses on its children, stepping down to
763 * PG_LEVEL_4K in the iterator itself is unnecessary.
765 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
766 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
771 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
776 * This helper intentionally doesn't allow zapping a root shadow page,
777 * which doesn't have a parent page table and thus no associated entry.
779 if (WARN_ON_ONCE(!sp->ptep))
782 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
783 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
786 tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
787 sp->gfn, sp->role.level + 1);
793 * If can_yield is true, will release the MMU lock and reschedule if the
794 * scheduler needs the CPU or there is contention on the MMU lock. If this
795 * function cannot yield, it will not release the MMU lock or reschedule and
796 * the caller must ensure it does not supply too large a GFN range, or the
797 * operation can cause a soft lockup.
799 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
800 gfn_t start, gfn_t end, bool can_yield, bool flush)
802 struct tdp_iter iter;
804 end = min(end, tdp_mmu_max_gfn_exclusive());
806 lockdep_assert_held_write(&kvm->mmu_lock);
810 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
812 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
817 if (!is_shadow_present_pte(iter.old_spte) ||
818 !is_last_spte(iter.old_spte, iter.level))
821 tdp_mmu_iter_set_spte(kvm, &iter, 0);
828 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
829 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
835 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
836 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
837 * more SPTEs were zapped since the MMU lock was last acquired.
839 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
841 struct kvm_mmu_page *root;
843 for_each_tdp_mmu_root_yield_safe(kvm, root, false)
844 flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
849 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
851 struct kvm_mmu_page *root;
854 * Zap all roots, including invalid roots, as all SPTEs must be dropped
855 * before returning to the caller. Zap directly even if the root is
856 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
857 * all that expensive and mmu_lock is already held, which means the
858 * worker has yielded, i.e. flushing the work instead of zapping here
859 * isn't guaranteed to be any faster.
861 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
862 * is being destroyed or the userspace VMM has exited. In both cases,
863 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
865 for_each_tdp_mmu_root_yield_safe(kvm, root, false)
866 tdp_mmu_zap_root(kvm, root, false);
870 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast zap" completes.
873 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
875 struct kvm_mmu_page *root;
877 read_lock(&kvm->mmu_lock);
879 for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
880 if (!root->tdp_mmu_scheduled_root_to_zap)
883 root->tdp_mmu_scheduled_root_to_zap = false;
884 KVM_BUG_ON(!root->role.invalid, kvm);
887 * A TLB flush is not necessary as KVM performs a local TLB
888 * flush when allocating a new root (see kvm_mmu_load()), and
889 * when migrating a vCPU to a different pCPU. Note, the local
890 * TLB flush on reuse also invalidates paging-structure-cache
891 * entries, i.e. TLB entries for intermediate paging structures,
892 * that may be zapped, as such entries are associated with the
893 * ASID on both VMX and SVM.
895 tdp_mmu_zap_root(kvm, root, true);
898 * The reference needs to be put *after* zapping the root, as
899 * the root must be reachable by mmu_notifiers while it's being zapped.
902 kvm_tdp_mmu_put_root(kvm, root);
905 read_unlock(&kvm->mmu_lock);
909 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
910 * is about to be zapped, e.g. in response to a memslots update. The actual
911 * zapping is done separately so that it happens with mmu_lock held for read,
912 * whereas invalidating roots must be done with mmu_lock held for write (unless
913 * the VM is being destroyed).
915 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
916 * See kvm_tdp_mmu_get_vcpu_root_hpa().
918 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
920 struct kvm_mmu_page *root;
923 * mmu_lock must be held for write to ensure that a root doesn't become
924 * invalid while there are active readers (invalidating a root while
925 * there are active readers may or may not be problematic in practice,
926 * but it's uncharted territory and not supported).
928 * Waive the assertion if there are no users of @kvm, i.e. the VM is
929 * being destroyed after all references have been put, or if no vCPUs
930 * have been created (which means there are no roots), i.e. the VM is
931 * being destroyed in an error path of KVM_CREATE_VM.
933 if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
934 refcount_read(&kvm->users_count) && kvm->created_vcpus)
935 lockdep_assert_held_write(&kvm->mmu_lock);
938 * As above, mmu_lock isn't held when destroying the VM! There can't
939 * be other references to @kvm, i.e. nothing else can invalidate roots
940 * or get/put references to roots.
942 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
944 * Note, invalid roots can outlive a memslot update! Invalid
945 * roots must be *zapped* before the memslot update completes,
946 * but a different task can acquire a reference and keep the
947 * root alive after it's been zapped.
949 if (!root->role.invalid) {
950 root->tdp_mmu_scheduled_root_to_zap = true;
951 root->role.invalid = true;
957 * Installs a last-level SPTE to handle a TDP page fault.
958 * (NPT/EPT violation/misconfiguration)
960 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
961 struct kvm_page_fault *fault,
962 struct tdp_iter *iter)
964 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
966 int ret = RET_PF_FIXED;
969 if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
972 if (unlikely(!fault->slot))
973 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
975 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
976 fault->pfn, iter->old_spte, fault->prefetch, true,
977 fault->map_writable, &new_spte);
979 if (new_spte == iter->old_spte)
980 ret = RET_PF_SPURIOUS;
981 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
983 else if (is_shadow_present_pte(iter->old_spte) &&
984 !is_last_spte(iter->old_spte, iter->level))
985 kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
988 * If the page fault was caused by a write but the page is write
989 * protected, emulation is needed. If the emulation was skipped,
990 * the vCPU would have the same fault again.
994 ret = RET_PF_EMULATE;
997 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
998 if (unlikely(is_mmio_spte(new_spte))) {
999 vcpu->stat.pf_mmio_spte_created++;
1000 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1002 ret = RET_PF_EMULATE;
1004 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1005 rcu_dereference(iter->sptep));
1012 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1013 * provided page table.
1015 * @kvm: kvm instance
1016 * @iter: a tdp_iter instance currently on the SPTE that should be set
1017 * @sp: The new TDP page table to install.
1018 * @shared: This operation is running under the MMU lock in read mode.
1020 * Returns: 0 if the new page table was installed. Non-0 if the page table
1021 * could not be installed (e.g. the atomic compare-exchange failed).
1023 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1024 struct kvm_mmu_page *sp, bool shared)
1026 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1030 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1034 tdp_mmu_iter_set_spte(kvm, iter, spte);
1037 tdp_account_mmu_page(kvm, sp);
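/*
 * When @shared is true the new SPTE is installed with tdp_mmu_set_spte_atomic()
 * and any failure is returned to the caller; with mmu_lock held for write,
 * tdp_mmu_iter_set_spte() is used and the link cannot fail. On success the new
 * page table is accounted before returning.
 */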
1042 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1043 struct kvm_mmu_page *sp, bool shared);
1046 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1047 * page tables and SPTEs to translate the faulting guest physical address.
1049 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1051 struct kvm_mmu *mmu = vcpu->arch.mmu;
1052 struct kvm *kvm = vcpu->kvm;
1053 struct tdp_iter iter;
1054 struct kvm_mmu_page *sp;
1055 int ret = RET_PF_RETRY;
1057 kvm_mmu_hugepage_adjust(vcpu, fault);
1059 trace_kvm_mmu_spte_requested(fault);
1063 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1066 if (fault->nx_huge_page_workaround_enabled)
1067 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1070 * If SPTE has been frozen by another thread, just give up and
1071 * retry, avoiding unnecessary page table allocation and free.
1073 if (is_removed_spte(iter.old_spte))
1076 if (iter.level == fault->goal_level)
1077 goto map_target_level;
1079 /* Step down into the lower level page table if it exists. */
1080 if (is_shadow_present_pte(iter.old_spte) &&
1081 !is_large_pte(iter.old_spte))
1085 * The SPTE is either non-present or points to a huge page that
1086 * needs to be split.
1088 sp = tdp_mmu_alloc_sp(vcpu);
1089 tdp_mmu_init_child_sp(sp, &iter);
1091 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1093 if (is_shadow_present_pte(iter.old_spte))
1094 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1096 r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1099 * Force the guest to retry if installing an upper level SPTE
1100 * failed, e.g. because a different task modified the SPTE.
1103 tdp_mmu_free_sp(sp);
1107 if (fault->huge_page_disallowed &&
1108 fault->req_level >= iter.level) {
1109 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1110 if (sp->nx_huge_page_disallowed)
1111 track_possible_nx_huge_page(kvm, sp);
1112 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1117 * The walk aborted before reaching the target level, e.g. because the
1118 * iterator detected an upper level SPTE was frozen during traversal.
1120 WARN_ON_ONCE(iter.level == fault->goal_level);
1124 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1131 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1134 struct kvm_mmu_page *root;
1136 __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
1137 flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1138 range->may_block, flush);
1143 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1144 struct kvm_gfn_range *range);
1146 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1147 struct kvm_gfn_range *range,
1148 tdp_handler_t handler)
1150 struct kvm_mmu_page *root;
1151 struct tdp_iter iter;
1155 * Don't support rescheduling, none of the MMU notifiers that funnel
1156 * into this helper allow blocking; it'd be dead, wasteful code.
1158 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1161 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1162 ret |= handler(kvm, &iter, range);
1171 * Mark the SPTEs in the range of GFNs [start, end) as unaccessed and return non-zero
1172 * if any of the GFNs in the range have been accessed.
1174 * No need to mark the corresponding PFN as accessed as this call is coming
1175 * from the clear_young() or clear_flush_young() notifier, which uses the
1176 * return value to determine if the page has been accessed.
1178 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1179 struct kvm_gfn_range *range)
1183 /* If we have a non-accessed entry we don't need to change the pte. */
1184 if (!is_accessed_spte(iter->old_spte))
1187 if (spte_ad_enabled(iter->old_spte)) {
1188 iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1190 shadow_accessed_mask,
1192 new_spte = iter->old_spte & ~shadow_accessed_mask;
1195 * Capture the dirty status of the page, so that it doesn't get
1196 * lost when the SPTE is marked for access tracking.
1198 if (is_writable_pte(iter->old_spte))
1199 kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
1201 new_spte = mark_spte_for_access_track(iter->old_spte);
1202 iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1203 iter->old_spte, new_spte,
1207 trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1208 iter->old_spte, new_spte);
1212 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1214 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1217 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1218 struct kvm_gfn_range *range)
1220 return is_accessed_spte(iter->old_spte);
1223 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1225 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1228 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1229 struct kvm_gfn_range *range)
1233 /* Huge pages aren't expected to be modified without first being zapped. */
1234 WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
1236 if (iter->level != PG_LEVEL_4K ||
1237 !is_shadow_present_pte(iter->old_spte))
1241 * Note, when changing a read-only SPTE, it's not strictly necessary to
1242 * zero the SPTE before setting the new PFN, but doing so preserves the
1243 * invariant that the PFN of a present leaf SPTE can never change.
1244 * See handle_changed_spte().
1246 tdp_mmu_iter_set_spte(kvm, iter, 0);
1248 if (!pte_write(range->arg.pte)) {
1249 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1250 pte_pfn(range->arg.pte));
1252 tdp_mmu_iter_set_spte(kvm, iter, new_spte);
1259 * Handle the changed_pte MMU notifier for the TDP MMU.
1260 * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1262 * Returns non-zero if a flush is needed before releasing the MMU lock.
1264 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1267 * No need to handle the remote TLB flush under RCU protection, the
1268 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1269 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
1271 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1275 * Remove write access from all SPTEs at or above min_level that map GFNs
1276 * [start, end). Returns true if an SPTE has been changed and the TLBs need to be flushed.
1279 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1280 gfn_t start, gfn_t end, int min_level)
1282 struct tdp_iter iter;
1284 bool spte_set = false;
1288 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1290 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1292 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1295 if (!is_shadow_present_pte(iter.old_spte) ||
1296 !is_last_spte(iter.old_spte, iter.level) ||
1297 !(iter.old_spte & PT_WRITABLE_MASK))
1300 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1302 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1313 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1314 * only affect leaf SPTEs down to min_level.
1315 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1317 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1318 const struct kvm_memory_slot *slot, int min_level)
1320 struct kvm_mmu_page *root;
1321 bool spte_set = false;
1323 lockdep_assert_held_read(&kvm->mmu_lock);
1325 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1326 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1327 slot->base_gfn + slot->npages, min_level);
1332 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1334 struct kvm_mmu_page *sp;
1338 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1342 sp->spt = (void *)__get_free_page(gfp);
1344 kmem_cache_free(mmu_page_header_cache, sp);
1351 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1352 struct tdp_iter *iter,
1355 struct kvm_mmu_page *sp;
1358 * Since we are allocating while under the MMU lock we have to be
1359 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1360 * reclaim and to avoid making any filesystem callbacks (which can end
1361 * up invoking KVM MMU notifiers, resulting in a deadlock).
1363 * If this allocation fails we drop the lock and retry with reclaim allowed.
1366 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1373 read_unlock(&kvm->mmu_lock);
1375 write_unlock(&kvm->mmu_lock);
1377 iter->yielded = true;
1378 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1381 read_lock(&kvm->mmu_lock);
1383 write_lock(&kvm->mmu_lock);
1390 /* Note, the caller is responsible for initializing @sp. */
1391 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1392 struct kvm_mmu_page *sp, bool shared)
1394 const u64 huge_spte = iter->old_spte;
1395 const int level = iter->level;
1399 * No need for atomics when writing to sp->spt since the page table has
1400 * not been linked in yet and thus is not reachable from any other CPU.
1402 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1403 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1406 * Replace the huge spte with a pointer to the populated lower level
1407 * page table. Since we are making this change without a TLB flush vCPUs
1408 * will see a mix of the split mappings and the original huge mapping,
1409 * depending on what's currently in their TLB. This is fine from a
1410 * correctness standpoint since the translation will be the same either way.
1413 ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1418 * tdp_mmu_link_sp() will handle subtracting the huge page we
1419 * are overwriting from the page stats. But we have to manually update
1420 * the page stats with the new present child pages.
1422 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1425 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1429 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1430 struct kvm_mmu_page *root,
1431 gfn_t start, gfn_t end,
1432 int target_level, bool shared)
1434 struct kvm_mmu_page *sp = NULL;
1435 struct tdp_iter iter;
1441 * Traverse the page table splitting all huge pages above the target
1442 * level into one lower level. For example, if we encounter a 1GB page
1443 * we split it into 512 2MB pages.
1445 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1446 * to visit an SPTE before ever visiting its children, which means we
1447 * will correctly recursively split huge pages that are more than one
1448 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1449 * and then splitting each of those to 512 4KB pages).
1451 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1453 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1456 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1460 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1463 trace_kvm_mmu_split_huge_page(iter.gfn,
1473 tdp_mmu_init_child_sp(sp, &iter);
1475 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1484 * It's possible to exit the loop having never used the last sp if, for
1485 * example, a vCPU doing HugePage NX splitting wins the race and
1486 * installs its own sp in place of the last sp we tried to split.
1489 tdp_mmu_free_sp(sp);
1496 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1498 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1499 const struct kvm_memory_slot *slot,
1500 gfn_t start, gfn_t end,
1501 int target_level, bool shared)
1503 struct kvm_mmu_page *root;
1506 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1508 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1509 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1511 kvm_tdp_mmu_put_root(kvm, root);
1518 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1519 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1520 * If AD bits are not enabled, this will require clearing the writable bit on
1521 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1524 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1525 gfn_t start, gfn_t end)
1527 u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1528 struct tdp_iter iter;
1529 bool spte_set = false;
1533 tdp_root_for_each_pte(iter, root, start, end) {
1535 if (!is_shadow_present_pte(iter.old_spte) ||
1536 !is_last_spte(iter.old_spte, iter.level))
1539 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1542 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1543 spte_ad_need_write_protect(iter.old_spte));
1545 if (!(iter.old_spte & dbit))
1548 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1559 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1560 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1561 * If AD bits are not enabled, this will require clearing the writable bit on
1562 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1565 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1566 const struct kvm_memory_slot *slot)
1568 struct kvm_mmu_page *root;
1569 bool spte_set = false;
1571 lockdep_assert_held_read(&kvm->mmu_lock);
1573 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1574 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1575 slot->base_gfn + slot->npages);
1581 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1582 * set in mask, starting at gfn. The given memslot is expected to contain all
1583 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1584 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1585 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1587 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1588 gfn_t gfn, unsigned long mask, bool wrprot)
1590 u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1592 struct tdp_iter iter;
1594 lockdep_assert_held_write(&kvm->mmu_lock);
1598 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1599 gfn + BITS_PER_LONG) {
1603 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1604 spte_ad_need_write_protect(iter.old_spte));
1606 if (iter.level > PG_LEVEL_4K ||
1607 !(mask & (1UL << (iter.gfn - gfn))))
1610 mask &= ~(1UL << (iter.gfn - gfn));
1612 if (!(iter.old_spte & dbit))
1615 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1616 iter.old_spte, dbit,
1619 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1621 iter.old_spte & ~dbit);
1622 kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1629 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1630 * set in mask, starting at gfn. The given memslot is expected to contain all
1631 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1632 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1633 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1635 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1636 struct kvm_memory_slot *slot,
1637 gfn_t gfn, unsigned long mask,
1640 struct kvm_mmu_page *root;
1642 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1643 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1646 static void zap_collapsible_spte_range(struct kvm *kvm,
1647 struct kvm_mmu_page *root,
1648 const struct kvm_memory_slot *slot)
1650 gfn_t start = slot->base_gfn;
1651 gfn_t end = start + slot->npages;
1652 struct tdp_iter iter;
1653 int max_mapping_level;
1657 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1659 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1662 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1663 !is_shadow_present_pte(iter.old_spte))
1667 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1668 * a large page size, then its parent would have been zapped
1669 * instead of stepping down.
1671 if (is_last_spte(iter.old_spte, iter.level))
1675 * If iter.gfn resides outside of the slot, i.e. the page for
1676 * the current level overlaps but is not contained by the slot,
1677 * then the SPTE can't be made huge. More importantly, trying
1678 * to query that info from slot->arch.lpage_info will cause an
1679 * out-of-bounds access.
1681 if (iter.gfn < start || iter.gfn >= end)
1684 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1685 iter.gfn, PG_LEVEL_NUM);
1686 if (max_mapping_level < iter.level)
1689 /* Note, a successful atomic zap also does a remote TLB flush. */
1690 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1698 * Zap non-leaf SPTEs (and free their associated page tables) which could
1699 * be replaced by huge pages, for GFNs within the slot.
1701 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1702 const struct kvm_memory_slot *slot)
1704 struct kvm_mmu_page *root;
1706 lockdep_assert_held_read(&kvm->mmu_lock);
1708 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1709 zap_collapsible_spte_range(kvm, root, slot);
1713 * Removes write access on the last level SPTE mapping this GFN and unsets the
1714 * MMU-writable bit to ensure future writes continue to be intercepted.
1715 * Returns true if an SPTE was set and a TLB flush is needed.
1717 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1718 gfn_t gfn, int min_level)
1720 struct tdp_iter iter;
1722 bool spte_set = false;
1724 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1728 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1729 if (!is_shadow_present_pte(iter.old_spte) ||
1730 !is_last_spte(iter.old_spte, iter.level))
1733 new_spte = iter.old_spte &
1734 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1736 if (new_spte == iter.old_spte)
1739 tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1749 * Removes write access on the last level SPTE mapping this GFN and unsets the
1750 * MMU-writable bit to ensure future writes continue to be intercepted.
1751 * Returns true if an SPTE was set and a TLB flush is needed.
1753 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1754 struct kvm_memory_slot *slot, gfn_t gfn,
1757 struct kvm_mmu_page *root;
1758 bool spte_set = false;
1760 lockdep_assert_held_write(&kvm->mmu_lock);
1761 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1762 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1768 * Return the level of the lowest level SPTE added to sptes.
1769 * That SPTE may be non-present.
1771 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1773 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1776 struct tdp_iter iter;
1777 struct kvm_mmu *mmu = vcpu->arch.mmu;
1778 gfn_t gfn = addr >> PAGE_SHIFT;
1781 *root_level = vcpu->arch.mmu->root_role.level;
1783 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1785 sptes[leaf] = iter.old_spte;
1792 * Returns the last level spte pointer of the shadow page walk for the given
1793 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1794 * walk could be performed, returns NULL and *spte does not contain valid data.
1797 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1798 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1800 * WARNING: This function is only intended to be called during fast_page_fault.
1802 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1805 struct tdp_iter iter;
1806 struct kvm_mmu *mmu = vcpu->arch.mmu;
1807 gfn_t gfn = addr >> PAGE_SHIFT;
1808 tdp_ptep_t sptep = NULL;
1810 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1811 *spte = iter.old_spte;
1816 * Perform the rcu_dereference to get the raw spte pointer value since
1817 * we are passing it up to fast_page_fault, which is shared with the
1818 * legacy MMU and thus does not retain the TDP MMU-specific __rcu annotation.
1821 * This is safe since fast_page_fault obeys the contracts of this
1822 * function as well as all TDP MMU contracts around modifying SPTEs
1823 * outside of mmu_lock.
1825 return rcu_dereference(sptep);