KVM: x86/mmu: Remove spurious TLB flush from TDP MMU's change_pte() hook
arch/x86/kvm/mmu/tdp_mmu.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
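
/*
 * Illustrative note (not part of the original file): with the default above
 * the TDP MMU is opt-in; it can be enabled at module load time with e.g.
 * "modprobe kvm tdp_mmu=1", and it also requires hardware TDP (EPT/NPT) to
 * be in use, see kvm_mmu_init_tdp_mmu() below.
 */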
15
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19         if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20                 return;
21
22         /* This should not be changed for the lifetime of the VM. */
23         kvm->arch.tdp_mmu_enabled = true;
24
25         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26         spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27         INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 }
29
30 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
31 {
32         if (!kvm->arch.tdp_mmu_enabled)
33                 return;
34
35         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
36
37         /*
38          * Ensure that all the outstanding RCU callbacks to free shadow pages
39          * can run before the VM is torn down.
40          */
41         rcu_barrier();
42 }
43
44 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
45 {
46         if (kvm_mmu_put_root(kvm, root))
47                 kvm_tdp_mmu_free_root(kvm, root);
48 }
49
50 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
51                                            struct kvm_mmu_page *root)
52 {
53         lockdep_assert_held_write(&kvm->mmu_lock);
54
55         if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
56                 return false;
57
58         kvm_mmu_get_root(kvm, root);
59         return true;
60
61 }
62
63 static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
64                                                      struct kvm_mmu_page *root)
65 {
66         struct kvm_mmu_page *next_root;
67
68         next_root = list_next_entry(root, link);
69         tdp_mmu_put_root(kvm, root);
70         return next_root;
71 }
72
73 /*
74  * Note: this iterator gets and puts references to the roots it iterates over.
75  * This makes it safe to release the MMU lock and yield within the loop, but
76  * if exiting the loop early, the caller must drop the reference to the most
77  * recent root. (Unless keeping a live reference is desirable.)
78  */
79 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                           \
80         for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
81                                       typeof(*_root), link);            \
82              tdp_mmu_next_root_valid(_kvm, _root);                      \
83              _root = tdp_mmu_next_root(_kvm, _root))
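
/*
 * Illustrative sketch (not part of the original file): a caller that breaks
 * out of the loop early must drop the reference the iterator took on the
 * current root; root_matches() below is a hypothetical predicate:
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root) {
 *		if (root_matches(root)) {
 *			tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 */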
84
85 #define for_each_tdp_mmu_root(_kvm, _root)                              \
86         list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
87
88 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
89                           gfn_t start, gfn_t end, bool can_yield);
90
91 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
92 {
93         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
94
95         lockdep_assert_held_write(&kvm->mmu_lock);
96
97         WARN_ON(root->root_count);
98         WARN_ON(!root->tdp_mmu_page);
99
100         list_del(&root->link);
101
102         zap_gfn_range(kvm, root, 0, max_gfn, false);
103
104         free_page((unsigned long)root->spt);
105         kmem_cache_free(mmu_page_header_cache, root);
106 }
107
108 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
109                                                    int level)
110 {
111         union kvm_mmu_page_role role;
112
113         role = vcpu->arch.mmu->mmu_role.base;
114         role.level = level;
115         role.direct = true;
116         role.gpte_is_8_bytes = true;
117         role.access = ACC_ALL;
118
119         return role;
120 }
121
122 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
123                                                int level)
124 {
125         struct kvm_mmu_page *sp;
126
127         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
128         sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
129         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
130
131         sp->role.word = page_role_for_level(vcpu, level).word;
132         sp->gfn = gfn;
133         sp->tdp_mmu_page = true;
134
135         trace_kvm_mmu_get_page(sp, true);
136
137         return sp;
138 }
139
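/*
 * Returns the physical address of a TDP MMU root for the vCPU, reusing an
 * existing root with a matching role if one exists and allocating a new one
 * otherwise. The caller must hold mmu_lock for write.
 */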
140 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
141 {
142         union kvm_mmu_page_role role;
143         struct kvm *kvm = vcpu->kvm;
144         struct kvm_mmu_page *root;
145
146         lockdep_assert_held_write(&kvm->mmu_lock);
147
148         role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
149
150         /* Check for an existing root before allocating a new one. */
151         for_each_tdp_mmu_root(kvm, root) {
152                 if (root->role.word == role.word) {
153                         kvm_mmu_get_root(kvm, root);
154                         goto out;
155                 }
156         }
157
158         root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
159         root->root_count = 1;
160
161         list_add(&root->link, &kvm->arch.tdp_mmu_roots);
162
163 out:
164         return __pa(root->spt);
165 }
166
167 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
168 {
169         free_page((unsigned long)sp->spt);
170         kmem_cache_free(mmu_page_header_cache, sp);
171 }
172
173 /*
174  * This is called through call_rcu in order to free TDP page table memory
175  * safely with respect to other kernel threads that may be operating on
176  * the memory.
177  * By only accessing TDP MMU page table memory in an RCU read critical
178  * section, and freeing it after a grace period, lockless access to that
179  * memory won't use it after it is freed.
180  */
181 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
182 {
183         struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
184                                                rcu_head);
185
186         tdp_mmu_free_sp(sp);
187 }
188
189 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
190                                 u64 old_spte, u64 new_spte, int level,
191                                 bool shared);
192
193 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
194 {
195         return sp->role.smm ? 1 : 0;
196 }
197
198 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
199 {
200         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
201
202         if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
203                 return;
204
205         if (is_accessed_spte(old_spte) &&
206             (!is_accessed_spte(new_spte) || pfn_changed))
207                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
208 }
209
210 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
211                                           u64 old_spte, u64 new_spte, int level)
212 {
213         bool pfn_changed;
214         struct kvm_memory_slot *slot;
215
216         if (level > PG_LEVEL_4K)
217                 return;
218
219         pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
220
221         if ((!is_writable_pte(old_spte) || pfn_changed) &&
222             is_writable_pte(new_spte)) {
223                 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
224                 mark_page_dirty_in_slot(kvm, slot, gfn);
225         }
226 }
227
228 /**
229  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
230  *
231  * @kvm: kvm instance
232  * @sp: the new page
233  * @shared: This operation may not be running under the exclusive use of
234  *          the MMU lock and the operation must synchronize with other
235  *          threads that might be adding or removing pages.
236  * @account_nx: This page replaces a NX large page and should be marked for
237  *              eventual reclaim.
238  */
239 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
240                               bool shared, bool account_nx)
241 {
242         if (shared)
243                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
244         else
245                 lockdep_assert_held_write(&kvm->mmu_lock);
246
247         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
248         if (account_nx)
249                 account_huge_nx_page(kvm, sp);
250
251         if (shared)
252                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
253 }
254
255 /**
256  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
257  *
258  * @kvm: kvm instance
259  * @sp: the page to be removed
260  * @shared: This operation may not be running under the exclusive use of
261  *          the MMU lock and the operation must synchronize with other
262  *          threads that might be adding or removing pages.
263  */
264 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
265                                 bool shared)
266 {
267         if (shared)
268                 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
269         else
270                 lockdep_assert_held_write(&kvm->mmu_lock);
271
272         list_del(&sp->link);
273         if (sp->lpage_disallowed)
274                 unaccount_huge_nx_page(kvm, sp);
275
276         if (shared)
277                 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
278 }
279
280 /**
281  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
282  *
283  * @kvm: kvm instance
284  * @pt: the page removed from the paging structure
285  * @shared: This operation may not be running under the exclusive use
286  *          of the MMU lock and the operation must synchronize with other
287  *          threads that might be modifying SPTEs.
288  *
289  * Given a page table that has been removed from the TDP paging structure,
290  * iterates through the page table to clear SPTEs and free child page tables.
291  */
292 static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
293                                         bool shared)
294 {
295         struct kvm_mmu_page *sp = sptep_to_sp(pt);
296         int level = sp->role.level;
297         gfn_t base_gfn = sp->gfn;
298         u64 old_child_spte;
299         u64 *sptep;
300         gfn_t gfn;
301         int i;
302
303         trace_kvm_mmu_prepare_zap_page(sp);
304
305         tdp_mmu_unlink_page(kvm, sp, shared);
306
307         for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
308                 sptep = pt + i;
309                 gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
310
311                 if (shared) {
312                         /*
313                          * Set the SPTE to a nonpresent value that other
314                          * threads will not overwrite. If the SPTE was
315                          * already marked as removed then another thread
316                          * handling a page fault could overwrite it, so
317                          * keep setting the SPTE until it has been changed
318                          * from some other value to the removed SPTE value.
319                          */
320                         for (;;) {
321                                 old_child_spte = xchg(sptep, REMOVED_SPTE);
322                                 if (!is_removed_spte(old_child_spte))
323                                         break;
324                                 cpu_relax();
325                         }
326                 } else {
327                         /*
328                          * If the SPTE is not MMU-present, there is no backing
329                          * page associated with the SPTE and so no side effects
330                          * that need to be recorded, and exclusive ownership of
331                          * mmu_lock ensures the SPTE can't be made present.
332                          * Note, zapping MMIO SPTEs is also unnecessary as they
333                          * are guarded by the memslots generation, not by being
334                          * unreachable.
335                          */
336                         old_child_spte = READ_ONCE(*sptep);
337                         if (!is_shadow_present_pte(old_child_spte))
338                                 continue;
339
340                         /*
341                          * Marking the SPTE as a removed SPTE is not
342                          * strictly necessary here as the MMU lock will
343                          * stop other threads from concurrently modifying
344                          * this SPTE. Using the removed SPTE value keeps
345                          * the two branches consistent and simplifies
346                          * the function.
347                          */
348                         WRITE_ONCE(*sptep, REMOVED_SPTE);
349                 }
350                 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
351                                     old_child_spte, REMOVED_SPTE, level - 1,
352                                     shared);
353         }
354
355         kvm_flush_remote_tlbs_with_address(kvm, gfn,
356                                            KVM_PAGES_PER_HPAGE(level));
357
358         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
359 }
360
361 /**
362  * handle_changed_spte - handle bookkeeping associated with an SPTE change
363  * @kvm: kvm instance
364  * @as_id: the address space of the paging structure the SPTE was a part of
365  * @gfn: the base GFN that was mapped by the SPTE
366  * @old_spte: The value of the SPTE before the change
367  * @new_spte: The value of the SPTE after the change
368  * @level: the level of the PT the SPTE is part of in the paging structure
369  * @shared: This operation may not be running under the exclusive use of
370  *          the MMU lock and the operation must synchronize with other
371  *          threads that might be modifying SPTEs.
372  *
373  * Handle bookkeeping that might result from the modification of a SPTE.
374  * This function must be called for all TDP SPTE modifications.
375  */
376 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
377                                   u64 old_spte, u64 new_spte, int level,
378                                   bool shared)
379 {
380         bool was_present = is_shadow_present_pte(old_spte);
381         bool is_present = is_shadow_present_pte(new_spte);
382         bool was_leaf = was_present && is_last_spte(old_spte, level);
383         bool is_leaf = is_present && is_last_spte(new_spte, level);
384         bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
385
386         WARN_ON(level > PT64_ROOT_MAX_LEVEL);
387         WARN_ON(level < PG_LEVEL_4K);
388         WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
389
390         /*
391          * If this warning were to trigger it would indicate that there was a
392          * missing MMU notifier or a race with some notifier handler.
393          * A present, leaf SPTE should never be directly replaced with another
394          * present leaf SPTE pointing to a different PFN. A notifier handler
395          * should be zapping the SPTE before the main MM's page table is
396          * changed, or the SPTE should be zeroed, and the TLBs flushed by the
397          * thread before replacement.
398          */
399         if (was_leaf && is_leaf && pfn_changed) {
400                 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
401                        "SPTE with another present leaf SPTE mapping a\n"
402                        "different PFN!\n"
403                        "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
404                        as_id, gfn, old_spte, new_spte, level);
405
406                 /*
407                  * Crash the host to prevent error propagation and guest data
408                  * corruption.
409                  */
410                 BUG();
411         }
412
413         if (old_spte == new_spte)
414                 return;
415
416         trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
417
418         /*
419          * The only times a SPTE should be changed from a non-present to
420          * non-present state is when an MMIO entry is installed/modified/
421          * removed. In that case, there is nothing to do here.
422          */
423         if (!was_present && !is_present) {
424                 /*
425                  * If this change does not involve a MMIO SPTE or removed SPTE,
426                  * it is unexpected. Log the change, though it should not
427                  * impact the guest since both the former and current SPTEs
428                  * are nonpresent.
429                  */
430                 if (WARN_ON(!is_mmio_spte(old_spte) &&
431                             !is_mmio_spte(new_spte) &&
432                             !is_removed_spte(new_spte)))
433                         pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
434                                "should not be replaced with another,\n"
435                                "different nonpresent SPTE, unless one or both\n"
436                                "are MMIO SPTEs, or the new SPTE is\n"
437                                "a temporary removed SPTE.\n"
438                                "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
439                                as_id, gfn, old_spte, new_spte, level);
440                 return;
441         }
442
443
444         if (was_leaf && is_dirty_spte(old_spte) &&
445             (!is_dirty_spte(new_spte) || pfn_changed))
446                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
447
448         /*
449          * Recursively handle child PTs if the change removed a subtree from
450          * the paging structure.
451          */
452         if (was_present && !was_leaf && (pfn_changed || !is_present))
453                 handle_removed_tdp_mmu_page(kvm,
454                                 spte_to_child_pt(old_spte, level), shared);
455 }
456
457 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
458                                 u64 old_spte, u64 new_spte, int level,
459                                 bool shared)
460 {
461         __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
462                               shared);
463         handle_changed_spte_acc_track(old_spte, new_spte, level);
464         handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
465                                       new_spte, level);
466 }
467
468 /*
469  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
470  * associated bookkeeping
471  *
472  * @kvm: kvm instance
473  * @iter: a tdp_iter instance currently on the SPTE that should be set
474  * @new_spte: The value the SPTE should be set to
475  * Returns: true if the SPTE was set, false if it was not. If false is returned,
476  *          this function will have no side-effects.
477  */
478 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
479                                            struct tdp_iter *iter,
480                                            u64 new_spte)
481 {
482         u64 *root_pt = tdp_iter_root_pt(iter);
483         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
484         int as_id = kvm_mmu_page_as_id(root);
485
486         lockdep_assert_held_read(&kvm->mmu_lock);
487
488         /*
489          * Do not change removed SPTEs. Only the thread that froze the SPTE
490          * may modify it.
491          */
492         if (iter->old_spte == REMOVED_SPTE)
493                 return false;
494
495         if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
496                       new_spte) != iter->old_spte)
497                 return false;
498
499         handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
500                             iter->level, true);
501
502         return true;
503 }
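
/*
 * Illustrative sketch (not part of the original file): callers running under
 * the MMU lock in read mode treat a false return as a lost race with another
 * thread and typically retry, e.g. in a page fault handler:
 *
 *	if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *		return RET_PF_RETRY;
 */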
504
505 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
506                                            struct tdp_iter *iter)
507 {
508         /*
509          * Freeze the SPTE by setting it to a special,
510          * non-present value. This will stop other threads from
511          * immediately installing a present entry in its place
512          * before the TLBs are flushed.
513          */
514         if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
515                 return false;
516
517         kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
518                                            KVM_PAGES_PER_HPAGE(iter->level));
519
520         /*
521          * No other thread can overwrite the removed SPTE as they
522          * must either wait on the MMU lock or use
523                  * tdp_mmu_set_spte_atomic which will not overwrite the
524          * special removed SPTE value. No bookkeeping is needed
525          * here since the SPTE is going from non-present
526          * to non-present.
527          */
528         WRITE_ONCE(*iter->sptep, 0);
529
530         return true;
531 }
532
533
534 /*
535  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
536  * @kvm: kvm instance
537  * @iter: a tdp_iter instance currently on the SPTE that should be set
538  * @new_spte: The value the SPTE should be set to
539  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
540  *                    of the page. Should be set unless handling an MMU
541  *                    notifier for access tracking. Leaving record_acc_track
542  *                    unset in that case prevents page accesses from being
543  *                    double counted.
544  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
545  *                    appropriate for the change being made. Should be set
546  *                    unless performing certain dirty logging operations.
547  *                    Leaving record_dirty_log unset in that case prevents page
548  *                    writes from being double counted.
549  */
550 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
551                                       u64 new_spte, bool record_acc_track,
552                                       bool record_dirty_log)
553 {
554         tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
555         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
556         int as_id = kvm_mmu_page_as_id(root);
557
558         lockdep_assert_held_write(&kvm->mmu_lock);
559
560         /*
561          * No thread should be using this function to set SPTEs to the
562          * temporary removed SPTE value.
563          * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
564          * should be used. If operating under the MMU lock in write mode, the
565          * use of the removed SPTE should not be necessary.
566          */
567         WARN_ON(iter->old_spte == REMOVED_SPTE);
568
569         WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
570
571         __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
572                               iter->level, false);
573         if (record_acc_track)
574                 handle_changed_spte_acc_track(iter->old_spte, new_spte,
575                                               iter->level);
576         if (record_dirty_log)
577                 handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
578                                               iter->old_spte, new_spte,
579                                               iter->level);
580 }
581
582 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
583                                     u64 new_spte)
584 {
585         __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
586 }
587
588 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
589                                                  struct tdp_iter *iter,
590                                                  u64 new_spte)
591 {
592         __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
593 }
594
595 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
596                                                  struct tdp_iter *iter,
597                                                  u64 new_spte)
598 {
599         __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
600 }
601
602 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
603         for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
604
605 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)  \
606         tdp_root_for_each_pte(_iter, _root, _start, _end)               \
607                 if (!is_shadow_present_pte(_iter.old_spte) ||           \
608                     !is_last_spte(_iter.old_spte, _iter.level))         \
609                         continue;                                       \
610                 else
611
612 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
613         for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
614                          _mmu->shadow_root_level, _start, _end)
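
/*
 * Illustrative sketch (not part of the original file): walks using these
 * iterators are performed under rcu_read_lock() so that page table pages
 * freed via call_rcu() cannot be reclaimed mid-walk, e.g.:
 *
 *	rcu_read_lock();
 *	tdp_root_for_each_pte(iter, root, start, end) {
 *		// inspect iter.old_spte, iter.gfn, iter.level here
 *	}
 *	rcu_read_unlock();
 */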
615
616 /*
617  * Yield if the MMU lock is contended or this thread needs to return control
618  * to the scheduler.
619  *
620  * If this function should yield and flush is set, it will perform a remote
621  * TLB flush before yielding.
622  *
623  * If this function yields, it will also reset the tdp_iter's walk over the
624  * paging structure and the calling function should skip to the next
625  * iteration to allow the iterator to continue its traversal from the
626  * paging structure root.
627  *
628  * Return true if this function yielded and the iterator's traversal was reset.
629  * Return false if a yield was not needed.
630  */
631 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
632                                              struct tdp_iter *iter, bool flush)
633 {
634         /* Ensure forward progress has been made before yielding. */
635         if (iter->next_last_level_gfn == iter->yielded_gfn)
636                 return false;
637
638         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
639                 rcu_read_unlock();
640
641                 if (flush)
642                         kvm_flush_remote_tlbs(kvm);
643
644                 cond_resched_rwlock_write(&kvm->mmu_lock);
645                 rcu_read_lock();
646
647                 WARN_ON(iter->gfn > iter->next_last_level_gfn);
648
649                 tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
650                                iter->root_level, iter->min_level,
651                                iter->next_last_level_gfn);
652
653                 return true;
654         }
655
656         return false;
657 }
658
659 /*
660  * Tears down the mappings for the range of gfns, [start, end), and frees the
661  * non-root pages mapping GFNs strictly within that range. Returns true if
662  * SPTEs have been cleared and a TLB flush is needed before releasing the
663  * MMU lock.
664  * If can_yield is true, will release the MMU lock and reschedule if the
665  * scheduler needs the CPU or there is contention on the MMU lock. If this
666  * function cannot yield, it will not release the MMU lock or reschedule and
667  * the caller must ensure it does not supply too large a GFN range, or the
668  * operation can cause a soft lockup.
669  */
670 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
671                           gfn_t start, gfn_t end, bool can_yield)
672 {
673         struct tdp_iter iter;
674         bool flush_needed = false;
675
676         rcu_read_lock();
677
678         tdp_root_for_each_pte(iter, root, start, end) {
679                 if (can_yield &&
680                     tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
681                         flush_needed = false;
682                         continue;
683                 }
684
685                 if (!is_shadow_present_pte(iter.old_spte))
686                         continue;
687
688                 /*
689                  * If this is a non-last-level SPTE that covers a larger range
690                  * than should be zapped, continue, and zap the mappings at a
691                  * lower level.
692                  */
693                 if ((iter.gfn < start ||
694                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
695                     !is_last_spte(iter.old_spte, iter.level))
696                         continue;
697
698                 tdp_mmu_set_spte(kvm, &iter, 0);
699                 flush_needed = true;
700         }
701
702         rcu_read_unlock();
703         return flush_needed;
704 }
705
706 /*
707  * Tears down the mappings for the range of gfns, [start, end), and frees the
708  * non-root pages mapping GFNs strictly within that range. Returns true if
709  * SPTEs have been cleared and a TLB flush is needed before releasing the
710  * MMU lock.
711  */
712 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
713 {
714         struct kvm_mmu_page *root;
715         bool flush = false;
716
717         for_each_tdp_mmu_root_yield_safe(kvm, root)
718                 flush |= zap_gfn_range(kvm, root, start, end, true);
719
720         return flush;
721 }
722
723 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
724 {
725         gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
726         bool flush;
727
728         flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
729         if (flush)
730                 kvm_flush_remote_tlbs(kvm);
731 }
732
733 /*
734  * Installs a last-level SPTE to handle a TDP page fault.
735  * (NPT/EPT violation/misconfiguration)
736  */
737 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
738                                           int map_writable,
739                                           struct tdp_iter *iter,
740                                           kvm_pfn_t pfn, bool prefault)
741 {
742         u64 new_spte;
743         int ret = 0;
744         int make_spte_ret = 0;
745
746         if (unlikely(is_noslot_pfn(pfn)))
747                 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
748         else
749                 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
750                                          pfn, iter->old_spte, prefault, true,
751                                          map_writable, !shadow_accessed_mask,
752                                          &new_spte);
753
754         if (new_spte == iter->old_spte)
755                 ret = RET_PF_SPURIOUS;
756         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
757                 return RET_PF_RETRY;
758
759         /*
760          * If the page fault was caused by a write but the page is write
761          * protected, emulation is needed. If the emulation was skipped,
762          * the vCPU would have the same fault again.
763          */
764         if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
765                 if (write)
766                         ret = RET_PF_EMULATE;
767                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
768         }
769
770         /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
771         if (unlikely(is_mmio_spte(new_spte))) {
772                 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
773                                      new_spte);
774                 ret = RET_PF_EMULATE;
775         } else
776                 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
777                                        rcu_dereference(iter->sptep));
778
781         if (!prefault)
782                 vcpu->stat.pf_fixed++;
783
784         return ret;
785 }
786
787 /*
788  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
789  * page tables and SPTEs to translate the faulting guest physical address.
790  */
791 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
792                     int map_writable, int max_level, kvm_pfn_t pfn,
793                     bool prefault)
794 {
795         bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
796         bool write = error_code & PFERR_WRITE_MASK;
797         bool exec = error_code & PFERR_FETCH_MASK;
798         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
799         struct kvm_mmu *mmu = vcpu->arch.mmu;
800         struct tdp_iter iter;
801         struct kvm_mmu_page *sp;
802         u64 *child_pt;
803         u64 new_spte;
804         int ret;
805         gfn_t gfn = gpa >> PAGE_SHIFT;
806         int level;
807         int req_level;
808
809         if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
810                 return RET_PF_RETRY;
811         if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
812                 return RET_PF_RETRY;
813
814         level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
815                                         huge_page_disallowed, &req_level);
816
817         trace_kvm_mmu_spte_requested(gpa, level, pfn);
818
819         rcu_read_lock();
820
821         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
822                 if (nx_huge_page_workaround_enabled)
823                         disallowed_hugepage_adjust(iter.old_spte, gfn,
824                                                    iter.level, &pfn, &level);
825
826                 if (iter.level == level)
827                         break;
828
829                 /*
830                  * If there is an SPTE mapping a large page at a higher level
831                  * than the target, that SPTE must be cleared and replaced
832                  * with a non-leaf SPTE.
833                  */
834                 if (is_shadow_present_pte(iter.old_spte) &&
835                     is_large_pte(iter.old_spte)) {
836                         if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
837                                 break;
838
839                         /*
840                          * The iter must explicitly re-read the spte here
841                          * because the new value informs the !present
842                          * path below.
843                          */
844                         iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
845                 }
846
847                 if (!is_shadow_present_pte(iter.old_spte)) {
848                         sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
849                         child_pt = sp->spt;
850
851                         new_spte = make_nonleaf_spte(child_pt,
852                                                      !shadow_accessed_mask);
853
854                         if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
855                                                     new_spte)) {
856                                 tdp_mmu_link_page(vcpu->kvm, sp, true,
857                                                   huge_page_disallowed &&
858                                                   req_level >= iter.level);
859
860                                 trace_kvm_mmu_get_page(sp, true);
861                         } else {
862                                 tdp_mmu_free_sp(sp);
863                                 break;
864                         }
865                 }
866         }
867
868         if (iter.level != level) {
869                 rcu_read_unlock();
870                 return RET_PF_RETRY;
871         }
872
873         ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
874                                               pfn, prefault);
875         rcu_read_unlock();
876
877         return ret;
878 }
879
880 static __always_inline int
881 kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
882                              unsigned long start,
883                              unsigned long end,
884                              unsigned long data,
885                              int (*handler)(struct kvm *kvm,
886                                             struct kvm_memory_slot *slot,
887                                             struct kvm_mmu_page *root,
888                                             gfn_t start,
889                                             gfn_t end,
890                                             unsigned long data))
891 {
892         struct kvm_memslots *slots;
893         struct kvm_memory_slot *memslot;
894         struct kvm_mmu_page *root;
895         int ret = 0;
896         int as_id;
897
898         for_each_tdp_mmu_root_yield_safe(kvm, root) {
899                 as_id = kvm_mmu_page_as_id(root);
900                 slots = __kvm_memslots(kvm, as_id);
901                 kvm_for_each_memslot(memslot, slots) {
902                         unsigned long hva_start, hva_end;
903                         gfn_t gfn_start, gfn_end;
904
905                         hva_start = max(start, memslot->userspace_addr);
906                         hva_end = min(end, memslot->userspace_addr +
907                                       (memslot->npages << PAGE_SHIFT));
908                         if (hva_start >= hva_end)
909                                 continue;
910                         /*
911                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
912                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
913                          */
914                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
915                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
916
917                         ret |= handler(kvm, memslot, root, gfn_start,
918                                        gfn_end, data);
919                 }
920         }
921
922         return ret;
923 }
924
925 static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
926                                      struct kvm_memory_slot *slot,
927                                      struct kvm_mmu_page *root, gfn_t start,
928                                      gfn_t end, unsigned long unused)
929 {
930         return zap_gfn_range(kvm, root, start, end, false);
931 }
932
933 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
934                               unsigned long end)
935 {
936         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
937                                             zap_gfn_range_hva_wrapper);
938 }
939
940 /*
941  * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and
942  * return non-zero if any of the GFNs in the range have been accessed.
943  */
944 static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
945                          struct kvm_mmu_page *root, gfn_t start, gfn_t end,
946                          unsigned long unused)
947 {
948         struct tdp_iter iter;
949         int young = 0;
950         u64 new_spte = 0;
951
952         rcu_read_lock();
953
954         tdp_root_for_each_leaf_pte(iter, root, start, end) {
955                 /*
956                  * If we have a non-accessed entry we don't need to change the
957                  * pte.
958                  */
959                 if (!is_accessed_spte(iter.old_spte))
960                         continue;
961
962                 new_spte = iter.old_spte;
963
964                 if (spte_ad_enabled(new_spte)) {
965                         clear_bit((ffs(shadow_accessed_mask) - 1),
966                                   (unsigned long *)&new_spte);
967                 } else {
968                         /*
969                          * Capture the dirty status of the page, so that it doesn't get
970                          * lost when the SPTE is marked for access tracking.
971                          */
972                         if (is_writable_pte(new_spte))
973                                 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
974
975                         new_spte = mark_spte_for_access_track(new_spte);
976                 }
977                 new_spte &= ~shadow_dirty_mask;
978
979                 tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
980                 young = 1;
981
982                 trace_kvm_age_page(iter.gfn, iter.level, slot, young);
983         }
984
985         rcu_read_unlock();
986
987         return young;
988 }
989
990 int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
991                               unsigned long end)
992 {
993         return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
994                                             age_gfn_range);
995 }
996
997 static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
998                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
999                         unsigned long unused2)
1000 {
1001         struct tdp_iter iter;
1002
1003         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
1004                 if (is_accessed_spte(iter.old_spte))
1005                         return 1;
1006
1007         return 0;
1008 }
1009
1010 int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
1011 {
1012         return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
1013                                             test_age_gfn);
1014 }
1015
1016 /*
1017  * Handle the changed_pte MMU notifier for the TDP MMU.
1018  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1019  * notifier.
1020  * Returns non-zero if a flush is needed before releasing the MMU lock.
1021  */
1022 static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
1023                         struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
1024                         unsigned long data)
1025 {
1026         struct tdp_iter iter;
1027         pte_t *ptep = (pte_t *)data;
1028         kvm_pfn_t new_pfn;
1029         u64 new_spte;
1030         int need_flush = 0;
1031
1032         rcu_read_lock();
1033
1034         WARN_ON(pte_huge(*ptep));
1035
1036         new_pfn = pte_pfn(*ptep);
1037
1038         tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
1039                 if (iter.level != PG_LEVEL_4K)
1040                         continue;
1041
1042                 if (!is_shadow_present_pte(iter.old_spte))
1043                         break;
1044
1045                 /*
1046                  * Note, when changing a read-only SPTE, it's not strictly
1047                  * necessary to zero the SPTE before setting the new PFN, but
1048                  * doing so preserves the invariant that the PFN of a present
1049                  * leaf SPTE can never change.  See __handle_changed_spte().
1050                  */
1051                 tdp_mmu_set_spte(kvm, &iter, 0);
1052
1053                 if (!pte_write(*ptep)) {
1054                         new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1055                                         iter.old_spte, new_pfn);
1056
1057                         tdp_mmu_set_spte(kvm, &iter, new_spte);
1058                 }
1059
1060                 need_flush = 1;
1061         }
1062
1063         if (need_flush)
1064                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1065
1066         rcu_read_unlock();
1067
1068         return 0;
1069 }
1070
1071 int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
1072                              pte_t *host_ptep)
1073 {
1074         return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
1075                                             (unsigned long)host_ptep,
1076                                             set_tdp_spte);
1077 }
1078
1079 /*
1080  * Remove write access from all the SPTEs mapping GFNs [start, end). Only
1081  * leaf SPTEs at or above min_level will be write-protected.
1082  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1083  */
1084 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1085                              gfn_t start, gfn_t end, int min_level)
1086 {
1087         struct tdp_iter iter;
1088         u64 new_spte;
1089         bool spte_set = false;
1090
1091         rcu_read_lock();
1092
1093         BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1094
1095         for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1096                                    min_level, start, end) {
1097                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1098                         continue;
1099
1100                 if (!is_shadow_present_pte(iter.old_spte) ||
1101                     !is_last_spte(iter.old_spte, iter.level) ||
1102                     !(iter.old_spte & PT_WRITABLE_MASK))
1103                         continue;
1104
1105                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1106
1107                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1108                 spte_set = true;
1109         }
1110
1111         rcu_read_unlock();
1112         return spte_set;
1113 }
1114
1115 /*
1116  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1117  * only affect leaf SPTEs down to min_level.
1118  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1119  */
1120 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
1121                              int min_level)
1122 {
1123         struct kvm_mmu_page *root;
1124         int root_as_id;
1125         bool spte_set = false;
1126
1127         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1128                 root_as_id = kvm_mmu_page_as_id(root);
1129                 if (root_as_id != slot->as_id)
1130                         continue;
1131
1132                 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1133                              slot->base_gfn + slot->npages, min_level);
1134         }
1135
1136         return spte_set;
1137 }
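
/*
 * Illustrative sketch (not part of the original file): write-protecting an
 * entire memslot down to 4k granularity, with mmu_lock held for write, looks
 * roughly like:
 *
 *	if (kvm_tdp_mmu_wrprot_slot(kvm, slot, PG_LEVEL_4K))
 *		kvm_flush_remote_tlbs(kvm);
 */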
1138
1139 /*
1140  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1141  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1142  * If AD bits are not enabled, this will require clearing the writable bit on
1143  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1144  * be flushed.
1145  */
1146 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1147                            gfn_t start, gfn_t end)
1148 {
1149         struct tdp_iter iter;
1150         u64 new_spte;
1151         bool spte_set = false;
1152
1153         rcu_read_lock();
1154
1155         tdp_root_for_each_leaf_pte(iter, root, start, end) {
1156                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
1157                         continue;
1158
1159                 if (spte_ad_need_write_protect(iter.old_spte)) {
1160                         if (is_writable_pte(iter.old_spte))
1161                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1162                         else
1163                                 continue;
1164                 } else {
1165                         if (iter.old_spte & shadow_dirty_mask)
1166                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1167                         else
1168                                 continue;
1169                 }
1170
1171                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1172                 spte_set = true;
1173         }
1174
1175         rcu_read_unlock();
1176         return spte_set;
1177 }
1178
1179 /*
1180  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1181  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1182  * If AD bits are not enabled, this will require clearing the writable bit on
1183  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1184  * be flushed.
1185  */
1186 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
1187 {
1188         struct kvm_mmu_page *root;
1189         int root_as_id;
1190         bool spte_set = false;
1191
1192         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1193                 root_as_id = kvm_mmu_page_as_id(root);
1194                 if (root_as_id != slot->as_id)
1195                         continue;
1196
1197                 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1198                                 slot->base_gfn + slot->npages);
1199         }
1200
1201         return spte_set;
1202 }
1203
1204 /*
1205  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1206  * set in mask, starting at gfn. The given memslot is expected to contain all
1207  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1208  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1209  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1210  */
1211 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1212                                   gfn_t gfn, unsigned long mask, bool wrprot)
1213 {
1214         struct tdp_iter iter;
1215         u64 new_spte;
1216
1217         rcu_read_lock();
1218
1219         tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1220                                     gfn + BITS_PER_LONG) {
1221                 if (!mask)
1222                         break;
1223
1224                 if (iter.level > PG_LEVEL_4K ||
1225                     !(mask & (1UL << (iter.gfn - gfn))))
1226                         continue;
1227
1228                 mask &= ~(1UL << (iter.gfn - gfn));
1229
1230                 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1231                         if (is_writable_pte(iter.old_spte))
1232                                 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1233                         else
1234                                 continue;
1235                 } else {
1236                         if (iter.old_spte & shadow_dirty_mask)
1237                                 new_spte = iter.old_spte & ~shadow_dirty_mask;
1238                         else
1239                                 continue;
1240                 }
1241
1242                 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1243         }
1244
1245         rcu_read_unlock();
1246 }
1247
1248 /*
1249  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1250  * set in mask, starting at gfn. The given memslot is expected to contain all
1251  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1252  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1253  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1254  */
1255 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1256                                        struct kvm_memory_slot *slot,
1257                                        gfn_t gfn, unsigned long mask,
1258                                        bool wrprot)
1259 {
1260         struct kvm_mmu_page *root;
1261         int root_as_id;
1262
1263         lockdep_assert_held_write(&kvm->mmu_lock);
1264         for_each_tdp_mmu_root(kvm, root) {
1265                 root_as_id = kvm_mmu_page_as_id(root);
1266                 if (root_as_id != slot->as_id)
1267                         continue;
1268
1269                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1270         }
1271 }
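
/*
 * Illustrative sketch (not part of the original file): bit N of @mask
 * corresponds to @gfn + N, so clearing the dirty state of the first and third
 * pages of a 64-page window (mask bits 0 and 2) would be:
 *
 *	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, base_gfn, 0x5, wrprot);
 *
 * where base_gfn and wrprot are caller-provided.
 */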
1272
1273 /*
1274  * Clear leaf entries which could be replaced by large mappings, for
1275  * GFNs within the slot.
1276  */
1277 static void zap_collapsible_spte_range(struct kvm *kvm,
1278                                        struct kvm_mmu_page *root,
1279                                        struct kvm_memory_slot *slot)
1280 {
1281         gfn_t start = slot->base_gfn;
1282         gfn_t end = start + slot->npages;
1283         struct tdp_iter iter;
1284         kvm_pfn_t pfn;
1285         bool spte_set = false;
1286
1287         rcu_read_lock();
1288
1289         tdp_root_for_each_pte(iter, root, start, end) {
1290                 if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
1291                         spte_set = false;
1292                         continue;
1293                 }
1294
1295                 if (!is_shadow_present_pte(iter.old_spte) ||
1296                     !is_last_spte(iter.old_spte, iter.level))
1297                         continue;
1298
1299                 pfn = spte_to_pfn(iter.old_spte);
1300                 if (kvm_is_reserved_pfn(pfn) ||
1301                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1302                                                             pfn, PG_LEVEL_NUM))
1303                         continue;
1304
1305                 tdp_mmu_set_spte(kvm, &iter, 0);
1306
1307                 spte_set = true;
1308         }
1309
1310         rcu_read_unlock();
1311         if (spte_set)
1312                 kvm_flush_remote_tlbs(kvm);
1313 }
1314
1315 /*
1316  * Clear leaf entries which could be replaced by large mappings, for
1317  * GFNs within the slot.
1318  */
1319 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1320                                        struct kvm_memory_slot *slot)
1321 {
1322         struct kvm_mmu_page *root;
1323         int root_as_id;
1324
1325         for_each_tdp_mmu_root_yield_safe(kvm, root) {
1326                 root_as_id = kvm_mmu_page_as_id(root);
1327                 if (root_as_id != slot->as_id)
1328                         continue;
1329
1330                 zap_collapsible_spte_range(kvm, root, slot);
1331         }
1332 }
1333
1334 /*
1335  * Removes write access on the last level SPTE mapping this GFN and unsets the
1336  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1337  * Returns true if an SPTE was set and a TLB flush is needed.
1338  */
1339 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1340                               gfn_t gfn)
1341 {
1342         struct tdp_iter iter;
1343         u64 new_spte;
1344         bool spte_set = false;
1345
1346         rcu_read_lock();
1347
1348         tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
1349                 if (!is_writable_pte(iter.old_spte))
1350                         break;
1351
1352                 new_spte = iter.old_spte &
1353                         ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
1354
1355                 tdp_mmu_set_spte(kvm, &iter, new_spte);
1356                 spte_set = true;
1357         }
1358
1359         rcu_read_unlock();
1360
1361         return spte_set;
1362 }
1363
1364 /*
1365  * Removes write access on the last level SPTE mapping this GFN and unsets the
1366  * SPTE_MMU_WRITEABLE bit to ensure future writes continue to be intercepted.
1367  * Returns true if an SPTE was set and a TLB flush is needed.
1368  */
1369 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1370                                    struct kvm_memory_slot *slot, gfn_t gfn)
1371 {
1372         struct kvm_mmu_page *root;
1373         int root_as_id;
1374         bool spte_set = false;
1375
1376         lockdep_assert_held_write(&kvm->mmu_lock);
1377         for_each_tdp_mmu_root(kvm, root) {
1378                 root_as_id = kvm_mmu_page_as_id(root);
1379                 if (root_as_id != slot->as_id)
1380                         continue;
1381
1382                 spte_set |= write_protect_gfn(kvm, root, gfn);
1383         }
1384         return spte_set;
1385 }
1386
1387 /*
1388  * Return the level of the lowest level SPTE added to sptes.
1389  * That SPTE may be non-present.
1390  */
1391 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1392                          int *root_level)
1393 {
1394         struct tdp_iter iter;
1395         struct kvm_mmu *mmu = vcpu->arch.mmu;
1396         gfn_t gfn = addr >> PAGE_SHIFT;
1397         int leaf = -1;
1398
1399         *root_level = vcpu->arch.mmu->shadow_root_level;
1400
1401         rcu_read_lock();
1402
1403         tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1404                 leaf = iter.level;
1405                 sptes[leaf] = iter.old_spte;
1406         }
1407
1408         rcu_read_unlock();
1409
1410         return leaf;
1411 }
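
/*
 * Illustrative sketch (not part of the original file): sptes[] is indexed by
 * raw level, so a caller interested in the final translation might do:
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, leaf;
 *
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	if (leaf >= PG_LEVEL_4K)
 *		pr_debug("leaf SPTE %llx at level %d\n", sptes[leaf], leaf);
 */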