1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5 #include "mmu_internal.h"
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
17 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
21 /* Arbitrarily returns true so that this may be used in if statements. */
22 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
26 lockdep_assert_held_read(&kvm->mmu_lock);
28 lockdep_assert_held_write(&kvm->mmu_lock);
33 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
36 * Invalidate all roots, which, besides the obvious, schedules all roots
37 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38 * ultimately frees all roots.
40 kvm_tdp_mmu_invalidate_all_roots(kvm);
41 kvm_tdp_mmu_zap_invalidated_roots(kvm);
43 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
44 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
47 * Ensure that all the outstanding RCU callbacks to free shadow pages
48 * can run before the VM is torn down. Putting the last reference to
49 * zapped roots will create new callbacks.
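 *
 * The callbacks in question are the tdp_mmu_free_sp_rcu_callback() instances
 * queued via call_rcu() when the last reference to a zapped root is put (see
 * kvm_tdp_mmu_put_root() below), so teardown waits for the outstanding
 * callbacks to finish, e.g. via an RCU barrier, before freeing VM state they
 * may still touch.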
54 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
56 free_page((unsigned long)sp->spt);
57 kmem_cache_free(mmu_page_header_cache, sp);
61 * This is called through call_rcu in order to free TDP page table memory
62 * safely with respect to other kernel threads that may be operating on
64 * By only accessing TDP MMU page table memory in an RCU read critical
65 * section, and freeing it after a grace period, lockless access to that
66 * memory won't use it after it is freed.
68 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
70 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
76 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
79 * Either read or write is okay, but mmu_lock must be held because
80 * writers are not required to take tdp_mmu_pages_lock.
82 lockdep_assert_held(&kvm->mmu_lock);
84 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 * The TDP MMU itself holds a reference to each root until the root is
89 * explicitly invalidated, i.e. the final reference should never be
90 * put for a valid root.
92 KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
94 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
95 list_del_rcu(&root->link);
96 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
97 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
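/*
 * Note the pairing: list_del_rcu() unlinks the root while lockless readers,
 * e.g. tdp_mmu_next_root(), may still be walking tdp_mmu_roots, and call_rcu()
 * defers freeing the root's shadow page until those readers are guaranteed to
 * have finished.
 */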
101 * Returns the next root after @prev_root (or the first root if @prev_root is
102 * NULL). A reference to the returned root is acquired, and the reference to
103 * @prev_root is released (the caller obviously must hold a reference to
104 * @prev_root if it's non-NULL).
106 * If @only_valid is true, invalid roots are skipped.
108 * Returns NULL if the end of tdp_mmu_roots was reached.
110 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
111 struct kvm_mmu_page *prev_root,
114 struct kvm_mmu_page *next_root;
117 * While the roots themselves are RCU-protected, fields such as
118 * role.invalid are protected by mmu_lock.
120 lockdep_assert_held(&kvm->mmu_lock);
125 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
127 typeof(*prev_root), link);
129 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
130 typeof(*next_root), link);
133 if ((!only_valid || !next_root->role.invalid) &&
134 kvm_tdp_mmu_get_root(next_root))
137 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
138 &next_root->link, typeof(*next_root), link);
144 kvm_tdp_mmu_put_root(kvm, prev_root);
150 * Note: this iterator gets and puts references to the roots it iterates over.
151 * This makes it safe to release the MMU lock and yield within the loop, but
152 * if exiting the loop early, the caller must drop the reference to the most
153 * recent root. (Unless keeping a live reference is desirable.)
155 * If shared is set, this function is operating under the MMU lock in read mode.
158 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
159 for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
161 _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
162 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
163 kvm_mmu_page_as_id(_root) != _as_id) { \
166 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
167 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
169 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
170 for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
172 _root = tdp_mmu_next_root(_kvm, _root, false)) \
173 if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
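/*
 * Typical usage of the yield-safe iterators, mirroring kvm_tdp_mmu_zap_leafs()
 * below (illustrative only):
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, false)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 * The lockdep helper arbitrarily returns true so it can be folded into the
 * if-condition; roots that fail the filter (e.g. a different address space in
 * the _as_id variants) fall through to an empty branch and the caller's loop
 * body is skipped for them.
 */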
177 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
178 * the implication being that any flow that holds mmu_lock for read is
179 * inherently yield-friendly and should use the yield-safe variant above.
180 * Holding mmu_lock for write obviates the need for RCU protection as the list
181 * is guaranteed to be stable.
183 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
184 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
185 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
186 kvm_mmu_page_as_id(_root) != _as_id) { \
189 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
191 struct kvm_mmu_page *sp;
193 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
194 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
199 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
200 gfn_t gfn, union kvm_mmu_page_role role)
202 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
204 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
209 sp->tdp_mmu_page = true;
211 trace_kvm_mmu_get_page(sp, true);
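/*
 * Stashing the kvm_mmu_page pointer in the spt page's private data (see
 * set_page_private() above) is what lets sptep_to_sp() recover the owning
 * shadow page from a raw SPTE pointer, including during lockless walks.
 */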
214 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
215 struct tdp_iter *iter)
217 struct kvm_mmu_page *parent_sp;
218 union kvm_mmu_page_role role;
220 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
222 role = parent_sp->role;
225 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
228 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
230 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
231 struct kvm *kvm = vcpu->kvm;
232 struct kvm_mmu_page *root;
234 lockdep_assert_held_write(&kvm->mmu_lock);
237 * Check for an existing root before allocating a new one. Note, the
238 * role check prevents consuming an invalid root.
240 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
241 if (root->role.word == role.word &&
242 kvm_tdp_mmu_get_root(root))
246 root = tdp_mmu_alloc_sp(vcpu);
247 tdp_mmu_init_sp(root, NULL, 0, role);
250 * TDP MMU roots are kept until they are explicitly invalidated, either
251 * by a memslot update or by the destruction of the VM. Initialize the
252 * refcount to two; one reference for the vCPU, and one reference for
253 * the TDP MMU itself, which is held until the root is invalidated and
254 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
256 refcount_set(&root->tdp_mmu_root_count, 2);
258 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
259 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
260 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
263 return __pa(root->spt);
266 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
267 u64 old_spte, u64 new_spte, int level,
270 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
272 kvm_account_pgtable_pages((void *)sp->spt, +1);
273 atomic64_inc(&kvm->arch.tdp_mmu_pages);
276 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
278 kvm_account_pgtable_pages((void *)sp->spt, -1);
279 atomic64_dec(&kvm->arch.tdp_mmu_pages);
283 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
286 * @sp: the page to be removed
287 * @shared: This operation may not be running under the exclusive use of
288 * the MMU lock and the operation must synchronize with other
289 * threads that might be adding or removing pages.
291 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
294 tdp_unaccount_mmu_page(kvm, sp);
296 if (!sp->nx_huge_page_disallowed)
300 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
302 lockdep_assert_held_write(&kvm->mmu_lock);
304 sp->nx_huge_page_disallowed = false;
305 untrack_possible_nx_huge_page(kvm, sp);
308 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
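/*
 * When mmu_lock is held only for read (@shared), the per-VM
 * tdp_mmu_pages_lock taken above is what serializes updates to the possible
 * NX huge page list; when mmu_lock is held for write the lockdep assertion
 * suffices and the spinlock is not taken.
 */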
312 * handle_removed_pt() - handle a page table removed from the TDP structure
315 * @pt: the page removed from the paging structure
316 * @shared: This operation may not be running under the exclusive use
317 * of the MMU lock and the operation must synchronize with other
318 * threads that might be modifying SPTEs.
320 * Given a page table that has been removed from the TDP paging structure,
321 * iterates through the page table to clear SPTEs and free child page tables.
323 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
324 * protection. Since this thread removed it from the paging structure,
325 * this thread will be responsible for ensuring the page is freed. Hence the
326 * early rcu_dereferences in the function.
328 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
330 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
331 int level = sp->role.level;
332 gfn_t base_gfn = sp->gfn;
335 trace_kvm_mmu_prepare_zap_page(sp);
337 tdp_mmu_unlink_sp(kvm, sp, shared);
339 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
340 tdp_ptep_t sptep = pt + i;
341 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
346 * Set the SPTE to a nonpresent value that other
347 * threads will not overwrite. If the SPTE was
348 * already marked as removed then another thread
349 * handling a page fault could overwrite it, so
350 * retry until this thread's write transitions the SPTE from
351 * some other value to the removed SPTE value.
354 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
355 if (!is_removed_spte(old_spte))
361 * If the SPTE is not MMU-present, there is no backing
362 * page associated with the SPTE and so no side effects
363 * that need to be recorded, and exclusive ownership of
364 * mmu_lock ensures the SPTE can't be made present.
365 * Note, zapping MMIO SPTEs is also unnecessary as they
366 * are guarded by the memslots generation, not by being unreachable.
369 old_spte = kvm_tdp_mmu_read_spte(sptep);
370 if (!is_shadow_present_pte(old_spte))
374 * Use the common helper instead of a raw WRITE_ONCE as
375 * the SPTE needs to be updated atomically if it can be
376 * modified by a different vCPU outside of mmu_lock.
377 * Even though the parent SPTE is !PRESENT, the TLB
378 * hasn't yet been flushed, and both Intel and AMD
379 * document that A/D assists can use upper-level PxE
380 * entries that are cached in the TLB, i.e. the CPU can
381 * still access the page and mark it dirty.
383 * No retry is needed in the atomic update path as the
384 * sole concern is dropping a Dirty bit, i.e. no other
385 * task can zap/remove the SPTE as mmu_lock is held for
386 * write. Marking the SPTE as a removed SPTE is not
387 * strictly necessary for the same reason, but using
388 * the removed SPTE value keeps the shared/exclusive
389 * paths consistent and allows the handle_changed_spte()
390 * call below to hardcode the new value to REMOVED_SPTE.
392 * Note, even though dropping a Dirty bit is the only
393 * scenario where a non-atomic update could result in a
394 * functional bug, simply checking the Dirty bit isn't
395 * sufficient as a fast page fault could read the upper
396 * level SPTE before it is zapped, and then make this
397 * target SPTE writable, resume the guest, and set the
398 * Dirty bit between reading the SPTE above and writing it here.
401 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
402 REMOVED_SPTE, level);
404 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
405 old_spte, REMOVED_SPTE, level, shared);
408 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
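/*
 * Summary of the above: each child SPTE is frozen with REMOVED_SPTE (via the
 * atomic helper in the shared case, kvm_tdp_mmu_write_spte() otherwise), its
 * side effects are processed by handle_changed_spte(), and only then is the
 * page table page handed to RCU for freeing so lockless walkers stay safe.
 */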
412 * handle_changed_spte - handle bookkeeping associated with an SPTE change
414 * @as_id: the address space of the paging structure the SPTE was a part of
415 * @gfn: the base GFN that was mapped by the SPTE
416 * @old_spte: The value of the SPTE before the change
417 * @new_spte: The value of the SPTE after the change
418 * @level: the level of the PT the SPTE is part of in the paging structure
419 * @shared: This operation may not be running under the exclusive use of
420 * the MMU lock and the operation must synchronize with other
421 * threads that might be modifying SPTEs.
423 * Handle bookkeeping that might result from the modification of a SPTE. Note,
424 * dirty logging updates are handled in common code, not here (see make_spte()
425 * and fast_pf_fix_direct_spte()).
427 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
428 u64 old_spte, u64 new_spte, int level,
431 bool was_present = is_shadow_present_pte(old_spte);
432 bool is_present = is_shadow_present_pte(new_spte);
433 bool was_leaf = was_present && is_last_spte(old_spte, level);
434 bool is_leaf = is_present && is_last_spte(new_spte, level);
435 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
437 WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
438 WARN_ON_ONCE(level < PG_LEVEL_4K);
439 WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
442 * If this warning were to trigger it would indicate that there was a
443 * missing MMU notifier or a race with some notifier handler.
444 * A present, leaf SPTE should never be directly replaced with another
445 * present leaf SPTE pointing to a different PFN. A notifier handler
446 * should be zapping the SPTE before the main MM's page table is
447 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
448 * thread before replacement.
450 if (was_leaf && is_leaf && pfn_changed) {
451 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
452 "SPTE with another present leaf SPTE mapping a\n"
454 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
455 as_id, gfn, old_spte, new_spte, level);
458 * Crash the host to prevent error propagation and guest data corruption.
464 if (old_spte == new_spte)
467 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
470 check_spte_writable_invariants(new_spte);
473 * The only time a SPTE should be changed from a non-present to a
474 * non-present state is when an MMIO entry is installed/modified/
475 * removed. In that case, there is nothing to do here.
477 if (!was_present && !is_present) {
479 * If this change does not involve a MMIO SPTE or removed SPTE,
480 * it is unexpected. Log the change, though it should not
481 * impact the guest since both the former and current SPTEs are nonpresent.
484 if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
485 !is_mmio_spte(new_spte) &&
486 !is_removed_spte(new_spte)))
487 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
488 "should not be replaced with another,\n"
489 "different nonpresent SPTE, unless one or both\n"
490 "are MMIO SPTEs, or the new SPTE is\n"
491 "a temporary removed SPTE.\n"
492 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
493 as_id, gfn, old_spte, new_spte, level);
497 if (is_leaf != was_leaf)
498 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
500 if (was_leaf && is_dirty_spte(old_spte) &&
501 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
502 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
505 * Recursively handle child PTs if the change removed a subtree from
506 * the paging structure. Note the WARN on the PFN changing without the
507 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
508 * pages are kernel allocations and should never be migrated.
510 if (was_present && !was_leaf &&
511 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
512 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
514 if (was_leaf && is_accessed_spte(old_spte) &&
515 (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
516 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
520 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
521 * and handle the associated bookkeeping. Do not mark the page dirty
522 * in KVM's dirty bitmaps.
524 * If setting the SPTE fails because it has changed, iter->old_spte will be
525 * refreshed to the current value of the spte.
528 * @iter: a tdp_iter instance currently on the SPTE that should be set
529 * @new_spte: The value the SPTE should be set to
531 * * 0 - If the SPTE was set.
532 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
533 * no side-effects other than setting iter->old_spte to the last
534 * known value of the spte.
536 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
537 struct tdp_iter *iter,
540 u64 *sptep = rcu_dereference(iter->sptep);
543 * The caller is responsible for ensuring the old SPTE is not a REMOVED
544 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
545 * and pre-checking before inserting a new SPTE is advantageous as it
546 * avoids unnecessary work.
548 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
550 lockdep_assert_held_read(&kvm->mmu_lock);
553 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
554 * does not hold the mmu_lock. On failure, i.e. if a different logical
555 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
556 * the current value, so the caller operates on fresh data, e.g. if it
557 * retries tdp_mmu_set_spte_atomic().
559 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
562 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
563 new_spte, iter->level, true);
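/*
 * Callers typically retry on -EBUSY, e.g. (illustrative sketch mirroring the
 * write-protection and dirty-clearing loops later in this file):
 *
 *	if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *		goto retry;
 *
 * iter->old_spte was refreshed by the failed cmpxchg, so the retry can
 * re-evaluate the SPTE without re-reading it.
 */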
568 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
569 struct tdp_iter *iter)
574 * Freeze the SPTE by setting it to a special,
575 * non-present value. This will stop other threads from
576 * immediately installing a present entry in its place
577 * before the TLBs are flushed.
579 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
583 kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
586 * No other thread can overwrite the removed SPTE as they must either
587 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
588 * overwrite the special removed SPTE value. No bookkeeping is needed
589 * here since the SPTE is going from non-present to non-present. Use
590 * the raw write helper to avoid an unnecessary check on volatile bits.
592 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
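/*
 * Net effect: old_spte -> REMOVED_SPTE -> 0, with a remote TLB flush in
 * between, so no vCPU can install a new present entry while stale
 * translations for the old one might still be cached.
 */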
599 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
601 * @as_id: Address space ID, i.e. regular vs. SMM
602 * @sptep: Pointer to the SPTE
603 * @old_spte: The current value of the SPTE
604 * @new_spte: The new value that will be set for the SPTE
605 * @gfn: The base GFN that was (or will be) mapped by the SPTE
606 * @level: The level _containing_ the SPTE (its parent PT's level)
608 * Returns the old SPTE value, which _may_ be different than @old_spte if the
609 * SPTE had volatile bits.
611 static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
612 u64 old_spte, u64 new_spte, gfn_t gfn, int level)
614 lockdep_assert_held_write(&kvm->mmu_lock);
617 * No thread should be using this function to set SPTEs to or from the
618 * temporary removed SPTE value.
619 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
620 * should be used. If operating under the MMU lock in write mode, the
621 * use of the removed SPTE should not be necessary.
623 WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
625 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
627 handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
631 static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
634 WARN_ON_ONCE(iter->yielded);
635 iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
636 iter->old_spte, new_spte,
637 iter->gfn, iter->level);
640 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
641 for_each_tdp_pte(_iter, _root, _start, _end)
643 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
644 tdp_root_for_each_pte(_iter, _root, _start, _end) \
645 if (!is_shadow_present_pte(_iter.old_spte) || \
646 !is_last_spte(_iter.old_spte, _iter.level)) \
650 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
651 for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
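/*
 * Unlike the root-list iterators above, tdp_mmu_for_each_pte() starts from the
 * vCPU's current root (_mmu->root.hpa) and walks only the SPTEs covering
 * [_start, _end); it is used by the page fault handler and by the lockless
 * walkers at the bottom of this file.
 */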
654 * Yield if the MMU lock is contended or this thread needs to return control
657 * If this function should yield and flush is set, it will perform a remote
658 * TLB flush before yielding.
660 * If this function yields, iter->yielded is set and the caller must skip to
661 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
662 * over the paging structures to allow the iterator to continue its traversal
663 * from the paging structure root.
665 * Returns true if this function yielded.
667 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
668 struct tdp_iter *iter,
669 bool flush, bool shared)
671 WARN_ON_ONCE(iter->yielded);
673 /* Ensure forward progress has been made before yielding. */
674 if (iter->next_last_level_gfn == iter->yielded_gfn)
677 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
679 kvm_flush_remote_tlbs(kvm);
684 cond_resched_rwlock_read(&kvm->mmu_lock);
686 cond_resched_rwlock_write(&kvm->mmu_lock);
690 WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
692 iter->yielded = true;
695 return iter->yielded;
698 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
701 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
702 * a gpa range that would exceed the max gfn, and KVM does not create
703 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
704 * the slow emulation path every time.
706 return kvm_mmu_max_gfn() + 1;
709 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
710 bool shared, int zap_level)
712 struct tdp_iter iter;
714 gfn_t end = tdp_mmu_max_gfn_exclusive();
717 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
719 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
722 if (!is_shadow_present_pte(iter.old_spte))
725 if (iter.level > zap_level)
729 tdp_mmu_iter_set_spte(kvm, &iter, 0);
730 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
735 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
740 * The root must have an elevated refcount so that it's reachable via
741 * mmu_notifier callbacks, which allows this path to yield and drop
742 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
743 * must drop all references to relevant pages prior to completing the
744 * callback. Dropping mmu_lock with an unreachable root would result
745 * in zapping SPTEs after a relevant mmu_notifier callback completes
746 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
747 * dirty accessed bits to the SPTE's associated struct page.
749 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
751 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
756 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
757 * split the zap into two passes. On the first pass, zap at the 1gb
758 * level, and then zap top-level SPs on the second pass. "1gb" is not
759 * arbitrary, as KVM must be able to zap a 1gb shadow page without
760 * inducing a stall to allow in-place replacement with a 1gb hugepage.
762 * Because zapping a SP recurses on its children, stepping down to
763 * PG_LEVEL_4K in the iterator itself is unnecessary.
765 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
766 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
771 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
776 * This helper intentionally doesn't allow zapping a root shadow page,
777 * which doesn't have a parent page table and thus no associated entry.
779 if (WARN_ON_ONCE(!sp->ptep))
782 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
783 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
786 tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
787 sp->gfn, sp->role.level + 1);
793 * If can_yield is true, will release the MMU lock and reschedule if the
794 * scheduler needs the CPU or there is contention on the MMU lock. If this
795 * function cannot yield, it will not release the MMU lock or reschedule and
796 * the caller must ensure it does not supply too large a GFN range, or the
797 * operation can cause a soft lockup.
799 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
800 gfn_t start, gfn_t end, bool can_yield, bool flush)
802 struct tdp_iter iter;
804 end = min(end, tdp_mmu_max_gfn_exclusive());
806 lockdep_assert_held_write(&kvm->mmu_lock);
810 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
812 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
817 if (!is_shadow_present_pte(iter.old_spte) ||
818 !is_last_spte(iter.old_spte, iter.level))
821 tdp_mmu_iter_set_spte(kvm, &iter, 0);
828 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
829 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
835 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
836 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
837 * more SPTEs were zapped since the MMU lock was last acquired.
839 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
841 struct kvm_mmu_page *root;
843 for_each_tdp_mmu_root_yield_safe(kvm, root, false)
844 flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
849 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
851 struct kvm_mmu_page *root;
854 * Zap all roots, including invalid roots, as all SPTEs must be dropped
855 * before returning to the caller. Zap directly even if the root is
856 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
857 * all that expensive and mmu_lock is already held, which means the
858 * worker has yielded, i.e. flushing the work instead of zapping here
859 * isn't guaranteed to be any faster.
861 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
862 * is being destroyed or the userspace VMM has exited. In both cases,
863 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
865 for_each_tdp_mmu_root_yield_safe(kvm, root, false)
866 tdp_mmu_zap_root(kvm, root, false);
870 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast zap" completes.
873 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
875 struct kvm_mmu_page *root;
877 read_lock(&kvm->mmu_lock);
879 for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
880 if (!root->tdp_mmu_scheduled_root_to_zap)
883 root->tdp_mmu_scheduled_root_to_zap = false;
884 KVM_BUG_ON(!root->role.invalid, kvm);
887 * A TLB flush is not necessary as KVM performs a local TLB
888 * flush when allocating a new root (see kvm_mmu_load()), and
889 * when migrating a vCPU to a different pCPU. Note, the local
890 * TLB flush on reuse also invalidates paging-structure-cache
891 * entries, i.e. TLB entries for intermediate paging structures,
892 * that may be zapped, as such entries are associated with the
893 * ASID on both VMX and SVM.
895 tdp_mmu_zap_root(kvm, root, true);
898 * The reference needs to be put *after* zapping the root, as
899 * the root must be reachable by mmu_notifiers while it's being zapped.
902 kvm_tdp_mmu_put_root(kvm, root);
905 read_unlock(&kvm->mmu_lock);
909 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
910 * is about to be zapped, e.g. in response to a memslots update. The actual
911 * zapping is done separately so that it happens with mmu_lock held for read,
912 * whereas invalidating roots must be done with mmu_lock held for write (unless
913 * the VM is being destroyed).
915 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
916 * See kvm_tdp_mmu_get_vcpu_root_hpa().
918 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
920 struct kvm_mmu_page *root;
923 * mmu_lock must be held for write to ensure that a root doesn't become
924 * invalid while there are active readers (invalidating a root while
925 * there are active readers may or may not be problematic in practice,
926 * but it's uncharted territory and not supported).
928 * Waive the assertion if there are no users of @kvm, i.e. the VM is
929 * being destroyed after all references have been put, or if no vCPUs
930 * have been created (which means there are no roots), i.e. the VM is
931 * being destroyed in an error path of KVM_CREATE_VM.
933 if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
934 refcount_read(&kvm->users_count) && kvm->created_vcpus)
935 lockdep_assert_held_write(&kvm->mmu_lock);
938 * As above, mmu_lock isn't held when destroying the VM! There can't
939 * be other references to @kvm, i.e. nothing else can invalidate roots
940 * or get/put references to roots.
942 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
944 * Note, invalid roots can outlive a memslot update! Invalid
945 * roots must be *zapped* before the memslot update completes,
946 * but a different task can acquire a reference and keep the
947 * root alive after it's been zapped.
949 if (!root->role.invalid) {
950 root->tdp_mmu_scheduled_root_to_zap = true;
951 root->role.invalid = true;
957 * Installs a last-level SPTE to handle a TDP page fault.
958 * (NPT/EPT violation/misconfiguration)
960 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
961 struct kvm_page_fault *fault,
962 struct tdp_iter *iter)
964 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
966 int ret = RET_PF_FIXED;
969 if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
972 if (unlikely(!fault->slot))
973 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
975 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
976 fault->pfn, iter->old_spte, fault->prefetch, true,
977 fault->map_writable, &new_spte);
979 if (new_spte == iter->old_spte)
980 ret = RET_PF_SPURIOUS;
981 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
983 else if (is_shadow_present_pte(iter->old_spte) &&
984 !is_last_spte(iter->old_spte, iter->level))
985 kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
988 * If the page fault was caused by a write but the page is write
989 * protected, emulation is needed. If the emulation was skipped,
990 * the vCPU would have the same fault again.
994 ret = RET_PF_EMULATE;
997 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
998 if (unlikely(is_mmio_spte(new_spte))) {
999 vcpu->stat.pf_mmio_spte_created++;
1000 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1002 ret = RET_PF_EMULATE;
1004 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1005 rcu_dereference(iter->sptep));
1012 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1013 * provided page table.
1015 * @kvm: kvm instance
1016 * @iter: a tdp_iter instance currently on the SPTE that should be set
1017 * @sp: The new TDP page table to install.
1018 * @shared: This operation is running under the MMU lock in read mode.
1020 * Returns: 0 if the new page table was installed. Non-0 if the page table
1021 * could not be installed (e.g. the atomic compare-exchange failed).
1023 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1024 struct kvm_mmu_page *sp, bool shared)
1026 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1030 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1034 tdp_mmu_iter_set_spte(kvm, iter, spte);
1037 tdp_account_mmu_page(kvm, sp);
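/*
 * When @shared is true the new SPTE is installed with tdp_mmu_set_spte_atomic()
 * and any failure is returned to the caller; with mmu_lock held for write,
 * tdp_mmu_iter_set_spte() is used and the link cannot fail. On success the new
 * page table is accounted before returning.
 */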
1042 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1043 struct kvm_mmu_page *sp, bool shared);
1046 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1047 * page tables and SPTEs to translate the faulting guest physical address.
1049 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1051 struct kvm_mmu *mmu = vcpu->arch.mmu;
1052 struct kvm *kvm = vcpu->kvm;
1053 struct tdp_iter iter;
1054 struct kvm_mmu_page *sp;
1055 int ret = RET_PF_RETRY;
1057 kvm_mmu_hugepage_adjust(vcpu, fault);
1059 trace_kvm_mmu_spte_requested(fault);
1063 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1066 if (fault->nx_huge_page_workaround_enabled)
1067 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1070 * If SPTE has been frozen by another thread, just give up and
1071 * retry, avoiding unnecessary page table allocation and free.
1073 if (is_removed_spte(iter.old_spte))
1076 if (iter.level == fault->goal_level)
1077 goto map_target_level;
1079 /* Step down into the lower level page table if it exists. */
1080 if (is_shadow_present_pte(iter.old_spte) &&
1081 !is_large_pte(iter.old_spte))
1085 * The SPTE is either non-present or points to a huge page that
1086 * needs to be split.
1088 sp = tdp_mmu_alloc_sp(vcpu);
1089 tdp_mmu_init_child_sp(sp, &iter);
1091 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1093 if (is_shadow_present_pte(iter.old_spte))
1094 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1096 r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1099 * Force the guest to retry if installing an upper level SPTE
1100 * failed, e.g. because a different task modified the SPTE.
1103 tdp_mmu_free_sp(sp);
1107 if (fault->huge_page_disallowed &&
1108 fault->req_level >= iter.level) {
1109 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1110 if (sp->nx_huge_page_disallowed)
1111 track_possible_nx_huge_page(kvm, sp);
1112 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1117 * The walk aborted before reaching the target level, e.g. because the
1118 * iterator detected an upper level SPTE was frozen during traversal.
1120 WARN_ON_ONCE(iter.level == fault->goal_level);
1124 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1131 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1134 struct kvm_mmu_page *root;
1136 __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
1137 flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1138 range->may_block, flush);
1143 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1144 struct kvm_gfn_range *range);
1146 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1147 struct kvm_gfn_range *range,
1148 tdp_handler_t handler)
1150 struct kvm_mmu_page *root;
1151 struct tdp_iter iter;
1155 * Don't support rescheduling, none of the MMU notifiers that funnel
1156 * into this helper allow blocking; it'd be dead, wasteful code.
1158 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1161 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1162 ret |= handler(kvm, &iter, range);
1171 * Mark the SPTEs in the range of GFNs [start, end) as unaccessed and return non-zero
1172 * if any of the GFNs in the range have been accessed.
1174 * No need to mark the corresponding PFN as accessed as this call is coming
1175 * from the clear_young() or clear_flush_young() notifier, which uses the
1176 * return value to determine if the page has been accessed.
1178 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1179 struct kvm_gfn_range *range)
1183 /* If we have a non-accessed entry we don't need to change the pte. */
1184 if (!is_accessed_spte(iter->old_spte))
1187 if (spte_ad_enabled(iter->old_spte)) {
1188 iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1190 shadow_accessed_mask,
1192 new_spte = iter->old_spte & ~shadow_accessed_mask;
1195 * Capture the dirty status of the page, so that it doesn't get
1196 * lost when the SPTE is marked for access tracking.
1198 if (is_writable_pte(iter->old_spte))
1199 kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
1201 new_spte = mark_spte_for_access_track(iter->old_spte);
1202 iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1203 iter->old_spte, new_spte,
1207 trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1208 iter->old_spte, new_spte);
1212 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1214 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1217 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1218 struct kvm_gfn_range *range)
1220 return is_accessed_spte(iter->old_spte);
1223 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1225 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1228 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1229 struct kvm_gfn_range *range)
1233 /* Huge pages aren't expected to be modified without first being zapped. */
1234 WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
1236 if (iter->level != PG_LEVEL_4K ||
1237 !is_shadow_present_pte(iter->old_spte))
1241 * Note, when changing a read-only SPTE, it's not strictly necessary to
1242 * zero the SPTE before setting the new PFN, but doing so preserves the
1243 * invariant that the PFN of a present leaf SPTE can never change.
1244 * See handle_changed_spte().
1246 tdp_mmu_iter_set_spte(kvm, iter, 0);
1248 if (!pte_write(range->arg.pte)) {
1249 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1250 pte_pfn(range->arg.pte));
1252 tdp_mmu_iter_set_spte(kvm, iter, new_spte);
1259 * Handle the changed_pte MMU notifier for the TDP MMU.
1260 * data is a pointer to the new pte_t mapping the HVA specified by the MMU notifier.
1262 * Returns non-zero if a flush is needed before releasing the MMU lock.
1264 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1267 * No need to handle the remote TLB flush under RCU protection, the
1268 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1269 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
1271 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1275 * Remove write access from all SPTEs at or above min_level that map GFNs
1276 * [start, end). Returns true if an SPTE has been changed and the TLBs need to be flushed.
1279 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1280 gfn_t start, gfn_t end, int min_level)
1282 struct tdp_iter iter;
1284 bool spte_set = false;
1288 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1290 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1292 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1295 if (!is_shadow_present_pte(iter.old_spte) ||
1296 !is_last_spte(iter.old_spte, iter.level) ||
1297 !(iter.old_spte & PT_WRITABLE_MASK))
1300 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1302 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1313 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1314 * only affect leaf SPTEs down to min_level.
1315 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1317 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1318 const struct kvm_memory_slot *slot, int min_level)
1320 struct kvm_mmu_page *root;
1321 bool spte_set = false;
1323 lockdep_assert_held_read(&kvm->mmu_lock);
1325 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1326 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1327 slot->base_gfn + slot->npages, min_level);
1332 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1334 struct kvm_mmu_page *sp;
1338 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1342 sp->spt = (void *)__get_free_page(gfp);
1344 kmem_cache_free(mmu_page_header_cache, sp);
1351 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1352 struct tdp_iter *iter,
1355 struct kvm_mmu_page *sp;
1358 * Since we are allocating while under the MMU lock we have to be
1359 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1360 * reclaim and to avoid making any filesystem callbacks (which can end
1361 * up invoking KVM MMU notifiers, resulting in a deadlock).
1363 * If this allocation fails we drop the lock and retry with reclaim allowed.
1366 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1373 read_unlock(&kvm->mmu_lock);
1375 write_unlock(&kvm->mmu_lock);
1377 iter->yielded = true;
1378 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1381 read_lock(&kvm->mmu_lock);
1383 write_lock(&kvm->mmu_lock);
1390 /* Note, the caller is responsible for initializing @sp. */
1391 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1392 struct kvm_mmu_page *sp, bool shared)
1394 const u64 huge_spte = iter->old_spte;
1395 const int level = iter->level;
1399 * No need for atomics when writing to sp->spt since the page table has
1400 * not been linked in yet and thus is not reachable from any other CPU.
1402 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1403 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1406 * Replace the huge spte with a pointer to the populated lower level
1407 * page table. Since we are making this change without a TLB flush vCPUs
1408 * will see a mix of the split mappings and the original huge mapping,
1409 * depending on what's currently in their TLB. This is fine from a
1410 * correctness standpoint since the translation will be the same either way.
1413 ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1418 * tdp_mmu_link_sp() will handle subtracting the huge page we
1419 * are overwriting from the page stats. But we have to manually update
1420 * the page stats with the new present child pages.
1422 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1425 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1429 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1430 struct kvm_mmu_page *root,
1431 gfn_t start, gfn_t end,
1432 int target_level, bool shared)
1434 struct kvm_mmu_page *sp = NULL;
1435 struct tdp_iter iter;
1441 * Traverse the page table splitting all huge pages above the target
1442 * level into one lower level. For example, if we encounter a 1GB page
1443 * we split it into 512 2MB pages.
1445 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1446 * to visit an SPTE before ever visiting its children, which means we
1447 * will correctly recursively split huge pages that are more than one
1448 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1449 * and then splitting each of those to 512 4KB pages).
1451 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1453 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1456 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1460 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1463 trace_kvm_mmu_split_huge_page(iter.gfn,
1473 tdp_mmu_init_child_sp(sp, &iter);
1475 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1484 * It's possible to exit the loop having never used the last sp if, for
1485 * example, a vCPU doing HugePage NX splitting wins the race and
1486 * installs its own sp in place of the last sp we tried to split.
1489 tdp_mmu_free_sp(sp);
1496 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1498 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1499 const struct kvm_memory_slot *slot,
1500 gfn_t start, gfn_t end,
1501 int target_level, bool shared)
1503 struct kvm_mmu_page *root;
1506 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1508 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1509 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1511 kvm_tdp_mmu_put_root(kvm, root);
1518 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1519 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1520 * If AD bits are not enabled, this will require clearing the writable bit on
1521 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1524 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1525 gfn_t start, gfn_t end)
1527 u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1528 struct tdp_iter iter;
1529 bool spte_set = false;
1533 tdp_root_for_each_pte(iter, root, start, end) {
1535 if (!is_shadow_present_pte(iter.old_spte) ||
1536 !is_last_spte(iter.old_spte, iter.level))
1539 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1542 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1543 spte_ad_need_write_protect(iter.old_spte));
1545 if (!(iter.old_spte & dbit))
1548 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1559 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1560 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1561 * If AD bits are not enabled, this will require clearing the writable bit on
1562 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to be flushed.
1565 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1566 const struct kvm_memory_slot *slot)
1568 struct kvm_mmu_page *root;
1569 bool spte_set = false;
1571 lockdep_assert_held_read(&kvm->mmu_lock);
1573 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1574 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1575 slot->base_gfn + slot->npages);
1581 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1582 * set in mask, starting at gfn. The given memslot is expected to contain all
1583 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1584 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1585 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1587 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1588 gfn_t gfn, unsigned long mask, bool wrprot)
1590 u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1592 struct tdp_iter iter;
1594 lockdep_assert_held_write(&kvm->mmu_lock);
1598 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1599 gfn + BITS_PER_LONG) {
1603 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1604 spte_ad_need_write_protect(iter.old_spte));
1606 if (iter.level > PG_LEVEL_4K ||
1607 !(mask & (1UL << (iter.gfn - gfn))))
1610 mask &= ~(1UL << (iter.gfn - gfn));
1612 if (!(iter.old_spte & dbit))
1615 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1616 iter.old_spte, dbit,
1619 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1621 iter.old_spte & ~dbit);
1622 kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1629 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1630 * set in mask, starting at gfn. The given memslot is expected to contain all
1631 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1632 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1633 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1635 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1636 struct kvm_memory_slot *slot,
1637 gfn_t gfn, unsigned long mask,
1640 struct kvm_mmu_page *root;
1642 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1643 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1646 static void zap_collapsible_spte_range(struct kvm *kvm,
1647 struct kvm_mmu_page *root,
1648 const struct kvm_memory_slot *slot)
1650 gfn_t start = slot->base_gfn;
1651 gfn_t end = start + slot->npages;
1652 struct tdp_iter iter;
1653 int max_mapping_level;
1657 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1659 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1662 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1663 !is_shadow_present_pte(iter.old_spte))
1667 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1668 * a large page size, then its parent would have been zapped
1669 * instead of stepping down.
1671 if (is_last_spte(iter.old_spte, iter.level))
1675 * If iter.gfn resides outside of the slot, i.e. the page for
1676 * the current level overlaps but is not contained by the slot,
1677 * then the SPTE can't be made huge. More importantly, trying
1678 * to query that info from slot->arch.lpage_info will cause an
1679 * out-of-bounds access.
1681 if (iter.gfn < start || iter.gfn >= end)
1684 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1685 iter.gfn, PG_LEVEL_NUM);
1686 if (max_mapping_level < iter.level)
1689 /* Note, a successful atomic zap also does a remote TLB flush. */
1690 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1698 * Zap non-leaf SPTEs (and free their associated page tables) which could
1699 * be replaced by huge pages, for GFNs within the slot.
1701 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1702 const struct kvm_memory_slot *slot)
1704 struct kvm_mmu_page *root;
1706 lockdep_assert_held_read(&kvm->mmu_lock);
1708 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1709 zap_collapsible_spte_range(kvm, root, slot);
1713 * Removes write access on the last level SPTE mapping this GFN and unsets the
1714 * MMU-writable bit to ensure future writes continue to be intercepted.
1715 * Returns true if an SPTE was set and a TLB flush is needed.
1717 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1718 gfn_t gfn, int min_level)
1720 struct tdp_iter iter;
1722 bool spte_set = false;
1724 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1728 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1729 if (!is_shadow_present_pte(iter.old_spte) ||
1730 !is_last_spte(iter.old_spte, iter.level))
1733 new_spte = iter.old_spte &
1734 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1736 if (new_spte == iter.old_spte)
1739 tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1749 * Removes write access on the last level SPTE mapping this GFN and unsets the
1750 * MMU-writable bit to ensure future writes continue to be intercepted.
1751 * Returns true if an SPTE was set and a TLB flush is needed.
1753 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1754 struct kvm_memory_slot *slot, gfn_t gfn,
1757 struct kvm_mmu_page *root;
1758 bool spte_set = false;
1760 lockdep_assert_held_write(&kvm->mmu_lock);
1761 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1762 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1768 * Return the level of the lowest level SPTE added to sptes.
1769 * That SPTE may be non-present.
1771 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1773 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1776 struct tdp_iter iter;
1777 struct kvm_mmu *mmu = vcpu->arch.mmu;
1778 gfn_t gfn = addr >> PAGE_SHIFT;
1781 *root_level = vcpu->arch.mmu->root_role.level;
1783 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1785 sptes[leaf] = iter.old_spte;
1792 * Returns the last level spte pointer of the shadow page walk for the given
1793 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1794 * walk could be performed, returns NULL and *spte does not contain valid data.
1797 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1798 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1800 * WARNING: This function is only intended to be called during fast_page_fault.
1802 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1805 struct tdp_iter iter;
1806 struct kvm_mmu *mmu = vcpu->arch.mmu;
1807 gfn_t gfn = addr >> PAGE_SHIFT;
1808 tdp_ptep_t sptep = NULL;
1810 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1811 *spte = iter.old_spte;
1816 * Perform the rcu_dereference to get the raw spte pointer value since
1817 * we are passing it up to fast_page_fault, which is shared with the
1818 * legacy MMU and thus does not retain the TDP MMU-specific __rcu annotation.
1821 * This is safe since fast_page_fault obeys the contracts of this
1822 * function as well as all TDP MMU contracts around modifying SPTEs
1823 * outside of mmu_lock.
1825 return rcu_dereference(sptep);