1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5 #include "mmu_internal.h"
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
14 /* Initializes the TDP MMU for the VM, if enabled. */
/*
 * Set up per-VM TDP MMU state: the workqueue used to asynchronously zap
 * invalidated roots, the list of active roots, and the spinlock guarding
 * the TDP MMU page lists.
 *
 * NOTE(review): lines appear to be missing from this view (no '{', no
 * error check after alloc_workqueue(), no return statement) -- confirm
 * the allocation failure path against the full file.
 */
15 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
17 struct workqueue_struct *wq;
/* WQ_MEM_RECLAIM: zapping may be needed to make forward progress under reclaim. */
19 wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
23 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
24 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
25 kvm->arch.tdp_mmu_zap_wq = wq;
29 /* Arbitrarily returns true so that this may be used in if statements. */
30 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
/* Shared mode => mmu_lock must be held for read; exclusive => for write. */
34 lockdep_assert_held_read(&kvm->mmu_lock);
36 lockdep_assert_held_write(&kvm->mmu_lock);
/*
 * Tear down all TDP MMU state for a dying VM: invalidate and zap every
 * root, then wait for outstanding zap work and RCU callbacks.
 */
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
44 * Invalidate all roots, which besides the obvious, schedules all roots
45 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
46 * ultimately frees all roots.
48 kvm_tdp_mmu_invalidate_all_roots(kvm);
51 * Destroying a workqueue also first flushes the workqueue, i.e. no
52 * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
54 destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
/* By this point all pages and roots must have been freed. */
56 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
57 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
60 * Ensure that all the outstanding RCU callbacks to free shadow pages
61 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
62 * can call kvm_tdp_mmu_put_root and create new callbacks.
/* Free a shadow page: its page-table page and its kvm_mmu_page header. */
67 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
69 free_page((unsigned long)sp->spt);
70 kmem_cache_free(mmu_page_header_cache, sp);
74 * This is called through call_rcu in order to free TDP page table memory
75 * safely with respect to other kernel threads that may be operating on
77 * By only accessing TDP MMU page table memory in an RCU read critical
78 * section, and freeing it after a grace period, lockless access to that
79 * memory won't use it after it is freed.
81 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
/* Recover the kvm_mmu_page from its embedded rcu_head, then free it. */
83 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
89 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
/*
 * Workqueue callback that zaps an invalidated root and drops the TDP
 * MMU's reference to it.  Runs with mmu_lock held for read so that vCPUs
 * and other readers are not blocked while the root is torn down.
 */
92 static void tdp_mmu_zap_root_work(struct work_struct *work)
94 struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 struct kvm *kvm = root->tdp_mmu_async_data;
98 read_lock(&kvm->mmu_lock);
101 * A TLB flush is not necessary as KVM performs a local TLB flush when
102 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
103 * to a different pCPU. Note, the local TLB flush on reuse also
104 * invalidates any paging-structure-cache entries, i.e. TLB entries for
105 * intermediate paging structures, that may be zapped, as such entries
106 * are associated with the ASID on both VMX and SVM.
108 tdp_mmu_zap_root(kvm, root, true);
111 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
112 * avoiding an infinite loop. By design, the root is reachable while
113 * it's being asynchronously zapped, thus a different task can put its
114 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
115 * asynchronously zapped root is unavoidable.
117 kvm_tdp_mmu_put_root(kvm, root, true);
119 read_unlock(&kvm->mmu_lock);
/*
 * Queue asynchronous zapping of @root on the VM's dedicated workqueue.
 * The kvm pointer is stashed on the root so the work callback can find it.
 */
122 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124 root->tdp_mmu_async_data = kvm;
125 INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
126 queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
/*
 * Drop a reference to @root; on the final put, unlink the root from the
 * roots list and free it after an RCU grace period (lockless walkers may
 * still hold a pointer to it).
 */
129 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
132 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
134 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
138 * The TDP MMU itself holds a reference to each root until the root is
139 * explicitly invalidated, i.e. the final reference should never be
140 * put for a valid root.
142 KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
/* tdp_mmu_pages_lock serializes list mutation against readers under RCU. */
144 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
145 list_del_rcu(&root->link);
146 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
147 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
151 * Returns the next root after @prev_root (or the first root if @prev_root is
152 * NULL). A reference to the returned root is acquired, and the reference to
153 * @prev_root is released (the caller obviously must hold a reference to
154 * @prev_root if it's non-NULL).
156 * If @only_valid is true, invalid roots are skipped.
158 * Returns NULL if the end of tdp_mmu_roots was reached.
160 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
161 struct kvm_mmu_page *prev_root,
162 bool shared, bool only_valid)
164 struct kvm_mmu_page *next_root;
/* Resume after the previous root, or start from the head of the list. */
169 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
171 typeof(*prev_root), link);
173 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
174 typeof(*next_root), link);
/*
 * Stop at the first root that passes the validity filter AND whose
 * refcount could be elevated; a root whose count already hit zero is
 * being torn down and must be skipped.
 */
177 if ((!only_valid || !next_root->role.invalid) &&
178 kvm_tdp_mmu_get_root(next_root))
181 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
182 &next_root->link, typeof(*next_root), link);
/* Drop the caller's reference to the root it just finished with. */
188 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
194 * Note: this iterator gets and puts references to the roots it iterates over.
195 * This makes it safe to release the MMU lock and yield within the loop, but
196 * if exiting the loop early, the caller must drop the reference to the most
197 * recent root. (Unless keeping a live reference is desirable.)
199 * If shared is set, this function is operating under the MMU lock in read
200 * mode. In the unlikely event that this thread must free a root, the lock
201 * will be temporarily dropped and reacquired in write mode.
/*
 * The trailing "if" skips roots belonging to a different address space
 * than @_as_id; the lockdep helper always returns true and exists only
 * to assert the correct mmu_lock mode on every iteration.
 */
203 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
204 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
206 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
207 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
208 kvm_mmu_page_as_id(_root) != _as_id) { \
211 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
212 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
214 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
215 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
218 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
219 * the implication being that any flow that holds mmu_lock for read is
220 * inherently yield-friendly and should use the yield-safe variant above.
221 * Holding mmu_lock for write obviates the need for RCU protection as the list
222 * is guaranteed to be stable.
/* As above, the trailing "if" filters out roots from other address spaces. */
224 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
225 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
226 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
227 kvm_mmu_page_as_id(_root) != _as_id) { \
/*
 * Allocate a shadow page (header + page-table page) from the vCPU's
 * pre-filled memory caches; cache allocation cannot fail or sleep here.
 */
230 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
232 struct kvm_mmu_page *sp;
234 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
235 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
/*
 * Initialize a freshly allocated shadow page with its parent SPTE
 * pointer, base gfn, and role, and mark it as a TDP MMU page.
 */
240 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
241 gfn_t gfn, union kvm_mmu_page_role role)
243 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
/* Stash the kvm_mmu_page pointer in the struct page for sptep_to_sp(). */
245 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
250 sp->tdp_mmu_page = true;
252 trace_kvm_mmu_get_page(sp, true);
/*
 * Initialize @child_sp as a child page table one level below the page
 * containing the SPTE that @iter currently points at, deriving its role
 * from the parent's.
 */
255 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
256 struct tdp_iter *iter)
258 struct kvm_mmu_page *parent_sp;
259 union kvm_mmu_page_role role;
261 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
263 role = parent_sp->role;
266 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
/*
 * Return the physical address of a TDP MMU root matching the vCPU's
 * current MMU role, reusing an existing valid root when possible and
 * allocating (and publishing) a new one otherwise.
 */
269 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
271 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
272 struct kvm *kvm = vcpu->kvm;
273 struct kvm_mmu_page *root;
275 lockdep_assert_held_write(&kvm->mmu_lock);
278 * Check for an existing root before allocating a new one. Note, the
279 * role check prevents consuming an invalid root.
281 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
282 if (root->role.word == role.word &&
283 kvm_tdp_mmu_get_root(root))
287 root = tdp_mmu_alloc_sp(vcpu);
288 tdp_mmu_init_sp(root, NULL, 0, role);
291 * TDP MMU roots are kept until they are explicitly invalidated, either
292 * by a memslot update or by the destruction of the VM. Initialize the
293 * refcount to two; one reference for the vCPU, and one reference for
294 * the TDP MMU itself, which is held until the root is invalidated and
295 * is ultimately put by tdp_mmu_zap_root_work().
297 refcount_set(&root->tdp_mmu_root_count, 2);
/* Publish the new root; lockless walkers traverse the list under RCU. */
299 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
300 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
301 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
304 return __pa(root->spt);
307 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
308 u64 old_spte, u64 new_spte, int level,
/* Account a newly linked shadow page in cgroup and per-VM counters. */
311 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
313 kvm_account_pgtable_pages((void *)sp->spt, +1);
314 atomic64_inc(&kvm->arch.tdp_mmu_pages);
/* Mirror of tdp_account_mmu_page() for a page being unlinked. */
317 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
319 kvm_account_pgtable_pages((void *)sp->spt, -1);
320 atomic64_dec(&kvm->arch.tdp_mmu_pages);
324 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
327 * @sp: the page to be removed
328 * @shared: This operation may not be running under the exclusive use of
329 * the MMU lock and the operation must synchronize with other
330 * threads that might be adding or removing pages.
332 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
335 tdp_unaccount_mmu_page(kvm, sp);
/* Nothing else to do unless the page was flagged for NX hugepage tracking. */
337 if (!sp->nx_huge_page_disallowed)
/*
 * In shared mode take the pages spinlock; in exclusive mode mmu_lock
 * (write) already serializes, so only assert it is held.
 */
341 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
343 lockdep_assert_held_write(&kvm->mmu_lock);
345 sp->nx_huge_page_disallowed = false;
346 untrack_possible_nx_huge_page(kvm, sp);
349 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
353 * handle_removed_pt() - handle a page table removed from the TDP structure
356 * @pt: the page removed from the paging structure
357 * @shared: This operation may not be running under the exclusive use
358 * of the MMU lock and the operation must synchronize with other
359 * threads that might be modifying SPTEs.
361 * Given a page table that has been removed from the TDP paging structure,
362 * iterates through the page table to clear SPTEs and free child page tables.
364 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
365 * protection. Since this thread removed it from the paging structure,
366 * this thread will be responsible for ensuring the page is freed. Hence the
367 * early rcu_dereferences in the function.
369 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
371 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
372 int level = sp->role.level;
373 gfn_t base_gfn = sp->gfn;
376 trace_kvm_mmu_prepare_zap_page(sp);
378 tdp_mmu_unlink_sp(kvm, sp, shared);
/* Zap every entry in the removed table, recursing into child tables. */
380 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
381 tdp_ptep_t sptep = pt + i;
382 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
387 * Set the SPTE to a nonpresent value that other
388 * threads will not overwrite. If the SPTE was
389 * already marked as removed then another thread
390 * handling a page fault could overwrite it, so
391 * set the SPTE until it is set from some other
392 * value to the removed SPTE value.
395 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
396 if (!is_removed_spte(old_spte))
402 * If the SPTE is not MMU-present, there is no backing
403 * page associated with the SPTE and so no side effects
404 * that need to be recorded, and exclusive ownership of
405 * mmu_lock ensures the SPTE can't be made present.
406 * Note, zapping MMIO SPTEs is also unnecessary as they
407 * are guarded by the memslots generation, not by being
410 old_spte = kvm_tdp_mmu_read_spte(sptep);
411 if (!is_shadow_present_pte(old_spte))
415 * Use the common helper instead of a raw WRITE_ONCE as
416 * the SPTE needs to be updated atomically if it can be
417 * modified by a different vCPU outside of mmu_lock.
418 * Even though the parent SPTE is !PRESENT, the TLB
419 * hasn't yet been flushed, and both Intel and AMD
420 * document that A/D assists can use upper-level PxE
421 * entries that are cached in the TLB, i.e. the CPU can
422 * still access the page and mark it dirty.
424 * No retry is needed in the atomic update path as the
425 * sole concern is dropping a Dirty bit, i.e. no other
426 * task can zap/remove the SPTE as mmu_lock is held for
427 * write. Marking the SPTE as a removed SPTE is not
428 * strictly necessary for the same reason, but using
429 * the remove SPTE value keeps the shared/exclusive
430 * paths consistent and allows the handle_changed_spte()
431 * call below to hardcode the new value to REMOVED_SPTE.
433 * Note, even though dropping a Dirty bit is the only
434 * scenario where a non-atomic update could result in a
435 * functional bug, simply checking the Dirty bit isn't
436 * sufficient as a fast page fault could read the upper
437 * level SPTE before it is zapped, and then make this
438 * target SPTE writable, resume the guest, and set the
439 * Dirty bit between reading the SPTE above and writing
442 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
443 REMOVED_SPTE, level);
445 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
446 old_spte, REMOVED_SPTE, level, shared);
/* Free the table itself after a grace period; lockless walkers may hold it. */
449 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
453 * handle_changed_spte - handle bookkeeping associated with an SPTE change
455 * @as_id: the address space of the paging structure the SPTE was a part of
456 * @gfn: the base GFN that was mapped by the SPTE
457 * @old_spte: The value of the SPTE before the change
458 * @new_spte: The value of the SPTE after the change
459 * @level: the level of the PT the SPTE is part of in the paging structure
460 * @shared: This operation may not be running under the exclusive use of
461 * the MMU lock and the operation must synchronize with other
462 * threads that might be modifying SPTEs.
464 * Handle bookkeeping that might result from the modification of a SPTE. Note,
465 * dirty logging updates are handled in common code, not here (see make_spte()
466 * and fast_pf_fix_direct_spte()).
468 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
469 u64 old_spte, u64 new_spte, int level,
472 bool was_present = is_shadow_present_pte(old_spte);
473 bool is_present = is_shadow_present_pte(new_spte);
474 bool was_leaf = was_present && is_last_spte(old_spte, level);
475 bool is_leaf = is_present && is_last_spte(new_spte, level);
476 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
/* Sanity-check level bounds and gfn alignment for the given level. */
478 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
479 WARN_ON(level < PG_LEVEL_4K);
480 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
483 * If this warning were to trigger it would indicate that there was a
484 * missing MMU notifier or a race with some notifier handler.
485 * A present, leaf SPTE should never be directly replaced with another
486 * present leaf SPTE pointing to a different PFN. A notifier handler
487 * should be zapping the SPTE before the main MM's page table is
488 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
489 * thread before replacement.
491 if (was_leaf && is_leaf && pfn_changed) {
492 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
493 "SPTE with another present leaf SPTE mapping a\n"
495 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
496 as_id, gfn, old_spte, new_spte, level);
499 * Crash the host to prevent error propagation and guest data
/* A no-op change requires no bookkeeping. */
505 if (old_spte == new_spte)
508 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
511 check_spte_writable_invariants(new_spte);
514 * The only times a SPTE should be changed from a non-present to
515 * non-present state is when an MMIO entry is installed/modified/
516 * removed. In that case, there is nothing to do here.
518 if (!was_present && !is_present) {
520 * If this change does not involve a MMIO SPTE or removed SPTE,
521 * it is unexpected. Log the change, though it should not
522 * impact the guest since both the former and current SPTEs
525 if (WARN_ON(!is_mmio_spte(old_spte) &&
526 !is_mmio_spte(new_spte) &&
527 !is_removed_spte(new_spte)))
528 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
529 "should not be replaced with another,\n"
530 "different nonpresent SPTE, unless one or both\n"
531 "are MMIO SPTEs, or the new SPTE is\n"
532 "a temporary removed SPTE.\n"
533 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
534 as_id, gfn, old_spte, new_spte, level);
/* Keep the per-level mapped-page statistics in sync. */
538 if (is_leaf != was_leaf)
539 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
/* Propagate a Dirty bit being dropped to the backing struct page. */
541 if (was_leaf && is_dirty_spte(old_spte) &&
542 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
543 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
546 * Recursively handle child PTs if the change removed a subtree from
547 * the paging structure. Note the WARN on the PFN changing without the
548 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
549 * pages are kernel allocations and should never be migrated.
551 if (was_present && !was_leaf &&
552 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
553 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
/* Likewise propagate a dropped Accessed bit. */
555 if (was_leaf && is_accessed_spte(old_spte) &&
556 (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
557 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
561 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
562 * and handle the associated bookkeeping. Do not mark the page dirty
563 * in KVM's dirty bitmaps.
565 * If setting the SPTE fails because it has changed, iter->old_spte will be
566 * refreshed to the current value of the spte.
569 * @iter: a tdp_iter instance currently on the SPTE that should be set
570 * @new_spte: The value the SPTE should be set to
572 * * 0 - If the SPTE was set.
573 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
574 * no side-effects other than setting iter->old_spte to the last
575 * known value of the spte.
577 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
578 struct tdp_iter *iter,
581 u64 *sptep = rcu_dereference(iter->sptep);
584 * The caller is responsible for ensuring the old SPTE is not a REMOVED
585 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
586 * and pre-checking before inserting a new SPTE is advantageous as it
587 * avoids unnecessary work.
589 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
591 lockdep_assert_held_read(&kvm->mmu_lock);
594 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
595 * does not hold the mmu_lock.
/* On cmpxchg failure, try_cmpxchg64 updates iter->old_spte in place. */
597 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
600 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
601 new_spte, iter->level, true);
/*
 * Atomically zap the SPTE at @iter: freeze it with REMOVED_SPTE, flush
 * remote TLBs, then clear it to zero.  Returns 0 on success, -EBUSY if
 * the SPTE changed underfoot.
 */
606 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
607 struct tdp_iter *iter)
612 * Freeze the SPTE by setting it to a special,
613 * non-present value. This will stop other threads from
614 * immediately installing a present entry in its place
615 * before the TLBs are flushed.
617 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
621 kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
624 * No other thread can overwrite the removed SPTE as they must either
625 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
626 * overwrite the special removed SPTE value. No bookkeeping is needed
627 * here since the SPTE is going from non-present to non-present. Use
628 * the raw write helper to avoid an unnecessary check on volatile bits.
630 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
637 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
639 * @as_id: Address space ID, i.e. regular vs. SMM
640 * @sptep: Pointer to the SPTE
641 * @old_spte: The current value of the SPTE
642 * @new_spte: The new value that will be set for the SPTE
643 * @gfn: The base GFN that was (or will be) mapped by the SPTE
644 * @level: The level _containing_ the SPTE (its parent PT's level)
646 * Returns the old SPTE value, which _may_ be different than @old_spte if the
647 * SPTE had volatile bits.
649 static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
650 u64 old_spte, u64 new_spte, gfn_t gfn, int level)
/* Exclusive-mode path: mmu_lock must be held for write. */
652 lockdep_assert_held_write(&kvm->mmu_lock);
655 * No thread should be using this function to set SPTEs to or from the
656 * temporary removed SPTE value.
657 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
658 * should be used. If operating under the MMU lock in write mode, the
659 * use of the removed SPTE should not be necessary.
661 WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
663 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
665 handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
/*
 * Convenience wrapper for tdp_mmu_set_spte() that pulls its arguments
 * from @iter and refreshes iter->old_spte with the value actually read.
 */
669 static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
672 WARN_ON_ONCE(iter->yielded);
673 iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
674 iter->old_spte, new_spte,
675 iter->gfn, iter->level);
/* Walk every SPTE of @_root in the gfn range [_start, _end). */
678 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
679 for_each_tdp_pte(_iter, _root, _start, _end)
/* As above, but the continue-filter skips non-present and non-leaf SPTEs. */
681 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
682 tdp_root_for_each_pte(_iter, _root, _start, _end) \
683 if (!is_shadow_present_pte(_iter.old_spte) || \
684 !is_last_spte(_iter.old_spte, _iter.level)) \
/* Walk SPTEs starting from the root currently loaded in @_mmu. */
688 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
689 for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
692 * Yield if the MMU lock is contended or this thread needs to return control
695 * If this function should yield and flush is set, it will perform a remote
696 * TLB flush before yielding.
698 * If this function yields, iter->yielded is set and the caller must skip to
699 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
700 * over the paging structures to allow the iterator to continue its traversal
701 * from the paging structure root.
703 * Returns true if this function yielded.
705 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
706 struct tdp_iter *iter,
707 bool flush, bool shared)
709 WARN_ON(iter->yielded);
711 /* Ensure forward progress has been made before yielding. */
712 if (iter->next_last_level_gfn == iter->yielded_gfn)
715 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
717 kvm_flush_remote_tlbs(kvm);
/* Reschedule with mmu_lock held in the caller's mode (read vs write). */
722 cond_resched_rwlock_read(&kvm->mmu_lock);
724 cond_resched_rwlock_write(&kvm->mmu_lock);
728 WARN_ON(iter->gfn > iter->next_last_level_gfn);
730 iter->yielded = true;
733 return iter->yielded;
/* Exclusive upper bound on gfns the TDP MMU will ever walk. */
736 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
739 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
740 * a gpa range that would exceed the max gfn, and KVM does not create
741 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
742 * the slow emulation path every time.
744 return kvm_mmu_max_gfn() + 1;
/*
 * Zap all present SPTEs at exactly @zap_level under @root, yielding as
 * needed.  In exclusive mode SPTEs are cleared directly; in shared mode
 * the atomic (cmpxchg-based) path is used.
 */
747 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
748 bool shared, int zap_level)
750 struct tdp_iter iter;
752 gfn_t end = tdp_mmu_max_gfn_exclusive();
755 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
757 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
760 if (!is_shadow_present_pte(iter.old_spte))
/* Only zap at exactly zap_level; higher levels are handled by the caller. */
763 if (iter.level > zap_level)
767 tdp_mmu_iter_set_spte(kvm, &iter, 0);
768 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
/*
 * Zap the entire paging structure under @root in two passes (1GiB level
 * first, then top level) to bound the work done under RCU at each step.
 */
773 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
778 * The root must have an elevated refcount so that it's reachable via
779 * mmu_notifier callbacks, which allows this path to yield and drop
780 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
781 * must drop all references to relevant pages prior to completing the
782 * callback. Dropping mmu_lock with an unreachable root would result
783 * in zapping SPTEs after a relevant mmu_notifier callback completes
784 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
785 * dirty accessed bits to the SPTE's associated struct page.
787 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
789 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
794 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
795 * split the zap into two passes. On the first pass, zap at the 1gb
796 * level, and then zap top-level SPs on the second pass. "1gb" is not
797 * arbitrary, as KVM must be able to zap a 1gb shadow page without
798 * inducing a stall to allow in-place replacement with a 1gb hugepage.
800 * Because zapping a SP recurses on its children, stepping down to
801 * PG_LEVEL_4K in the iterator itself is unnecessary.
803 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
804 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
/*
 * Zap the parent SPTE pointing at @sp (and thus the subtree under it).
 * Returns whether anything was actually zapped.
 */
809 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
814 * This helper intentionally doesn't allow zapping a root shadow page,
815 * which doesn't have a parent page table and thus no associated entry.
817 if (WARN_ON_ONCE(!sp->ptep))
820 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
821 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
/* sp->role.level + 1 == the level of the parent table containing the SPTE. */
824 tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
825 sp->gfn, sp->role.level + 1);
/*
 * Zap only the leaf SPTEs of @root in [start, end), returning whether a
 * TLB flush is now required.
 *
831 * If can_yield is true, will release the MMU lock and reschedule if the
832 * scheduler needs the CPU or there is contention on the MMU lock. If this
833 * function cannot yield, it will not release the MMU lock or reschedule and
834 * the caller must ensure it does not supply too large a GFN range, or the
835 * operation can cause a soft lockup.
 */
837 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
838 gfn_t start, gfn_t end, bool can_yield, bool flush)
840 struct tdp_iter iter;
842 end = min(end, tdp_mmu_max_gfn_exclusive());
844 lockdep_assert_held_write(&kvm->mmu_lock);
848 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
850 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
/* Skip anything that isn't a present leaf mapping. */
855 if (!is_shadow_present_pte(iter.old_spte) ||
856 !is_last_spte(iter.old_spte, iter.level))
859 tdp_mmu_iter_set_spte(kvm, &iter, 0);
866 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
867 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
873 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
874 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
875 * more SPTEs were zapped since the MMU lock was last acquired.
877 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
878 bool can_yield, bool flush)
880 struct kvm_mmu_page *root;
/* Accumulate the flush requirement across every root in @as_id. */
882 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
883 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
/*
 * Zap every SPTE under every root, across all address spaces.  Used on
 * VM teardown / VMM exit, so no TLB flush is required (see below).
 */
888 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
890 struct kvm_mmu_page *root;
894 * Zap all roots, including invalid roots, as all SPTEs must be dropped
895 * before returning to the caller. Zap directly even if the root is
896 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
897 * all that expensive and mmu_lock is already held, which means the
898 * worker has yielded, i.e. flushing the work instead of zapping here
899 * isn't guaranteed to be any faster.
901 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
902 * is being destroyed or the userspace VMM has exited. In both cases,
903 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
905 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
906 for_each_tdp_mmu_root_yield_safe(kvm, root, i)
907 tdp_mmu_zap_root(kvm, root, false);
912 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
/* Flushing the workqueue waits for every queued tdp_mmu_zap_root_work(). */
915 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
917 flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
921 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
922 * is about to be zapped, e.g. in response to a memslots update. The actual
923 * zapping is performed asynchronously. Using a separate workqueue makes it
924 * easy to ensure that the destruction is performed before the "fast zap"
925 * completes, without keeping a separate list of invalidated roots; the list is
926 * effectively the list of work items in the workqueue.
928 * Note, the asynchronous worker is gifted the TDP MMU's reference.
929 * See kvm_tdp_mmu_get_vcpu_root_hpa().
931 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
933 struct kvm_mmu_page *root;
936 * mmu_lock must be held for write to ensure that a root doesn't become
937 * invalid while there are active readers (invalidating a root while
938 * there are active readers may or may not be problematic in practice,
939 * but it's uncharted territory and not supported).
941 * Waive the assertion if there are no users of @kvm, i.e. the VM is
942 * being destroyed after all references have been put, or if no vCPUs
943 * have been created (which means there are no roots), i.e. the VM is
944 * being destroyed in an error path of KVM_CREATE_VM.
946 if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
947 refcount_read(&kvm->users_count) && kvm->created_vcpus)
948 lockdep_assert_held_write(&kvm->mmu_lock);
951 * As above, mmu_lock isn't held when destroying the VM! There can't
952 * be other references to @kvm, i.e. nothing else can invalidate roots
953 * or be consuming roots, but walking the list of roots does need to be
954 * guarded against roots being deleted by the asynchronous zap worker.
/* Only schedule work for roots not already marked invalid. */
958 list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
959 if (!root->role.invalid) {
960 root->role.invalid = true;
961 tdp_mmu_schedule_zap_root(kvm, root);
969 * Installs a last-level SPTE to handle a TDP page fault.
970 * (NPT/EPT violation/misconfiguration)
 *
 * Returns a RET_PF_* code: FIXED, SPURIOUS, RETRY (atomic set lost the
 * race), or EMULATE (write-protected page or MMIO SPTE installed).
972 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
973 struct kvm_page_fault *fault,
974 struct tdp_iter *iter)
976 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
978 int ret = RET_PF_FIXED;
981 if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
/* No memslot => the access is MMIO; install an MMIO SPTE instead. */
984 if (unlikely(!fault->slot))
985 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
987 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
988 fault->pfn, iter->old_spte, fault->prefetch, true,
989 fault->map_writable, &new_spte);
991 if (new_spte == iter->old_spte)
992 ret = RET_PF_SPURIOUS;
993 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
/* Flush when replacing a present non-leaf SPTE with a leaf mapping. */
995 else if (is_shadow_present_pte(iter->old_spte) &&
996 !is_last_spte(iter->old_spte, iter->level))
997 kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1000 * If the page fault was caused by a write but the page is write
1001 * protected, emulation is needed. If the emulation was skipped,
1002 * the vCPU would have the same fault again.
1006 ret = RET_PF_EMULATE;
1009 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1010 if (unlikely(is_mmio_spte(new_spte))) {
1011 vcpu->stat.pf_mmio_spte_created++;
1012 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1014 ret = RET_PF_EMULATE;
1016 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1017 rcu_dereference(iter->sptep));
1024 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1025 * provided page table.
1027 * @kvm: kvm instance
1028 * @iter: a tdp_iter instance currently on the SPTE that should be set
1029 * @sp: The new TDP page table to install.
1030 * @shared: This operation is running under the MMU lock in read mode.
1032 * Returns: 0 if the new page table was installed. Non-0 if the page table
1033 * could not be installed (e.g. the atomic compare-exchange failed).
1035 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1036 struct kvm_mmu_page *sp, bool shared)
/* Build a non-leaf SPTE pointing at the new table's page. */
1038 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1042 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1046 tdp_mmu_iter_set_spte(kvm, iter, spte);
/* Account the page only after it has actually been linked. */
1049 tdp_account_mmu_page(kvm, sp);
/* Forward declaration; definition appears below, needed by kvm_tdp_mmu_map(). */
1054 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1055 struct kvm_mmu_page *sp, bool shared);
1058 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1059 * page tables and SPTEs to translate the faulting guest physical address.
1061 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1063 struct kvm_mmu *mmu = vcpu->arch.mmu;
1064 struct kvm *kvm = vcpu->kvm;
1065 struct tdp_iter iter;
1066 struct kvm_mmu_page *sp;
/* Default: make the vCPU retry the access unless the fault is fully handled. */
1067 int ret = RET_PF_RETRY;
/* Pick the largest page size the fault can legitimately be mapped with. */
1069 kvm_mmu_hugepage_adjust(vcpu, fault);
1071 trace_kvm_mmu_spte_requested(fault);
/* Walk the paging structure from the root toward fault->gfn's leaf SPTE. */
1075 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
/* NX huge page workaround may demote the goal level for executable faults. */
1078 if (fault->nx_huge_page_workaround_enabled)
1079 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1082 * If SPTE has been frozen by another thread, just give up and
1083 * retry, avoiding unnecessary page table allocation and free.
1085 if (is_removed_spte(iter.old_spte))
1088 if (iter.level == fault->goal_level)
1089 goto map_target_level;
1091 /* Step down into the lower level page table if it exists. */
1092 if (is_shadow_present_pte(iter.old_spte) &&
1093 !is_large_pte(iter.old_spte))
1097 * The SPTE is either non-present or points to a huge page that
1098 * needs to be split.
1100 sp = tdp_mmu_alloc_sp(vcpu);
1101 tdp_mmu_init_child_sp(sp, &iter);
1103 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
/* Present SPTE here is a huge page: split it; otherwise link a fresh table. */
1105 if (is_shadow_present_pte(iter.old_spte))
1106 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1108 r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1111 * Force the guest to retry if installing an upper level SPTE
1112 * failed, e.g. because a different task modified the SPTE.
1115 tdp_mmu_free_sp(sp);
/* Track pages kept small by the NX workaround so they can be recovered later. */
1119 if (fault->huge_page_disallowed &&
1120 fault->req_level >= iter.level) {
1121 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1122 if (sp->nx_huge_page_disallowed)
1123 track_possible_nx_huge_page(kvm, sp);
1124 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1129 * The walk aborted before reaching the target level, e.g. because the
1130 * iterator detected an upper level SPTE was frozen during traversal.
1132 WARN_ON_ONCE(iter.level == fault->goal_level);
/* map_target_level: install the final, last-level SPTE for the fault. */
1136 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
/* MMU notifier unmap hook: zap leaf SPTEs in [start, end) for the range's
 * address space. Returns whether a TLB flush is needed (accumulated in flush). */
1143 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1146 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1147 range->end, range->may_block, flush);
/* Per-SPTE callback type used by kvm_tdp_mmu_handle_gfn(). */
1150 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1151 struct kvm_gfn_range *range);
/* Apply @handler to every leaf SPTE in @range across all roots in the range's
 * address space; ORs the handlers' return values together. */
1153 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1154 struct kvm_gfn_range *range,
1155 tdp_handler_t handler)
1157 struct kvm_mmu_page *root;
1158 struct tdp_iter iter;
1162 * Don't support rescheduling, none of the MMU notifiers that funnel
1163 * into this helper allow blocking; it'd be dead, wasteful code.
1165 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1168 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1169 ret |= handler(kvm, &iter, range);
1178 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1179 * if any of the GFNs in the range have been accessed.
1181 * No need to mark the corresponding PFN as accessed as this call is coming
1182 * from the clear_young() or clear_flush_young() notifier, which uses the
1183 * return value to determine if the page has been accessed.
1185 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1186 struct kvm_gfn_range *range)
1190 /* If we have a non-accessed entry we don't need to change the pte. */
1191 if (!is_accessed_spte(iter->old_spte))
/* Hardware A/D bits available: just clear the accessed bit atomically. */
1194 if (spte_ad_enabled(iter->old_spte)) {
1195 iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1197 shadow_accessed_mask,
1199 new_spte = iter->old_spte & ~shadow_accessed_mask;
1202 * Capture the dirty status of the page, so that it doesn't get
1203 * lost when the SPTE is marked for access tracking.
1205 if (is_writable_pte(iter->old_spte))
1206 kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
/* No A/D bits: emulate "young" tracking by making the SPTE access-tracked. */
1208 new_spte = mark_spte_for_access_track(iter->old_spte);
1209 iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1210 iter->old_spte, new_spte,
1214 trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1215 iter->old_spte, new_spte);
/* MMU notifier "clear young" hook: age all leaf SPTEs in the range. */
1219 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1221 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
/* Handler for the "test young" notifier: report, without modifying, whether
 * the SPTE has been accessed. */
1224 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1225 struct kvm_gfn_range *range)
1227 return is_accessed_spte(iter->old_spte);
/* MMU notifier "test young" hook: true if any SPTE in the range was accessed. */
1230 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1232 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
/* Handler for the changed_pte notifier: remap a single 4K GFN to the new host
 * PFN carried in range->pte, zapping the old SPTE first. */
1235 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1236 struct kvm_gfn_range *range)
1240 /* Huge pages aren't expected to be modified without first being zapped. */
1241 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
/* Only 4K-level, present SPTEs are eligible for in-place remapping. */
1243 if (iter->level != PG_LEVEL_4K ||
1244 !is_shadow_present_pte(iter->old_spte))
1248 * Note, when changing a read-only SPTE, it's not strictly necessary to
1249 * zero the SPTE before setting the new PFN, but doing so preserves the
1250 * invariant that the PFN of a present * leaf SPTE can never change.
1251 * See handle_changed_spte().
1253 tdp_mmu_iter_set_spte(kvm, iter, 0);
/* Only reinstall a mapping when the new host PTE is read-only. */
1255 if (!pte_write(range->pte)) {
1256 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1257 pte_pfn(range->pte));
1259 tdp_mmu_iter_set_spte(kvm, iter, new_spte);
1266 * Handle the changed_pte MMU notifier for the TDP MMU.
1267 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1269 * Returns non-zero if a flush is needed before releasing the MMU lock.
1271 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1274 * No need to handle the remote TLB flush under RCU protection, the
1275 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1276 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
1278 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1282 * Remove write access from all SPTEs at or above min_level that map GFNs
1283 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1286 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1287 gfn_t start, gfn_t end, int min_level)
1289 struct tdp_iter iter;
1291 bool spte_set = false;
1295 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1297 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
/* Runs under read lock (shared=true): yield if a resched is needed. */
1299 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
/* Skip non-present, non-leaf, and already write-protected SPTEs. */
1302 if (!is_shadow_present_pte(iter.old_spte) ||
1303 !is_last_spte(iter.old_spte, iter.level) ||
1304 !(iter.old_spte & PT_WRITABLE_MASK))
1307 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
/* cmpxchg may fail against a concurrent update; retry path not shown here. */
1309 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1320 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1321 * only affect leaf SPTEs down to min_level.
1322 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1324 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1325 const struct kvm_memory_slot *slot, int min_level)
1327 struct kvm_mmu_page *root;
1328 bool spte_set = false;
/* Runs under mmu_lock held for read; per-SPTE updates use atomic cmpxchg. */
1330 lockdep_assert_held_read(&kvm->mmu_lock);
1332 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1333 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1334 slot->base_gfn + slot->npages, min_level);
/* Allocate a shadow page (header + backing page-table page) with the given GFP
 * flags for huge-page splitting; frees the header and returns NULL-path on
 * failure of the page allocation. Caller initializes the sp. */
1339 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1341 struct kvm_mmu_page *sp;
1345 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1349 sp->spt = (void *)__get_free_page(gfp);
/* Page-table page allocation failed: release the header to avoid a leak. */
1351 kmem_cache_free(mmu_page_header_cache, sp);
/* Allocate an sp for splitting while mmu_lock is held (read mode if @shared).
 * First tries a non-blocking allocation; on failure drops the lock, allocates
 * with reclaim allowed, reacquires the lock, and marks the iterator yielded so
 * the caller restarts the walk. */
1358 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1359 struct tdp_iter *iter,
1362 struct kvm_mmu_page *sp;
1365 * Since we are allocating while under the MMU lock we have to be
1366 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1367 * reclaim and to avoid making any filesystem callbacks (which can end
1368 * up invoking KVM MMU notifiers, resulting in a deadlock).
1370 * If this allocation fails we drop the lock and retry with reclaim
1373 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
/* Drop whichever lock mode we hold before a potentially blocking allocation. */
1380 read_unlock(&kvm->mmu_lock);
1382 write_unlock(&kvm->mmu_lock);
/* Lock was dropped: the walk must restart from the root. */
1384 iter->yielded = true;
1385 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1388 read_lock(&kvm->mmu_lock);
1390 write_lock(&kvm->mmu_lock);
1397 /* Note, the caller is responsible for initializing @sp. */
1398 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1399 struct kvm_mmu_page *sp, bool shared)
1401 const u64 huge_spte = iter->old_spte;
1402 const int level = iter->level;
1406 * No need for atomics when writing to sp->spt since the page table has
1407 * not been linked in yet and thus is not reachable from any other CPU.
1409 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1410 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1413 * Replace the huge spte with a pointer to the populated lower level
1414 * page table. Since we are making this change without a TLB flush vCPUs
1415 * will see a mix of the split mappings and the original huge mapping,
1416 * depending on what's currently in their TLB. This is fine from a
1417 * correctness standpoint since the translation will be the same either
1420 ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1425 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1426 * are overwriting from the page stats. But we have to manually update
1427 * the page stats with the new present child pages.
1429 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
/* Traces the outcome (ret) whether the link succeeded or failed. */
1432 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
/* Split every huge page in [start, end) under @root down one level at a time
 * until everything maps at or below @target_level. @shared indicates mmu_lock
 * is held for read (atomic SPTE updates) rather than write. */
1436 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1437 struct kvm_mmu_page *root,
1438 gfn_t start, gfn_t end,
1439 int target_level, bool shared)
1441 struct kvm_mmu_page *sp = NULL;
1442 struct tdp_iter iter;
1448 * Traverse the page table splitting all huge pages above the target
1449 * level into one lower level. For example, if we encounter a 1GB page
1450 * we split it into 512 2MB pages.
1452 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1453 * to visit an SPTE before ever visiting its children, which means we
1454 * will correctly recursively split huge pages that are more than one
1455 * level above the target level (e.g. splitting a 1GB to 512 4KB pages,
1456 * and then splitting each of those to 512 4KB pages).
1458 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1460 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
/* Only present large (huge) SPTEs need splitting. */
1463 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
/* sp may be carried over from a previous iteration; allocate if needed. */
1467 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1470 trace_kvm_mmu_split_huge_page(iter.gfn,
1480 tdp_mmu_init_child_sp(sp, &iter);
/* A failed split (lost race) leaves sp available for the next candidate. */
1482 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1491 * It's possible to exit the loop having never used the last sp if, for
1492 * example, a vCPU doing HugePage NX splitting wins the race and
1493 * installs its own sp in place of the last sp we tried to split.
1496 tdp_mmu_free_sp(sp);
1503 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1505 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1506 const struct kvm_memory_slot *slot,
1507 gfn_t start, gfn_t end,
1508 int target_level, bool shared)
1510 struct kvm_mmu_page *root;
/* mmu_lock must be held: for read when @shared, for write otherwise. */
1513 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1515 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1516 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
/* Error path: drop the root reference taken by the iterator before bailing. */
1518 kvm_tdp_mmu_put_root(kvm, root, shared);
1525 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1526 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1527 * If AD bits are not enabled, this will require clearing the writable bit on
1528 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1531 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1532 gfn_t start, gfn_t end)
/* Which bit to clear: hardware dirty bit if A/D is enabled, else writable. */
1534 u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1535 struct tdp_iter iter;
1536 bool spte_set = false;
1540 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1542 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1545 if (!is_shadow_present_pte(iter.old_spte))
1548 MMU_WARN_ON(kvm_ad_enabled() &&
1549 spte_ad_need_write_protect(iter.old_spte));
/* Already clean (or already write-protected): nothing to do. */
1551 if (!(iter.old_spte & dbit))
/* Atomic clear; a lost cmpxchg race is retried via the iterator. */
1554 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1565 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1566 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1567 * If AD bits are not enabled, this will require clearing the writable bit on
1568 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1571 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1572 const struct kvm_memory_slot *slot)
1574 struct kvm_mmu_page *root;
1575 bool spte_set = false;
/* Runs under mmu_lock held for read; per-SPTE updates are atomic. */
1577 lockdep_assert_held_read(&kvm->mmu_lock);
1579 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1580 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1581 slot->base_gfn + slot->npages);
1587 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1588 * set in mask, starting at gfn. The given memslot is expected to contain all
1589 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1590 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1591 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1593 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1594 gfn_t gfn, unsigned long mask, bool wrprot)
/* wrprot forces write-protection even when hardware dirty bits exist. */
1596 u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1598 struct tdp_iter iter;
/* Start at the first set bit; mask covers at most BITS_PER_LONG GFNs. */
1602 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1603 gfn + BITS_PER_LONG) {
1607 MMU_WARN_ON(kvm_ad_enabled() &&
1608 spte_ad_need_write_protect(iter.old_spte));
/* Only 4K SPTEs whose GFN bit is set in the mask are candidates. */
1610 if (iter.level > PG_LEVEL_4K ||
1611 !(mask & (1UL << (iter.gfn - gfn))))
1614 mask &= ~(1UL << (iter.gfn - gfn));
1616 if (!(iter.old_spte & dbit))
1619 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1620 iter.old_spte, dbit,
1623 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1625 iter.old_spte & ~dbit);
/* The SPTE was dirty/writable: propagate dirty state to the backing PFN. */
1626 kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
1633 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1634 * set in mask, starting at gfn. The given memslot is expected to contain all
1635 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1636 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1637 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1639 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1640 struct kvm_memory_slot *slot,
1641 gfn_t gfn, unsigned long mask,
1644 struct kvm_mmu_page *root;
/* Unlike the slot-wide variant, this path requires the write lock. */
1646 lockdep_assert_held_write(&kvm->mmu_lock);
1647 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1648 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
/* Zap non-leaf SPTEs covering GFNs in @slot that could instead be mapped by a
 * huge page, so that subsequent faults reinstall huge mappings. */
1651 static void zap_collapsible_spte_range(struct kvm *kvm,
1652 struct kvm_mmu_page *root,
1653 const struct kvm_memory_slot *slot)
1655 gfn_t start = slot->base_gfn;
1656 gfn_t end = start + slot->npages;
1657 struct tdp_iter iter;
1658 int max_mapping_level;
/* Only 2M and above can collapse; no need to visit 4K SPTEs. */
1662 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1664 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1667 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1668 !is_shadow_present_pte(iter.old_spte))
1672 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1673 * a large page size, then its parent would have been zapped
1674 * instead of stepping down.
1676 if (is_last_spte(iter.old_spte, iter.level))
1680 * If iter.gfn resides outside of the slot, i.e. the page for
1681 * the current level overlaps but is not contained by the slot,
1682 * then the SPTE can't be made huge. More importantly, trying
1683 * to query that info from slot->arch.lpage_info will cause an
1684 * out-of-bounds access.
1686 if (iter.gfn < start || iter.gfn >= end)
1689 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1690 iter.gfn, PG_LEVEL_NUM);
1691 if (max_mapping_level < iter.level)
1694 /* Note, a successful atomic zap also does a remote TLB flush. */
1695 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1703 * Zap non-leaf SPTEs (and free their associated page tables) which could
1704 * be replaced by huge pages, for GFNs within the slot.
1706 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1707 const struct kvm_memory_slot *slot)
1709 struct kvm_mmu_page *root;
/* Runs under mmu_lock held for read; zaps use atomic SPTE updates. */
1711 lockdep_assert_held_read(&kvm->mmu_lock);
1713 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1714 zap_collapsible_spte_range(kvm, root, slot);
1718 * Removes write access on the last level SPTE mapping this GFN and unsets the
1719 * MMU-writable bit to ensure future writes continue to be intercepted.
1720 * Returns true if an SPTE was set and a TLB flush is needed.
1722 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1723 gfn_t gfn, int min_level)
1725 struct tdp_iter iter;
1727 bool spte_set = false;
1729 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
/* Walk just the single GFN; only present leaf SPTEs are write-protected. */
1733 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1734 if (!is_shadow_present_pte(iter.old_spte) ||
1735 !is_last_spte(iter.old_spte, iter.level))
/* Clear both the hardware-writable and MMU-writable bits. */
1738 new_spte = iter.old_spte &
1739 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1741 if (new_spte == iter.old_spte)
/* Non-atomic update: this path runs with mmu_lock held for write. */
1744 tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1754 * Removes write access on the last level SPTE mapping this GFN and unsets the
1755 * MMU-writable bit to ensure future writes continue to be intercepted.
1756 * Returns true if an SPTE was set and a TLB flush is needed.
1758 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1759 struct kvm_memory_slot *slot, gfn_t gfn,
1762 struct kvm_mmu_page *root;
1763 bool spte_set = false;
/* Write lock required: write_protect_gfn() updates SPTEs non-atomically. */
1765 lockdep_assert_held_write(&kvm->mmu_lock);
1766 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1767 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1773 * Return the level of the lowest level SPTE added to sptes.
1774 * That SPTE may be non-present.
1776 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1778 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1781 struct tdp_iter iter;
1782 struct kvm_mmu *mmu = vcpu->arch.mmu;
1783 gfn_t gfn = addr >> PAGE_SHIFT;
1786 *root_level = vcpu->arch.mmu->root_role.level;
/* Record each SPTE encountered on the walk from root toward the leaf. */
1788 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1790 sptes[leaf] = iter.old_spte;
1797 * Returns the last level spte pointer of the shadow page walk for the given
1798 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1799 * walk could be performed, returns NULL and *spte does not contain valid data.
1802 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1803 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1805 * WARNING: This function is only intended to be called during fast_page_fault.
1807 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1810 struct tdp_iter iter;
1811 struct kvm_mmu *mmu = vcpu->arch.mmu;
1812 gfn_t gfn = addr >> PAGE_SHIFT;
/* NULL sptep is returned unchanged if the walk cannot reach a last level. */
1813 tdp_ptep_t sptep = NULL;
/* Walk to the target GFN, tracking the most recent SPTE value and pointer. */
1815 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1816 *spte = iter.old_spte;
1821 * Perform the rcu_dereference to get the raw spte pointer value since
1822 * we are passing it up to fast_page_fault, which is shared with the
1823 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1826 * This is safe since fast_page_fault obeys the contracts of this
1827 * function as well as all TDP MMU contracts around modifying SPTEs
1828 * outside of mmu_lock.
1830 return rcu_dereference(sptep);