1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3 
4 #include "mmu.h"
5 #include "mmu_internal.h"
6 #include "mmutrace.h"
7 #include "tdp_iter.h"
8 #include "tdp_mmu.h"
9 #include "spte.h"
10 
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
13 
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16 {
17 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
19 }
20 
21 /* Arbitrarily returns true so that this may be used in if statements. */
22 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
23 							     bool shared)
24 {
25 	if (shared)
26 		lockdep_assert_held_read(&kvm->mmu_lock);
27 	else
28 		lockdep_assert_held_write(&kvm->mmu_lock);
29 
30 	return true;
31 }
32 
33 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34 {
35 	/*
36 	 * Invalidate all roots, which besides the obvious, schedules all roots
37 	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38 	 * ultimately frees all roots.
39 	 */
40 	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
41 	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
42 
43 #ifdef CONFIG_KVM_PROVE_MMU
44 	KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
45 #endif
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
47 
48 	/*
49 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
50 	 * can run before the VM is torn down.  Putting the last reference to
51 	 * zapped roots will create new callbacks.
52 	 */
53 	rcu_barrier();
54 }
55 
56 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
57 {
58 	free_page((unsigned long)sp->external_spt);
59 	free_page((unsigned long)sp->spt);
60 	kmem_cache_free(mmu_page_header_cache, sp);
61 }
62 
63 /*
64  * This is called through call_rcu in order to free TDP page table memory
65  * safely with respect to other kernel threads that may be operating on
66  * the memory.
67  * By only accessing TDP MMU page table memory in an RCU read critical
68  * section, and freeing it after a grace period, lockless access to that
69  * memory won't use it after it is freed.
70  */
71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72 {
73 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74 					       rcu_head);
75 
76 	tdp_mmu_free_sp(sp);
77 }
78 
79 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
80 {
81 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
82 		return;
83 
84 	/*
85 	 * The TDP MMU itself holds a reference to each root until the root is
86 	 * explicitly invalidated, i.e. the final reference should never be
87 	 * put for a valid root.
88 	 */
89 	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
90 
91 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92 	list_del_rcu(&root->link);
93 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
94 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
95 }
96 
97 static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
98 			       enum kvm_tdp_mmu_root_types types)
99 {
100 	if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
101 		return false;
102 
103 	if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
104 		return false;
105 
106 	if (likely(!is_mirror_sp(root)))
107 		return types & KVM_DIRECT_ROOTS;
108 	return types & KVM_MIRROR_ROOTS;
109 }
110 
111 /*
112  * Returns the next root after @prev_root (or the first root if @prev_root is
113  * NULL) that matches with @types.  A reference to the returned root is
114  * acquired, and the reference to @prev_root is released (the caller obviously
115  * must hold a reference to @prev_root if it's non-NULL).
116  *
117  * Roots that don't match @types are skipped.
118  *
119  * Returns NULL if the end of tdp_mmu_roots was reached.
120  */
121 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
122 					      struct kvm_mmu_page *prev_root,
123 					      enum kvm_tdp_mmu_root_types types)
124 {
125 	struct kvm_mmu_page *next_root;
126 
127 	/*
128 	 * While the roots themselves are RCU-protected, fields such as
129 	 * role.invalid are protected by mmu_lock.
130 	 */
131 	lockdep_assert_held(&kvm->mmu_lock);
132 
133 	rcu_read_lock();
134 
135 	if (prev_root)
136 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
137 						  &prev_root->link,
138 						  typeof(*prev_root), link);
139 	else
140 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
141 						   typeof(*next_root), link);
142 
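	/*
	 * Skip roots that don't match @types, as well as roots that can no
	 * longer be referenced, i.e. for which kvm_tdp_mmu_get_root() fails
	 * because the last reference has already been put.
	 */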
143 	while (next_root) {
144 		if (tdp_mmu_root_match(next_root, types) &&
145 		    kvm_tdp_mmu_get_root(next_root))
146 			break;
147 
148 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
149 				&next_root->link, typeof(*next_root), link);
150 	}
151 
152 	rcu_read_unlock();
153 
154 	if (prev_root)
155 		kvm_tdp_mmu_put_root(kvm, prev_root);
156 
157 	return next_root;
158 }
159 
160 /*
161  * Note: this iterator gets and puts references to the roots it iterates over.
162  * This makes it safe to release the MMU lock and yield within the loop, but
163  * if exiting the loop early, the caller must drop the reference to the most
164  * recent root. (Unless keeping a live reference is desirable.)
165  *
166  * If shared is set, this function is operating under the MMU lock in read
167  * mode.
168  */
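/*
 * Note, the empty "if { } else" in the macro body below filters out roots
 * from other address spaces while still allowing the macro to be used as a
 * single statement, i.e. without dangling-else surprises at call sites.
 */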
169 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)	\
170 	for (_root = tdp_mmu_next_root(_kvm, NULL, _types);		\
171 	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;		\
172 	     _root = tdp_mmu_next_root(_kvm, _root, _types))		\
173 		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
174 		} else
175 
176 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
177 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
178 
179 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
180 	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);		\
181 	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
182 	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
183 
184 /*
185  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
186  * the implication being that any flow that holds mmu_lock for read is
187  * inherently yield-friendly and should use the yield-safe variant above.
188  * Holding mmu_lock for write obviates the need for RCU protection as the list
189  * is guaranteed to be stable.
190  */
191 #define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)			\
192 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
193 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
194 		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
195 		     !tdp_mmu_root_match((_root), (_types)))) {			\
196 		} else
197 
198 /*
199  * Iterate over all TDP MMU roots in an RCU read-side critical section.
200  * It is safe to iterate over the SPTEs under the root, but their values will
201  * be unstable, so all writes must be atomic. As this routine is meant to be
202  * used without holding the mmu_lock at all, any bits that are flipped must
203  * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
204  */
205 #define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types)			\
206 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
207 		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
208 		    !tdp_mmu_root_match((_root), (_types))) {			\
209 		} else
210 
211 #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
212 	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
213 
214 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
215 {
216 	struct kvm_mmu_page *sp;
217 
218 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
219 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
220 
221 	return sp;
222 }
223 
224 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
225 			    gfn_t gfn, union kvm_mmu_page_role role)
226 {
227 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
228 
229 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
230 
231 	sp->role = role;
232 	sp->gfn = gfn;
233 	sp->ptep = sptep;
234 	sp->tdp_mmu_page = true;
235 
236 	trace_kvm_mmu_get_page(sp, true);
237 }
238 
239 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
240 				  struct tdp_iter *iter)
241 {
242 	struct kvm_mmu_page *parent_sp;
243 	union kvm_mmu_page_role role;
244 
245 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
246 
247 	role = parent_sp->role;
248 	role.level--;
249 
250 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
251 }
252 
253 void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
254 {
255 	struct kvm_mmu *mmu = vcpu->arch.mmu;
256 	union kvm_mmu_page_role role = mmu->root_role;
257 	int as_id = kvm_mmu_role_as_id(role);
258 	struct kvm *kvm = vcpu->kvm;
259 	struct kvm_mmu_page *root;
260 
261 	if (mirror)
262 		role.is_mirror = true;
263 
264 	/*
265 	 * Check for an existing root before acquiring the pages lock to avoid
266 	 * unnecessary serialization if multiple vCPUs are loading a new root.
267 	 * E.g. when bringing up secondary vCPUs, KVM will already have created
268 	 * a valid root on behalf of the primary vCPU.
269 	 */
270 	read_lock(&kvm->mmu_lock);
271 
272 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
273 		if (root->role.word == role.word)
274 			goto out_read_unlock;
275 	}
276 
277 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
278 
279 	/*
280 	 * Recheck for an existing root after acquiring the pages lock, another
281 	 * vCPU may have raced ahead and created a new usable root.  Manually
282 	 * walk the list of roots as the standard macros assume that the pages
283 	 * lock is *not* held.  WARN if grabbing a reference to a usable root
284 	 * fails, as the last reference to a root can only be put *after* the
285 	 * root has been invalidated, which requires holding mmu_lock for write.
286 	 */
287 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
288 		if (root->role.word == role.word &&
289 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
290 			goto out_spin_unlock;
291 	}
292 
293 	root = tdp_mmu_alloc_sp(vcpu);
294 	tdp_mmu_init_sp(root, NULL, 0, role);
295 
296 	/*
297 	 * TDP MMU roots are kept until they are explicitly invalidated, either
298 	 * by a memslot update or by the destruction of the VM.  Initialize the
299 	 * refcount to two; one reference for the vCPU, and one reference for
300 	 * the TDP MMU itself, which is held until the root is invalidated and
301 	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
302 	 */
303 	refcount_set(&root->tdp_mmu_root_count, 2);
304 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
305 
306 out_spin_unlock:
307 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
308 out_read_unlock:
309 	read_unlock(&kvm->mmu_lock);
310 	/*
311 	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
312 	 * and actually consuming the root if it's invalidated after dropping
313 	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
314 	 */
315 	if (mirror) {
316 		mmu->mirror_root_hpa = __pa(root->spt);
317 	} else {
318 		mmu->root.hpa = __pa(root->spt);
319 		mmu->root.pgd = 0;
320 	}
321 }
322 
323 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
324 				u64 old_spte, u64 new_spte, int level,
325 				bool shared);
326 
327 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
328 {
329 	kvm_account_pgtable_pages((void *)sp->spt, +1);
330 #ifdef CONFIG_KVM_PROVE_MMU
331 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
332 #endif
333 }
334 
335 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
336 {
337 	kvm_account_pgtable_pages((void *)sp->spt, -1);
338 #ifdef CONFIG_KVM_PROVE_MMU
339 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
340 #endif
341 }
342 
343 /**
344  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
345  *
346  * @kvm: kvm instance
347  * @sp: the page to be removed
348  */
349 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
350 {
351 	tdp_unaccount_mmu_page(kvm, sp);
352 
353 	if (!sp->nx_huge_page_disallowed)
354 		return;
355 
356 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
357 	sp->nx_huge_page_disallowed = false;
358 	untrack_possible_nx_huge_page(kvm, sp);
359 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
360 }
361 
362 static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
363 				 int level)
364 {
365 	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
366 	int ret;
367 
368 	/*
369 	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
370 	 * PTs are removed in a special order, involving free_external_spt().
371 	 * But remove_external_spte() will be called on non-leaf PTEs via
372 	 * __tdp_mmu_zap_root(), so avoid the error the former would return
373 	 * in this case.
374 	 */
375 	if (!is_last_spte(old_spte, level))
376 		return;
377 
378 	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
379 	lockdep_assert_held_write(&kvm->mmu_lock);
380 	/* Because the write lock is held, the operation should succeed. */
381 	ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
382 	KVM_BUG_ON(ret, kvm);
383 }
384 
385 /**
386  * handle_removed_pt() - handle a page table removed from the TDP structure
387  *
388  * @kvm: kvm instance
389  * @pt: the page removed from the paging structure
390  * @shared: This operation may not be running under the exclusive use
391  *	    of the MMU lock and the operation must synchronize with other
392  *	    threads that might be modifying SPTEs.
393  *
394  * Given a page table that has been removed from the TDP paging structure,
395  * iterates through the page table to clear SPTEs and free child page tables.
396  *
397  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
398  * protection. Since this thread removed it from the paging structure,
399  * this thread will be responsible for ensuring the page is freed. Hence the
400  * early rcu_dereferences in the function.
401  */
402 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
403 {
404 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
405 	int level = sp->role.level;
406 	gfn_t base_gfn = sp->gfn;
407 	int i;
408 
409 	trace_kvm_mmu_prepare_zap_page(sp);
410 
411 	tdp_mmu_unlink_sp(kvm, sp);
412 
413 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
414 		tdp_ptep_t sptep = pt + i;
415 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
416 		u64 old_spte;
417 
418 		if (shared) {
419 			/*
420 			 * Set the SPTE to a nonpresent value that other
421 			 * threads will not overwrite. If the SPTE was
422 			 * already marked as frozen, then another thread
423 			 * handling a page fault could overwrite it, so keep
424 			 * setting the SPTE until it transitions from some
425 			 * other value to the frozen SPTE value.
426 			 */
427 			for (;;) {
428 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
429 				if (!is_frozen_spte(old_spte))
430 					break;
431 				cpu_relax();
432 			}
433 		} else {
434 			/*
435 			 * If the SPTE is not MMU-present, there is no backing
436 			 * page associated with the SPTE and so no side effects
437 			 * that need to be recorded, and exclusive ownership of
438 			 * mmu_lock ensures the SPTE can't be made present.
439 			 * Note, zapping MMIO SPTEs is also unnecessary as they
440 			 * are guarded by the memslots generation, not by being
441 			 * unreachable.
442 			 */
443 			old_spte = kvm_tdp_mmu_read_spte(sptep);
444 			if (!is_shadow_present_pte(old_spte))
445 				continue;
446 
447 			/*
448 			 * Use the common helper instead of a raw WRITE_ONCE as
449 			 * the SPTE needs to be updated atomically if it can be
450 			 * modified by a different vCPU outside of mmu_lock.
451 			 * Even though the parent SPTE is !PRESENT, the TLB
452 			 * hasn't yet been flushed, and both Intel and AMD
453 			 * document that A/D assists can use upper-level PxE
454 			 * entries that are cached in the TLB, i.e. the CPU can
455 			 * still access the page and mark it dirty.
456 			 *
457 			 * No retry is needed in the atomic update path as the
458 			 * sole concern is dropping a Dirty bit, i.e. no other
459 			 * task can zap/remove the SPTE as mmu_lock is held for
460 			 * write.  Marking the SPTE as a frozen SPTE is not
461 			 * strictly necessary for the same reason, but using
462 			 * the frozen SPTE value keeps the shared/exclusive
463 			 * paths consistent and allows the handle_changed_spte()
464 			 * call below to hardcode the new value to FROZEN_SPTE.
465 			 *
466 			 * Note, even though dropping a Dirty bit is the only
467 			 * scenario where a non-atomic update could result in a
468 			 * functional bug, simply checking the Dirty bit isn't
469 			 * sufficient as a fast page fault could read the upper
470 			 * level SPTE before it is zapped, and then make this
471 			 * target SPTE writable, resume the guest, and set the
472 			 * Dirty bit between reading the SPTE above and writing
473 			 * it here.
474 			 */
475 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
476 							  FROZEN_SPTE, level);
477 		}
478 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
479 				    old_spte, FROZEN_SPTE, level, shared);
480 
481 		if (is_mirror_sp(sp)) {
482 			KVM_BUG_ON(shared, kvm);
483 			remove_external_spte(kvm, gfn, old_spte, level);
484 		}
485 	}
486 
487 	if (is_mirror_sp(sp) &&
488 	    WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
489 							  sp->external_spt))) {
490 		/*
491 		 * Failed to free the page table page in the mirror page
492 		 * table, and there is nothing further to do.
493 		 * Intentionally leak the page to prevent the kernel from
494 		 * accessing the encrypted page.
495 		 */
496 		sp->external_spt = NULL;
497 	}
498 
499 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
500 }
501 
502 static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
503 {
504 	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
505 		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
506 
507 		WARN_ON_ONCE(sp->role.level + 1 != level);
508 		WARN_ON_ONCE(sp->gfn != gfn);
509 		return sp->external_spt;
510 	}
511 
512 	return NULL;
513 }
514 
515 static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
516 						 gfn_t gfn, u64 old_spte,
517 						 u64 new_spte, int level)
518 {
519 	bool was_present = is_shadow_present_pte(old_spte);
520 	bool is_present = is_shadow_present_pte(new_spte);
521 	bool is_leaf = is_present && is_last_spte(new_spte, level);
522 	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
523 	int ret = 0;
524 
525 	KVM_BUG_ON(was_present, kvm);
526 
527 	lockdep_assert_held(&kvm->mmu_lock);
528 	/*
529 	 * We need to lock out other updates to the SPTE until the external
530 	 * page table has been modified. Use FROZEN_SPTE similar to
531 	 * the zapping case.
532 	 */
533 	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
534 		return -EBUSY;
535 
536 	/*
537 	 * Use a different call to set up either a middle-level external
538 	 * page table or a leaf mapping.
539 	 */
540 	if (is_leaf) {
541 		ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
542 	} else {
543 		void *external_spt = get_external_spt(gfn, new_spte, level);
544 
545 		KVM_BUG_ON(!external_spt, kvm);
546 		ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
547 	}
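	/*
	 * Unfreeze the SPTE: restore the old value if updating the external
	 * page table failed, otherwise publish the new value.
	 */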
548 	if (ret)
549 		__kvm_tdp_mmu_write_spte(sptep, old_spte);
550 	else
551 		__kvm_tdp_mmu_write_spte(sptep, new_spte);
552 	return ret;
553 }
554 
555 /**
556  * handle_changed_spte - handle bookkeeping associated with an SPTE change
557  * @kvm: kvm instance
558  * @as_id: the address space of the paging structure the SPTE was a part of
559  * @gfn: the base GFN that was mapped by the SPTE
560  * @old_spte: The value of the SPTE before the change
561  * @new_spte: The value of the SPTE after the change
562  * @level: the level of the PT the SPTE is part of in the paging structure
563  * @shared: This operation may not be running under the exclusive use of
564  *	    the MMU lock and the operation must synchronize with other
565  *	    threads that might be modifying SPTEs.
566  *
567  * Handle bookkeeping that might result from the modification of a SPTE.  Note,
568  * dirty logging updates are handled in common code, not here (see make_spte()
569  * and fast_pf_fix_direct_spte()).
570  */
571 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
572 				u64 old_spte, u64 new_spte, int level,
573 				bool shared)
574 {
575 	bool was_present = is_shadow_present_pte(old_spte);
576 	bool is_present = is_shadow_present_pte(new_spte);
577 	bool was_leaf = was_present && is_last_spte(old_spte, level);
578 	bool is_leaf = is_present && is_last_spte(new_spte, level);
579 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
580 
581 	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
582 	WARN_ON_ONCE(level < PG_LEVEL_4K);
583 	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
584 
585 	/*
586 	 * If this warning were to trigger it would indicate that there was a
587 	 * missing MMU notifier or a race with some notifier handler.
588 	 * A present, leaf SPTE should never be directly replaced with another
589 	 * present leaf SPTE pointing to a different PFN. A notifier handler
590 	 * should be zapping the SPTE before the main MM's page table is
591 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
592 	 * thread before replacement.
593 	 */
594 	if (was_leaf && is_leaf && pfn_changed) {
595 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
596 		       "SPTE with another present leaf SPTE mapping a\n"
597 		       "different PFN!\n"
598 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
599 		       as_id, gfn, old_spte, new_spte, level);
600 
601 		/*
602 		 * Crash the host to prevent error propagation and guest data
603 		 * corruption.
604 		 */
605 		BUG();
606 	}
607 
608 	if (old_spte == new_spte)
609 		return;
610 
611 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
612 
613 	if (is_leaf)
614 		check_spte_writable_invariants(new_spte);
615 
616 	/*
617 	 * The only times a SPTE should be changed from a non-present to
618 	 * non-present state is when an MMIO entry is installed/modified/
619 	 * removed. In that case, there is nothing to do here.
620 	 */
621 	if (!was_present && !is_present) {
622 		/*
623 		 * If this change does not involve a MMIO SPTE or frozen SPTE,
624 		 * it is unexpected. Log the change, though it should not
625 		 * impact the guest since both the former and current SPTEs
626 		 * are nonpresent.
627 		 */
628 		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
629 				 !is_mmio_spte(kvm, new_spte) &&
630 				 !is_frozen_spte(new_spte)))
631 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
632 			       "should not be replaced with another,\n"
633 			       "different nonpresent SPTE, unless one or both\n"
634 			       "are MMIO SPTEs, or the new SPTE is\n"
635 			       "a temporary frozen SPTE.\n"
636 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
637 			       as_id, gfn, old_spte, new_spte, level);
638 		return;
639 	}
640 
641 	if (is_leaf != was_leaf)
642 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
643 
644 	/*
645 	 * Recursively handle child PTs if the change removed a subtree from
646 	 * the paging structure.  Note the WARN on the PFN changing without the
647 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
648 	 * pages are kernel allocations and should never be migrated.
649 	 */
650 	if (was_present && !was_leaf &&
651 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
652 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
653 }
654 
655 static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
656 							 struct tdp_iter *iter,
657 							 u64 new_spte)
658 {
659 	/*
660 	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
661 	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
662 	 * and pre-checking before inserting a new SPTE is advantageous as it
663 	 * avoids unnecessary work.
664 	 */
665 	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
666 
667 	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
668 		int ret;
669 
670 		/*
671 		 * Users of atomic zapping don't operate on mirror roots,
672 		 * so don't handle it and bug the VM if it's seen.
673 		 */
674 		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
675 			return -EBUSY;
676 
677 		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
678 						iter->old_spte, new_spte, iter->level);
679 		if (ret)
680 			return ret;
681 	} else {
682 		u64 *sptep = rcu_dereference(iter->sptep);
683 
684 		/*
685 		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
686 		 * and does not hold the mmu_lock.  On failure, i.e. if a
687 		 * different logical CPU modified the SPTE, try_cmpxchg64()
688 		 * updates iter->old_spte with the current value, so the caller
689 		 * operates on fresh data, e.g. if it retries
690 		 * tdp_mmu_set_spte_atomic().
691 		 */
692 		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
693 			return -EBUSY;
694 	}
695 
696 	return 0;
697 }
698 
699 /*
700  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
701  * and handle the associated bookkeeping.  Do not mark the page dirty
702  * in KVM's dirty bitmaps.
703  *
704  * If setting the SPTE fails because it has changed, iter->old_spte will be
705  * refreshed to the current value of the spte.
706  *
707  * @kvm: kvm instance
708  * @iter: a tdp_iter instance currently on the SPTE that should be set
709  * @new_spte: The value the SPTE should be set to
710  * Return:
711  * * 0      - If the SPTE was set.
712  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
713  *            no side-effects other than setting iter->old_spte to the last
714  *            known value of the spte.
715  */
716 static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
717 						       struct tdp_iter *iter,
718 						       u64 new_spte)
719 {
720 	int ret;
721 
722 	lockdep_assert_held_read(&kvm->mmu_lock);
723 
724 	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
725 	if (ret)
726 		return ret;
727 
728 	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
729 			    new_spte, iter->level, true);
730 
731 	return 0;
732 }
733 
734 /*
735  * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
736  * @kvm:	      KVM instance
737  * @as_id:	      Address space ID, i.e. regular vs. SMM
738  * @sptep:	      Pointer to the SPTE
739  * @old_spte:	      The current value of the SPTE
740  * @new_spte:	      The new value that will be set for the SPTE
741  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
742  * @level:	      The level _containing_ the SPTE (its parent PT's level)
743  *
744  * Returns the old SPTE value, which _may_ be different than @old_spte if the
745  * SPTE had volatile bits.
746  */
747 static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
748 			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
749 {
750 	lockdep_assert_held_write(&kvm->mmu_lock);
751 
752 	/*
753 	 * No thread should be using this function to set SPTEs to or from the
754 	 * temporary frozen SPTE value.
755 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
756 	 * should be used. If operating under the MMU lock in write mode, the
757 	 * use of the frozen SPTE should not be necessary.
758 	 */
759 	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
760 
761 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
762 
763 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
764 
765 	/*
766 	 * Users that do non-atomic setting of PTEs don't operate on mirror
767 	 * roots, so don't handle it and bug the VM if it's seen.
768 	 */
769 	if (is_mirror_sptep(sptep)) {
770 		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
771 		remove_external_spte(kvm, gfn, old_spte, level);
772 	}
773 
774 	return old_spte;
775 }
776 
777 static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
778 					 u64 new_spte)
779 {
780 	WARN_ON_ONCE(iter->yielded);
781 	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
782 					  iter->old_spte, new_spte,
783 					  iter->gfn, iter->level);
784 }
785 
786 #define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
787 	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
788 
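/*
 * Variant of tdp_root_for_each_pte() that visits only present leaf SPTEs; the
 * "continue; else" construct keeps the macro usable as a single statement.
 */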
789 #define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
790 	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
791 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
792 		    !is_last_spte(_iter.old_spte, _iter.level))		\
793 			continue;					\
794 		else
795 
796 static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
797 							  struct tdp_iter *iter)
798 {
799 	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
800 		return false;
801 
802 	/* Ensure forward progress has been made before yielding. */
803 	return iter->next_last_level_gfn != iter->yielded_gfn;
804 }
805 
806 /*
807  * Yield if the MMU lock is contended or this thread needs to return control
808  * to the scheduler.
809  *
810  * If this function should yield and flush is set, it will perform a remote
811  * TLB flush before yielding.
812  *
813  * If this function yields, iter->yielded is set and the caller must skip to
814  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
815  * over the paging structures to allow the iterator to continue its traversal
816  * from the paging structure root.
817  *
818  * Returns true if this function yielded.
819  */
820 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
821 							  struct tdp_iter *iter,
822 							  bool flush, bool shared)
823 {
824 	KVM_MMU_WARN_ON(iter->yielded);
825 
826 	if (!tdp_mmu_iter_need_resched(kvm, iter))
827 		return false;
828 
829 	if (flush)
830 		kvm_flush_remote_tlbs(kvm);
831 
832 	rcu_read_unlock();
833 
834 	if (shared)
835 		cond_resched_rwlock_read(&kvm->mmu_lock);
836 	else
837 		cond_resched_rwlock_write(&kvm->mmu_lock);
838 
839 	rcu_read_lock();
840 
841 	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
842 
843 	iter->yielded = true;
844 	return true;
845 }
846 
847 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
848 {
849 	/*
850 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
851 	 * a gpa range that would exceed the max gfn, and KVM does not create
852 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
853 	 * the slow emulation path every time.
854 	 */
855 	return kvm_mmu_max_gfn() + 1;
856 }
857 
858 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
859 			       bool shared, int zap_level)
860 {
861 	struct tdp_iter iter;
862 
863 	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
864 retry:
865 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
866 			continue;
867 
868 		if (!is_shadow_present_pte(iter.old_spte))
869 			continue;
870 
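		/*
		 * Only zap SPTEs at or below the current pass's level; SPTEs
		 * at higher levels are left intact so that this pass can step
		 * down to their children, and are zapped by a later pass.
		 */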
871 		if (iter.level > zap_level)
872 			continue;
873 
874 		if (!shared)
875 			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
876 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
877 			goto retry;
878 	}
879 }
880 
881 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
882 			     bool shared)
883 {
884 
885 	/*
886 	 * The root must have an elevated refcount so that it's reachable via
887 	 * mmu_notifier callbacks, which allows this path to yield and drop
888 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
889 	 * must drop all references to relevant pages prior to completing the
890 	 * callback.  Dropping mmu_lock with an unreachable root would result
891 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
892 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
893 	 * dirty accessed bits to the SPTE's associated struct page.
894 	 */
895 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
896 
897 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
898 
899 	rcu_read_lock();
900 
901 	/*
902 	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
903 	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
904 	 * preempt models) or mmu_lock contention (full or real-time models).
905 	 * Zapping at finer granularity marginally increases the total time of
906 	 * the zap, but in most cases the zap itself isn't latency sensitive.
907 	 *
908 	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
909 	 * in order to mimic the page fault path, which can replace a 1GiB page
910 	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
911 	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
912 	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
913 	 * inducing RCU stalls, without relying on a relatively rare event
914 	 * (zapping roots is orders of magnitude more common).  Note, because
915 	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
916 	 * in the iterator itself is unnecessary.
917 	 */
918 	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
919 		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
920 		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
921 	}
922 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
923 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
924 
925 	rcu_read_unlock();
926 }
927 
928 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
929 {
930 	u64 old_spte;
931 
932 	/*
933 	 * This helper intentionally doesn't allow zapping a root shadow page,
934 	 * which doesn't have a parent page table and thus no associated entry.
935 	 */
936 	if (WARN_ON_ONCE(!sp->ptep))
937 		return false;
938 
939 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
940 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
941 		return false;
942 
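	/*
	 * Note, the SPTE that points at this shadow page lives in the parent
	 * page table, hence zapping at sp->role.level + 1.
	 */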
943 	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
944 			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
945 
946 	return true;
947 }
948 
949 /*
950  * If can_yield is true, will release the MMU lock and reschedule if the
951  * scheduler needs the CPU or there is contention on the MMU lock. If this
952  * function cannot yield, it will not release the MMU lock or reschedule and
953  * the caller must ensure it does not supply too large a GFN range, or the
954  * operation can cause a soft lockup.
955  */
956 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
957 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
958 {
959 	struct tdp_iter iter;
960 
961 	end = min(end, tdp_mmu_max_gfn_exclusive());
962 
963 	lockdep_assert_held_write(&kvm->mmu_lock);
964 
965 	rcu_read_lock();
966 
967 	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
968 		if (can_yield &&
969 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
970 			flush = false;
971 			continue;
972 		}
973 
974 		if (!is_shadow_present_pte(iter.old_spte) ||
975 		    !is_last_spte(iter.old_spte, iter.level))
976 			continue;
977 
978 		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
979 
980 		/*
981 		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
982 		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
983 		 */
984 		if (!root->role.invalid)
985 			flush = true;
986 	}
987 
988 	rcu_read_unlock();
989 
990 	/*
991 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
992 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
993 	 */
994 	return flush;
995 }
996 
997 /*
998  * Zap leaf SPTEs for the range of gfns, [start, end), for all *valid* roots.
999  * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
1000  * one or more SPTEs were zapped since the MMU lock was last acquired.
1001  */
1002 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
1003 {
1004 	struct kvm_mmu_page *root;
1005 
1006 	lockdep_assert_held_write(&kvm->mmu_lock);
1007 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
1008 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
1009 
1010 	return flush;
1011 }
1012 
1013 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
1014 {
1015 	struct kvm_mmu_page *root;
1016 
1017 	/*
1018 	 * Zap all direct roots, including invalid direct roots, as all direct
1019 	 * SPTEs must be dropped before returning to the caller. For TDX, mirror
1020 	 * roots don't need handling in response to the mmu notifier (the caller).
1021 	 *
1022 	 * Zap directly even if the root is also being zapped by a concurrent
1023 	 * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
1024 	 * and mmu_lock is already held, which means the other thread has yielded.
1025 	 *
1026 	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
1027 	 * is being destroyed or the userspace VMM has exited.  In both cases,
1028 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1029 	 */
1030 	lockdep_assert_held_write(&kvm->mmu_lock);
1031 	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
1032 					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
1033 		tdp_mmu_zap_root(kvm, root, false);
1034 }
1035 
1036 /*
1037  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1038  * zap" completes.
1039  */
1040 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
1041 {
1042 	struct kvm_mmu_page *root;
1043 
1044 	if (shared)
1045 		read_lock(&kvm->mmu_lock);
1046 	else
1047 		write_lock(&kvm->mmu_lock);
1048 
1049 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1050 		if (!root->tdp_mmu_scheduled_root_to_zap)
1051 			continue;
1052 
1053 		root->tdp_mmu_scheduled_root_to_zap = false;
1054 		KVM_BUG_ON(!root->role.invalid, kvm);
1055 
1056 		/*
1057 		 * A TLB flush is not necessary as KVM performs a local TLB
1058 		 * flush when allocating a new root (see kvm_mmu_load()), and
1059 		 * when migrating a vCPU to a different pCPU.  Note, the local
1060 		 * TLB flush on reuse also invalidates paging-structure-cache
1061 		 * entries, i.e. TLB entries for intermediate paging structures,
1062 		 * that may be zapped, as such entries are associated with the
1063 		 * ASID on both VMX and SVM.
1064 		 */
1065 		tdp_mmu_zap_root(kvm, root, shared);
1066 
1067 		/*
1068 		 * The reference needs to be put *after* zapping the root, as
1069 		 * the root must be reachable by mmu_notifiers while it's being
1070 		 * zapped.
1071 		 */
1072 		kvm_tdp_mmu_put_root(kvm, root);
1073 	}
1074 
1075 	if (shared)
1076 		read_unlock(&kvm->mmu_lock);
1077 	else
1078 		write_unlock(&kvm->mmu_lock);
1079 }
1080 
1081 /*
1082  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1083  * is about to be zapped, e.g. in response to a memslots update.  The actual
1084  * zapping is done separately so that it happens with mmu_lock held for read,
1085  * whereas invalidating roots must be done with mmu_lock held for write (unless
1086  * the VM is being destroyed).
1087  *
1088  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
1089  * See kvm_tdp_mmu_alloc_root().
1090  */
1091 void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
1092 				  enum kvm_tdp_mmu_root_types root_types)
1093 {
1094 	struct kvm_mmu_page *root;
1095 
1096 	/*
1097 	 * Invalidating invalid roots doesn't make sense; prevent developers from
1098 	 * having to think about it.
1099 	 */
1100 	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
1101 		root_types &= ~KVM_INVALID_ROOTS;
1102 
1103 	/*
1104 	 * mmu_lock must be held for write to ensure that a root doesn't become
1105 	 * invalid while there are active readers (invalidating a root while
1106 	 * there are active readers may or may not be problematic in practice,
1107 	 * but it's uncharted territory and not supported).
1108 	 *
1109 	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
1110 	 * being destroyed after all references have been put, or if no vCPUs
1111 	 * have been created (which means there are no roots), i.e. the VM is
1112 	 * being destroyed in an error path of KVM_CREATE_VM.
1113 	 */
1114 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
1115 	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
1116 		lockdep_assert_held_write(&kvm->mmu_lock);
1117 
1118 	/*
1119 	 * As above, mmu_lock isn't held when destroying the VM!  There can't
1120 	 * be other references to @kvm, i.e. nothing else can invalidate roots
1121 	 * or get/put references to roots.
1122 	 */
1123 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1124 		if (!tdp_mmu_root_match(root, root_types))
1125 			continue;
1126 
1127 		/*
1128 		 * Note, invalid roots can outlive a memslot update!  Invalid
1129 		 * roots must be *zapped* before the memslot update completes,
1130 		 * but a different task can acquire a reference and keep the
1131 		 * root alive after it's been zapped.
1132 		 */
1133 		if (!root->role.invalid) {
1134 			root->tdp_mmu_scheduled_root_to_zap = true;
1135 			root->role.invalid = true;
1136 		}
1137 	}
1138 }
1139 
1140 /*
1141  * Installs a last-level SPTE to handle a TDP page fault.
1142  * (NPT/EPT violation/misconfiguration)
1143  */
1144 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1145 					  struct kvm_page_fault *fault,
1146 					  struct tdp_iter *iter)
1147 {
1148 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1149 	u64 new_spte;
1150 	int ret = RET_PF_FIXED;
1151 	bool wrprot = false;
1152 
1153 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1154 		return RET_PF_RETRY;
1155 
1156 	if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
1157 		return RET_PF_SPURIOUS;
1158 
1159 	if (is_shadow_present_pte(iter->old_spte) &&
1160 	    is_access_allowed(fault, iter->old_spte) &&
1161 	    is_last_spte(iter->old_spte, iter->level))
1162 		return RET_PF_SPURIOUS;
1163 
1164 	if (unlikely(!fault->slot))
1165 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1166 	else
1167 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1168 				   fault->pfn, iter->old_spte, fault->prefetch,
1169 				   false, fault->map_writable, &new_spte);
1170 
1171 	if (new_spte == iter->old_spte)
1172 		ret = RET_PF_SPURIOUS;
1173 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1174 		return RET_PF_RETRY;
1175 	else if (is_shadow_present_pte(iter->old_spte) &&
1176 		 (!is_last_spte(iter->old_spte, iter->level) ||
1177 		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
1178 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1179 
1180 	/*
1181 	 * If the page fault was caused by a write but the page is write
1182 	 * protected, emulation is needed. If the emulation was skipped,
1183 	 * the vCPU would have the same fault again.
1184 	 */
1185 	if (wrprot && fault->write)
1186 		ret = RET_PF_WRITE_PROTECTED;
1187 
1188 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1189 	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
1190 		vcpu->stat.pf_mmio_spte_created++;
1191 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1192 				     new_spte);
1193 		ret = RET_PF_EMULATE;
1194 	} else {
1195 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1196 				       rcu_dereference(iter->sptep));
1197 	}
1198 
1199 	return ret;
1200 }
1201 
1202 /*
1203  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1204  * provided page table.
1205  *
1206  * @kvm: kvm instance
1207  * @iter: a tdp_iter instance currently on the SPTE that should be set
1208  * @sp: The new TDP page table to install.
1209  * @shared: This operation is running under the MMU lock in read mode.
1210  *
1211  * Returns: 0 if the new page table was installed. Non-0 if the page table
1212  *          could not be installed (e.g. the atomic compare-exchange failed).
1213  */
1214 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1215 			   struct kvm_mmu_page *sp, bool shared)
1216 {
1217 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
1218 	int ret = 0;
1219 
1220 	if (shared) {
1221 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1222 		if (ret)
1223 			return ret;
1224 	} else {
1225 		tdp_mmu_iter_set_spte(kvm, iter, spte);
1226 	}
1227 
1228 	tdp_account_mmu_page(kvm, sp);
1229 
1230 	return 0;
1231 }
1232 
1233 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1234 				   struct kvm_mmu_page *sp, bool shared);
1235 
1236 /*
1237  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1238  * page tables and SPTEs to translate the faulting guest physical address.
1239  */
1240 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1241 {
1242 	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
1243 	struct kvm *kvm = vcpu->kvm;
1244 	struct tdp_iter iter;
1245 	struct kvm_mmu_page *sp;
1246 	int ret = RET_PF_RETRY;
1247 
1248 	kvm_mmu_hugepage_adjust(vcpu, fault);
1249 
1250 	trace_kvm_mmu_spte_requested(fault);
1251 
1252 	rcu_read_lock();
1253 
1254 	for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
1255 		int r;
1256 
1257 		if (fault->nx_huge_page_workaround_enabled)
1258 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1259 
1260 		/*
1261 		 * If SPTE has been frozen by another thread, just give up and
1262 		 * retry, avoiding unnecessary page table allocation and free.
1263 		 */
1264 		if (is_frozen_spte(iter.old_spte))
1265 			goto retry;
1266 
1267 		if (iter.level == fault->goal_level)
1268 			goto map_target_level;
1269 
1270 		/* Step down into the lower level page table if it exists. */
1271 		if (is_shadow_present_pte(iter.old_spte) &&
1272 		    !is_large_pte(iter.old_spte))
1273 			continue;
1274 
1275 		/*
1276 		 * The SPTE is either non-present or points to a huge page that
1277 		 * needs to be split.
1278 		 */
1279 		sp = tdp_mmu_alloc_sp(vcpu);
1280 		tdp_mmu_init_child_sp(sp, &iter);
1281 		if (is_mirror_sp(sp))
1282 			kvm_mmu_alloc_external_spt(vcpu, sp);
1283 
1284 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1285 
1286 		if (is_shadow_present_pte(iter.old_spte)) {
1287 			/* Large pages aren't supported for mirrored roots (TDX). */
1288 			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
1289 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1290 		} else {
1291 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1292 		}
1293 
1294 		/*
1295 		 * Force the guest to retry if installing an upper level SPTE
1296 		 * failed, e.g. because a different task modified the SPTE.
1297 		 */
1298 		if (r) {
1299 			tdp_mmu_free_sp(sp);
1300 			goto retry;
1301 		}
1302 
1303 		if (fault->huge_page_disallowed &&
1304 		    fault->req_level >= iter.level) {
1305 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1306 			if (sp->nx_huge_page_disallowed)
1307 				track_possible_nx_huge_page(kvm, sp);
1308 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1309 		}
1310 	}
1311 
1312 	/*
1313 	 * The walk aborted before reaching the target level, e.g. because the
1314 	 * iterator detected an upper level SPTE was frozen during traversal.
1315 	 */
1316 	WARN_ON_ONCE(iter.level == fault->goal_level);
1317 	goto retry;
1318 
1319 map_target_level:
1320 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1321 
1322 retry:
1323 	rcu_read_unlock();
1324 	return ret;
1325 }
1326 
1327 /* Used by mmu notifier via kvm_unmap_gfn_range() */
1328 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1329 				 bool flush)
1330 {
1331 	enum kvm_tdp_mmu_root_types types;
1332 	struct kvm_mmu_page *root;
1333 
1334 	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
1335 
1336 	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
1337 		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1338 					  range->may_block, flush);
1339 
1340 	return flush;
1341 }
1342 
1343 /*
1344  * Mark SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1345  * if any of the GFNs in the range have been accessed.
1346  *
1347  * No need to mark the corresponding PFN as accessed as this call is coming
1348  * from the clear_young() or clear_flush_young() notifier, which uses the
1349  * return value to determine if the page has been accessed.
1350  */
1351 static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
1352 {
1353 	u64 new_spte;
1354 
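	/*
	 * With hardware A/D bits enabled, the Accessed bit can be cleared
	 * with an atomic AND.  Without A/D bits, the SPTE must be converted
	 * to an access-tracked SPTE, which requires a cmpxchg that is allowed
	 * to fail (see below).
	 */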
1355 	if (spte_ad_enabled(iter->old_spte)) {
1356 		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
1357 								shadow_accessed_mask);
1358 		new_spte = iter->old_spte & ~shadow_accessed_mask;
1359 	} else {
1360 		new_spte = mark_spte_for_access_track(iter->old_spte);
1361 		/*
1362 		 * It is safe for the following cmpxchg to fail. Leave the
1363 		 * Accessed bit set, as the spte is most likely young anyway.
1364 		 */
1365 		if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
1366 			return;
1367 	}
1368 
1369 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1370 				       iter->old_spte, new_spte);
1371 }
1372 
1373 static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
1374 					struct kvm_gfn_range *range,
1375 					bool test_only)
1376 {
1377 	enum kvm_tdp_mmu_root_types types;
1378 	struct kvm_mmu_page *root;
1379 	struct tdp_iter iter;
1380 	bool ret = false;
1381 
1382 	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
1383 
1384 	/*
1385 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1386 	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
1387 	 * this helper must NOT be used to unmap GFNs, as it processes only
1388 	 * valid roots!
1389 	 */
1390 	WARN_ON(types & ~KVM_VALID_ROOTS);
1391 
1392 	guard(rcu)();
1393 	for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
1394 		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
1395 			if (!is_accessed_spte(iter.old_spte))
1396 				continue;
1397 
1398 			if (test_only)
1399 				return true;
1400 
1401 			ret = true;
1402 			kvm_tdp_mmu_age_spte(kvm, &iter);
1403 		}
1404 	}
1405 
1406 	return ret;
1407 }
1408 
1409 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1410 {
1411 	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
1412 }
1413 
1414 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1415 {
1416 	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
1417 }
1418 
1419 /*
1420  * Remove write access from all SPTEs at or above min_level that map GFNs
1421  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1422  * be flushed.
1423  */
1424 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1425 			     gfn_t start, gfn_t end, int min_level)
1426 {
1427 	struct tdp_iter iter;
1428 	u64 new_spte;
1429 	bool spte_set = false;
1430 
1431 	rcu_read_lock();
1432 
1433 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1434 
1435 	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
1436 retry:
1437 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1438 			continue;
1439 
1440 		if (!is_shadow_present_pte(iter.old_spte) ||
1441 		    !is_last_spte(iter.old_spte, iter.level) ||
1442 		    !(iter.old_spte & PT_WRITABLE_MASK))
1443 			continue;
1444 
1445 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1446 
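		/*
		 * Note, if the cmpxchg fails, iter.old_spte is refreshed with
		 * the current value, so the retry operates on fresh data.
		 */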
1447 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1448 			goto retry;
1449 
1450 		spte_set = true;
1451 	}
1452 
1453 	rcu_read_unlock();
1454 	return spte_set;
1455 }
1456 
1457 /*
1458  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1459  * only affect leaf SPTEs down to min_level.
1460  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1461  */
1462 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1463 			     const struct kvm_memory_slot *slot, int min_level)
1464 {
1465 	struct kvm_mmu_page *root;
1466 	bool spte_set = false;
1467 
1468 	lockdep_assert_held_read(&kvm->mmu_lock);
1469 
1470 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1471 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1472 			     slot->base_gfn + slot->npages, min_level);
1473 
1474 	return spte_set;
1475 }
1476 
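/*
 * Allocate a shadow page for huge page splitting directly, not from the
 * per-vCPU caches, as splitting is typically initiated outside of vCPU
 * context, e.g. from an ioctl that enables dirty logging.
 */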
1477 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
1478 {
1479 	struct kvm_mmu_page *sp;
1480 
1481 	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
1482 	if (!sp)
1483 		return NULL;
1484 
1485 	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
1486 	if (!sp->spt) {
1487 		kmem_cache_free(mmu_page_header_cache, sp);
1488 		return NULL;
1489 	}
1490 
1491 	return sp;
1492 }
1493 
1494 /* Note, the caller is responsible for initializing @sp. */
1495 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1496 				   struct kvm_mmu_page *sp, bool shared)
1497 {
1498 	const u64 huge_spte = iter->old_spte;
1499 	const int level = iter->level;
1500 	int ret, i;
1501 
1502 	/*
1503 	 * No need for atomics when writing to sp->spt since the page table has
1504 	 * not been linked in yet and thus is not reachable from any other CPU.
1505 	 */
1506 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1507 		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
1508 
1509 	/*
1510 	 * Replace the huge spte with a pointer to the populated lower level
1511 	 * page table. Since we are making this change without a TLB flush, vCPUs
1512 	 * will see a mix of the split mappings and the original huge mapping,
1513 	 * depending on what's currently in their TLB. This is fine from a
1514 	 * correctness standpoint since the translation will be the same either
1515 	 * way.
1516 	 */
1517 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1518 	if (ret)
1519 		goto out;
1520 
1521 	/*
1522 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1523 	 * are overwriting from the page stats. But we have to manually update
1524 	 * the page stats with the new present child pages.
1525 	 */
1526 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1527 
1528 out:
1529 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1530 	return ret;
1531 }
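
/*
 * Worked example (editor's addition): splitting a 2MB SPTE that maps gfn
 * 0x800 fills sp->spt with SPTE_ENT_PER_PAGE (512) 4KB entries, where entry
 * i maps gfn 0x800 + i with the same permissions as the original huge SPTE.
 * Splitting a 1GB SPTE likewise yields 512 2MB entries, each of which may be
 * split again on a later iteration of the pre-order walk below.
 */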
1532 
1533 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1534 					 struct kvm_mmu_page *root,
1535 					 gfn_t start, gfn_t end,
1536 					 int target_level, bool shared)
1537 {
1538 	struct kvm_mmu_page *sp = NULL;
1539 	struct tdp_iter iter;
1540 
1541 	rcu_read_lock();
1542 
1543 	/*
1544 	 * Traverse the page table splitting all huge pages above the target
1545 	 * level into one lower level. For example, if we encounter a 1GB page,
1546 	 * we split it into 512 2MB pages.
1547 	 *
1548 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1549 	 * to visit an SPTE before ever visiting its children, which means we
1550 	 * will correctly recursively split huge pages that are more than one
1551 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1552 	 * and then splitting each of those to 512 4KB pages).
1553 	 */
1554 	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
1555 retry:
1556 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1557 			continue;
1558 
1559 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1560 			continue;
1561 
1562 		if (!sp) {
1563 			rcu_read_unlock();
1564 
1565 			if (shared)
1566 				read_unlock(&kvm->mmu_lock);
1567 			else
1568 				write_unlock(&kvm->mmu_lock);
1569 
1570 			sp = tdp_mmu_alloc_sp_for_split();
1571 
1572 			if (shared)
1573 				read_lock(&kvm->mmu_lock);
1574 			else
1575 				write_lock(&kvm->mmu_lock);
1576 
1577 			if (!sp) {
1578 				trace_kvm_mmu_split_huge_page(iter.gfn,
1579 							      iter.old_spte,
1580 							      iter.level, -ENOMEM);
1581 				return -ENOMEM;
1582 			}
1583 
1584 			rcu_read_lock();
1585 
1586 			iter.yielded = true;
1587 			continue;
1588 		}
1589 
1590 		tdp_mmu_init_child_sp(sp, &iter);
1591 
1592 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1593 			goto retry;
1594 
1595 		sp = NULL;
1596 	}
1597 
1598 	rcu_read_unlock();
1599 
1600 	/*
1601 	 * It's possible to exit the loop having never used the last sp if, for
1602 	 * example, a vCPU doing HugePage NX splitting wins the race and
1603 	 * installs its own sp in place of the last sp we tried to split.
1604 	 */
1605 	if (sp)
1606 		tdp_mmu_free_sp(sp);
1607 
1608 	return 0;
1609 }
1610 
1611 
1612 /*
1613  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1614  */
1615 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1616 				      const struct kvm_memory_slot *slot,
1617 				      gfn_t start, gfn_t end,
1618 				      int target_level, bool shared)
1619 {
1620 	struct kvm_mmu_page *root;
1621 	int r = 0;
1622 
1623 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1624 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1625 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1626 		if (r) {
1627 			kvm_tdp_mmu_put_root(kvm, root);
1628 			break;
1629 		}
1630 	}
1631 }
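
/*
 * Illustrative sketch (editor's addition): eager splitting is driven from
 * slow paths that can tolerate page table allocations, e.g. when dirty
 * logging is enabled on a memslot.  A hypothetical caller holding mmu_lock
 * for read might split everything in the slot down to 4KB:
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_try_split_huge_pages(kvm, slot, slot->base_gfn,
 *					 slot->base_gfn + slot->npages,
 *					 PG_LEVEL_4K, true);
 *	read_unlock(&kvm->mmu_lock);
 */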
1632 
1633 static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
1634 {
1635 	/*
1636 	 * All TDP MMU shadow pages share the same role as their root, aside
1637 	 * from level, so it is valid to key off any shadow page to determine if
1638 	 * write protection is needed for an entire tree.
1639 	 */
1640 	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
1641 }
1642 
1643 static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1644 				  gfn_t start, gfn_t end)
1645 {
1646 	const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
1647 							    shadow_dirty_mask;
1648 	struct tdp_iter iter;
1649 
1650 	rcu_read_lock();
1651 
1652 	tdp_root_for_each_pte(iter, kvm, root, start, end) {
1653 retry:
1654 		if (!is_shadow_present_pte(iter.old_spte) ||
1655 		    !is_last_spte(iter.old_spte, iter.level))
1656 			continue;
1657 
1658 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1659 			continue;
1660 
1661 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1662 				spte_ad_need_write_protect(iter.old_spte));
1663 
1664 		if (!(iter.old_spte & dbit))
1665 			continue;
1666 
1667 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1668 			goto retry;
1669 	}
1670 
1671 	rcu_read_unlock();
1672 }
1673 
1674 /*
1675  * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
1676  * memslot.
1677  */
1678 void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1679 				  const struct kvm_memory_slot *slot)
1680 {
1681 	struct kvm_mmu_page *root;
1682 
1683 	lockdep_assert_held_read(&kvm->mmu_lock);
1684 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1685 		clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1686 				      slot->base_gfn + slot->npages);
1687 }
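
/*
 * Illustrative sketch (editor's addition): clearing dirty state for a whole
 * slot follows the same locking pattern as write protection; any TLB flush
 * that is needed is left to the caller:
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_clear_dirty_slot(kvm, slot);
 *	read_unlock(&kvm->mmu_lock);
 */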
1688 
1689 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1690 				  gfn_t gfn, unsigned long mask, bool wrprot)
1691 {
1692 	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
1693 									shadow_dirty_mask;
1694 	struct tdp_iter iter;
1695 
1696 	lockdep_assert_held_write(&kvm->mmu_lock);
1697 
1698 	rcu_read_lock();
1699 
1700 	tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
1701 				    gfn + BITS_PER_LONG) {
1702 		if (!mask)
1703 			break;
1704 
1705 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1706 				spte_ad_need_write_protect(iter.old_spte));
1707 
1708 		if (iter.level > PG_LEVEL_4K ||
1709 		    !(mask & (1UL << (iter.gfn - gfn))))
1710 			continue;
1711 
1712 		mask &= ~(1UL << (iter.gfn - gfn));
1713 
1714 		if (!(iter.old_spte & dbit))
1715 			continue;
1716 
1717 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1718 							iter.old_spte, dbit,
1719 							iter.level);
1720 
1721 		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1722 					       iter.old_spte,
1723 					       iter.old_spte & ~dbit);
1724 	}
1725 
1726 	rcu_read_unlock();
1727 }
1728 
1729 /*
1730  * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
1731  * which a bit is set in mask, starting at gfn. The given memslot is expected to
1732  * contain all the GFNs represented by set bits in the mask.
1733  */
1734 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1735 				       struct kvm_memory_slot *slot,
1736 				       gfn_t gfn, unsigned long mask,
1737 				       bool wrprot)
1738 {
1739 	struct kvm_mmu_page *root;
1740 
1741 	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1742 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1743 }
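
/*
 * Worked example (editor's addition): with gfn = slot->base_gfn + 0x40 and
 * mask = 0x5, only the 4K SPTEs mapping gfn + 0 (bit 0) and gfn + 2 (bit 2)
 * have their dirty or writable bit cleared.  The walk starts at
 * gfn + __ffs(mask) == gfn and breaks out early once mask reaches zero.
 * Callers (the dirty-log clearing paths) hold mmu_lock for write, per the
 * lockdep assertion in clear_dirty_pt_masked().
 */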
1744 
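/*
 * Build a huge SPTE to replace the non-leaf SPTE at @parent, based on the
 * first leaf SPTE found in the child page table.  Returns -EAGAIN if the
 * caller should yield, or -ENOENT if the child page table contains no leaf
 * SPTEs (editor's summary of the code below).
 */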
1745 static int tdp_mmu_make_huge_spte(struct kvm *kvm,
1746 				  struct tdp_iter *parent,
1747 				  u64 *huge_spte)
1748 {
1749 	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
1750 	gfn_t start = parent->gfn;
1751 	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
1752 	struct tdp_iter iter;
1753 
1754 	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
1755 		/*
1756 		 * Use the parent iterator when checking for forward progress so
1757 		 * that KVM doesn't get stuck continuously trying to yield (i.e.
1758 		 * returning -EAGAIN here and then failing the forward progress
1759 		 * check in the caller ad nauseam).
1760 		 */
1761 		if (tdp_mmu_iter_need_resched(kvm, parent))
1762 			return -EAGAIN;
1763 
1764 		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
1765 		return 0;
1766 	}
1767 
1768 	return -ENOENT;
1769 }
1770 
1771 static void recover_huge_pages_range(struct kvm *kvm,
1772 				     struct kvm_mmu_page *root,
1773 				     const struct kvm_memory_slot *slot)
1774 {
1775 	gfn_t start = slot->base_gfn;
1776 	gfn_t end = start + slot->npages;
1777 	struct tdp_iter iter;
1778 	int max_mapping_level;
1779 	bool flush = false;
1780 	u64 huge_spte;
1781 	int r;
1782 
1783 	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
1784 		return;
1785 
1786 	rcu_read_lock();
1787 
1788 	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
1789 retry:
1790 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1791 			flush = false;
1792 			continue;
1793 		}
1794 
1795 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1796 		    !is_shadow_present_pte(iter.old_spte))
1797 			continue;
1798 
1799 		/*
1800 		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1801 		 * a large page size, then its parent would have been zapped
1802 		 * instead of stepping down.
1803 		 */
1804 		if (is_last_spte(iter.old_spte, iter.level))
1805 			continue;
1806 
1807 		/*
1808 		 * If iter.gfn resides outside of the slot, i.e. the page for
1809 		 * the current level overlaps but is not contained by the slot,
1810 		 * then the SPTE can't be made huge.  More importantly, trying
1811 		 * to query that info from slot->arch.lpage_info will cause an
1812 		 * out-of-bounds access.
1813 		 */
1814 		if (iter.gfn < start || iter.gfn >= end)
1815 			continue;
1816 
1817 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
1818 		if (max_mapping_level < iter.level)
1819 			continue;
1820 
1821 		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
1822 		if (r == -EAGAIN)
1823 			goto retry;
1824 		else if (r)
1825 			continue;
1826 
1827 		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
1828 			goto retry;
1829 
1830 		flush = true;
1831 	}
1832 
1833 	if (flush)
1834 		kvm_flush_remote_tlbs_memslot(kvm, slot);
1835 
1836 	rcu_read_unlock();
1837 }
1838 
1839 /*
1840  * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
1841  * huge SPTEs where possible.
1842  */
1843 void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
1844 				    const struct kvm_memory_slot *slot)
1845 {
1846 	struct kvm_mmu_page *root;
1847 
1848 	lockdep_assert_held_read(&kvm->mmu_lock);
1849 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1850 		recover_huge_pages_range(kvm, root, slot);
1851 }
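
/*
 * Illustrative sketch (editor's addition): huge page recovery typically runs
 * after dirty logging is disabled on @slot, again with mmu_lock held for
 * read.  No separate flush is needed, as recover_huge_pages_range() flushes
 * the memslot itself when it changes an SPTE:
 *
 *	if (tdp_mmu_enabled) {
 *		read_lock(&kvm->mmu_lock);
 *		kvm_tdp_mmu_recover_huge_pages(kvm, slot);
 *		read_unlock(&kvm->mmu_lock);
 *	}
 */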
1852 
1853 /*
1854  * Removes write access on the last level SPTE mapping this GFN and unsets the
1855  * MMU-writable bit to ensure future writes continue to be intercepted.
1856  * Returns true if an SPTE was set and a TLB flush is needed.
1857  */
1858 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1859 			      gfn_t gfn, int min_level)
1860 {
1861 	struct tdp_iter iter;
1862 	u64 new_spte;
1863 	bool spte_set = false;
1864 
1865 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1866 
1867 	rcu_read_lock();
1868 
1869 	for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
1870 		if (!is_shadow_present_pte(iter.old_spte) ||
1871 		    !is_last_spte(iter.old_spte, iter.level))
1872 			continue;
1873 
1874 		new_spte = iter.old_spte &
1875 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1876 
1877 		if (new_spte == iter.old_spte)
1878 			break;
1879 
1880 		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1881 		spte_set = true;
1882 	}
1883 
1884 	rcu_read_unlock();
1885 
1886 	return spte_set;
1887 }
1888 
1889 /*
1890  * Removes write access on the last level SPTE mapping this GFN and unsets the
1891  * MMU-writable bit to ensure future writes continue to be intercepted.
1892  * Returns true if an SPTE was set and a TLB flush is needed.
1893  */
1894 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1895 				   struct kvm_memory_slot *slot, gfn_t gfn,
1896 				   int min_level)
1897 {
1898 	struct kvm_mmu_page *root;
1899 	bool spte_set = false;
1900 
1901 	lockdep_assert_held_write(&kvm->mmu_lock);
1902 	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1903 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1904 
1905 	return spte_set;
1906 }
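
/*
 * Illustrative sketch (editor's addition): unlike the slot-wide paths above,
 * this helper requires mmu_lock held for write.  A hypothetical caller
 * protecting a single gfn, e.g. one backing a shadowed guest page table:
 *
 *	bool flush;
 *
 *	write_lock(&kvm->mmu_lock);
 *	flush = kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, PG_LEVEL_4K);
 *	write_unlock(&kvm->mmu_lock);
 *
 *	if (flush)
 *		kvm_flush_remote_tlbs(kvm);
 */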
1907 
1908 /*
1909  * Return the level of the lowest level SPTE added to sptes.
1910  * That SPTE may be non-present.
1911  *
1912  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1913  */
1914 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1915 			 int *root_level)
1916 {
1917 	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
1918 	struct tdp_iter iter;
1919 	gfn_t gfn = addr >> PAGE_SHIFT;
1920 	int leaf = -1;
1921 
1922 	*root_level = vcpu->arch.mmu->root_role.level;
1923 
1924 	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1925 		leaf = iter.level;
1926 		sptes[leaf] = iter.old_spte;
1927 	}
1928 
1929 	return leaf;
1930 }
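
/*
 * Illustrative sketch (editor's addition): a lockless-walk caller supplies an
 * sptes[] array indexed by level and brackets the call with the RCU walk
 * helpers:
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, leaf;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 *	if (leaf >= 0)
 *		... sptes[leaf] is the lowest SPTE encountered ...
 */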
1931 
1932 /*
1933  * Returns the last level spte pointer of the shadow page walk for the given
1934  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1935  * walk could be performed, returns NULL and *spte does not contain valid data.
1936  *
1937  * Contract:
1938  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1939  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1940  *
1941  * WARNING: This function is only intended to be called during fast_page_fault.
1942  */
1943 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
1944 					u64 *spte)
1945 {
1946 	/* Fast PF is not supported for mirrored roots. */
1947 	struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
1948 	struct tdp_iter iter;
1949 	tdp_ptep_t sptep = NULL;
1950 
1951 	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1952 		*spte = iter.old_spte;
1953 		sptep = iter.sptep;
1954 	}
1955 
1956 	/*
1957 	 * Perform the rcu_dereference to get the raw spte pointer value since
1958 	 * we are passing it up to fast_page_fault, which is shared with the
1959 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1960 	 * annotation.
1961 	 *
1962 	 * This is safe since fast_page_fault obeys the contracts of this
1963 	 * function as well as all TDP MMU contracts around modifying SPTEs
1964 	 * outside of mmu_lock.
1965 	 */
1966 	return rcu_dereference(sptep);
1967 }
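
/*
 * Illustrative sketch (editor's addition): fast_page_fault()-style usage,
 * entirely under the lockless-walk contract described above:
 *
 *	u64 old_spte;
 *	u64 *sptep;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gfn, &old_spte);
 *	if (sptep && is_shadow_present_pte(old_spte)) {
 *		... attempt a cmpxchg-based fix-up of *sptep ...
 *	}
 *	kvm_tdp_mmu_walk_lockless_end();
 */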
1968