1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4 #include "mmu.h"
5 #include "mmu_internal.h"
6 #include "mmutrace.h"
7 #include "tdp_iter.h"
8 #include "tdp_mmu.h"
9 #include "spte.h"
10
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
13
14 /* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16 {
17 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
19 }
20
21 /* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
24 {
25 if (shared)
26 lockdep_assert_held_read(&kvm->mmu_lock);
27 else
28 lockdep_assert_held_write(&kvm->mmu_lock);
29
30 return true;
31 }
32
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34 {
35 /*
 * Invalidate all roots, which, besides the obvious, schedules all roots
37 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38 * ultimately frees all roots.
39 */
40 kvm_tdp_mmu_invalidate_all_roots(kvm);
41 kvm_tdp_mmu_zap_invalidated_roots(kvm);
42
43 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
44 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
45
46 /*
47 * Ensure that all the outstanding RCU callbacks to free shadow pages
48 * can run before the VM is torn down. Putting the last reference to
49 * zapped roots will create new callbacks.
50 */
51 rcu_barrier();
52 }
53
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
55 {
56 free_page((unsigned long)sp->spt);
57 kmem_cache_free(mmu_page_header_cache, sp);
58 }
59
60 /*
61 * This is called through call_rcu in order to free TDP page table memory
62 * safely with respect to other kernel threads that may be operating on
63 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read-side critical
65 * section, and freeing it after a grace period, lockless access to that
66 * memory won't use it after it is freed.
67 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
69 {
70 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
71 rcu_head);
72
73 tdp_mmu_free_sp(sp);
74 }
75
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
77 {
78 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
79 return;
80
81 /*
82 * The TDP MMU itself holds a reference to each root until the root is
 * explicitly invalidated, i.e. the final reference should never be
84 * put for a valid root.
85 */
86 KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
87
88 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
89 list_del_rcu(&root->link);
90 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
91 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
92 }
93
94 /*
95 * Returns the next root after @prev_root (or the first root if @prev_root is
96 * NULL). A reference to the returned root is acquired, and the reference to
97 * @prev_root is released (the caller obviously must hold a reference to
98 * @prev_root if it's non-NULL).
99 *
100 * If @only_valid is true, invalid roots are skipped.
101 *
102 * Returns NULL if the end of tdp_mmu_roots was reached.
103 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool only_valid)
107 {
108 struct kvm_mmu_page *next_root;
109
110 /*
111 * While the roots themselves are RCU-protected, fields such as
112 * role.invalid are protected by mmu_lock.
113 */
114 lockdep_assert_held(&kvm->mmu_lock);
115
116 rcu_read_lock();
117
118 if (prev_root)
119 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
120 &prev_root->link,
121 typeof(*prev_root), link);
122 else
123 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
124 typeof(*next_root), link);
125
126 while (next_root) {
127 if ((!only_valid || !next_root->role.invalid) &&
128 kvm_tdp_mmu_get_root(next_root))
129 break;
130
131 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
132 &next_root->link, typeof(*next_root), link);
133 }
134
135 rcu_read_unlock();
136
137 if (prev_root)
138 kvm_tdp_mmu_put_root(kvm, prev_root);
139
140 return next_root;
141 }
142
143 /*
144 * Note: this iterator gets and puts references to the roots it iterates over.
145 * This makes it safe to release the MMU lock and yield within the loop, but
146 * if exiting the loop early, the caller must drop the reference to the most
147 * recent root. (Unless keeping a live reference is desirable.)
148 *
149 * If shared is set, this function is operating under the MMU lock in read
150 * mode.
151 */
152 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
153 for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
154 ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
155 _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
156 if (kvm_mmu_page_as_id(_root) != _as_id) { \
157 } else
158
159 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
160 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)
161
162 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
163 for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
164 ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
165 _root = tdp_mmu_next_root(_kvm, _root, false))
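/*
 * Illustrative sketch (not part of the upstream flow): a typical walk over
 * the valid roots for an address space using the yield-safe iterator. If a
 * caller breaks out of the loop early, it must put the reference to the
 * current root itself, as kvm_tdp_mmu_try_split_huge_pages() does below.
 * do_something() is a hypothetical stand-in for the per-root work:
 *
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
 *		if (do_something(kvm, root)) {
 *			kvm_tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 */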
166
167 /*
168 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
169 * the implication being that any flow that holds mmu_lock for read is
170 * inherently yield-friendly and should use the yield-safe variant above.
171 * Holding mmu_lock for write obviates the need for RCU protection as the list
172 * is guaranteed to be stable.
173 */
174 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
175 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
176 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
177 kvm_mmu_page_as_id(_root) != _as_id) { \
178 } else
179
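/*
 * Allocate a shadow page (header plus backing page table page) from the
 * vCPU's MMU memory caches; the caches are topped up before mmu_lock is
 * taken, so allocation here is not expected to fail or sleep.
 */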
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
181 {
182 struct kvm_mmu_page *sp;
183
184 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
185 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
186
187 return sp;
188 }
189
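/*
 * Initialize a newly allocated shadow page: link the backing page table page
 * back to its kvm_mmu_page via the page's private data, and record the role,
 * gfn and parent SPTE pointer so the page can be walked and torn down later.
 */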
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
192 {
193 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
194
195 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
196
197 sp->role = role;
198 sp->gfn = gfn;
199 sp->ptep = sptep;
200 sp->tdp_mmu_page = true;
201
202 trace_kvm_mmu_get_page(sp, true);
203 }
204
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
207 {
208 struct kvm_mmu_page *parent_sp;
209 union kvm_mmu_page_role role;
210
211 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
212
213 role = parent_sp->role;
214 role.level--;
215
216 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
217 }
218
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
220 {
221 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
222 struct kvm *kvm = vcpu->kvm;
223 struct kvm_mmu_page *root;
224
225 lockdep_assert_held_write(&kvm->mmu_lock);
226
227 /*
228 * Check for an existing root before allocating a new one. Note, the
229 * role check prevents consuming an invalid root.
230 */
231 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
232 if (root->role.word == role.word &&
233 kvm_tdp_mmu_get_root(root))
234 goto out;
235 }
236
237 root = tdp_mmu_alloc_sp(vcpu);
238 tdp_mmu_init_sp(root, NULL, 0, role);
239
240 /*
241 * TDP MMU roots are kept until they are explicitly invalidated, either
242 * by a memslot update or by the destruction of the VM. Initialize the
243 * refcount to two; one reference for the vCPU, and one reference for
244 * the TDP MMU itself, which is held until the root is invalidated and
245 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
246 */
247 refcount_set(&root->tdp_mmu_root_count, 2);
248
249 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
250 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
251 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
252
253 out:
254 return __pa(root->spt);
255 }
256
257 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
258 u64 old_spte, u64 new_spte, int level,
259 bool shared);
260
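/*
 * Account a newly linked page table page in the secondary-pagetable stats
 * and in the per-VM TDP MMU page count.
 */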
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
262 {
263 kvm_account_pgtable_pages((void *)sp->spt, +1);
264 atomic64_inc(&kvm->arch.tdp_mmu_pages);
265 }
266
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
268 {
269 kvm_account_pgtable_pages((void *)sp->spt, -1);
270 atomic64_dec(&kvm->arch.tdp_mmu_pages);
271 }
272
273 /**
274 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
275 *
276 * @kvm: kvm instance
277 * @sp: the page to be removed
278 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
280 {
281 tdp_unaccount_mmu_page(kvm, sp);
282
283 if (!sp->nx_huge_page_disallowed)
284 return;
285
286 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
287 sp->nx_huge_page_disallowed = false;
288 untrack_possible_nx_huge_page(kvm, sp);
289 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
290 }
291
292 /**
293 * handle_removed_pt() - handle a page table removed from the TDP structure
294 *
295 * @kvm: kvm instance
296 * @pt: the page removed from the paging structure
297 * @shared: This operation may not be running under the exclusive use
298 * of the MMU lock and the operation must synchronize with other
299 * threads that might be modifying SPTEs.
300 *
301 * Given a page table that has been removed from the TDP paging structure,
302 * iterates through the page table to clear SPTEs and free child page tables.
303 *
304 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
305 * protection. Since this thread removed it from the paging structure,
306 * this thread will be responsible for ensuring the page is freed. Hence the
307 * early rcu_dereferences in the function.
308 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
310 {
311 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
312 int level = sp->role.level;
313 gfn_t base_gfn = sp->gfn;
314 int i;
315
316 trace_kvm_mmu_prepare_zap_page(sp);
317
318 tdp_mmu_unlink_sp(kvm, sp);
319
320 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
321 tdp_ptep_t sptep = pt + i;
322 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
323 u64 old_spte;
324
325 if (shared) {
326 /*
327 * Set the SPTE to a nonpresent value that other
328 * threads will not overwrite. If the SPTE was
329 * already marked as removed then another thread
330 * handling a page fault could overwrite it, so
 * keep setting the SPTE until the value it replaces
 * is something other than the removed SPTE value.
333 */
334 for (;;) {
335 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
336 if (!is_removed_spte(old_spte))
337 break;
338 cpu_relax();
339 }
340 } else {
341 /*
342 * If the SPTE is not MMU-present, there is no backing
343 * page associated with the SPTE and so no side effects
344 * that need to be recorded, and exclusive ownership of
345 * mmu_lock ensures the SPTE can't be made present.
346 * Note, zapping MMIO SPTEs is also unnecessary as they
347 * are guarded by the memslots generation, not by being
348 * unreachable.
349 */
350 old_spte = kvm_tdp_mmu_read_spte(sptep);
351 if (!is_shadow_present_pte(old_spte))
352 continue;
353
354 /*
355 * Use the common helper instead of a raw WRITE_ONCE as
356 * the SPTE needs to be updated atomically if it can be
357 * modified by a different vCPU outside of mmu_lock.
358 * Even though the parent SPTE is !PRESENT, the TLB
359 * hasn't yet been flushed, and both Intel and AMD
360 * document that A/D assists can use upper-level PxE
361 * entries that are cached in the TLB, i.e. the CPU can
362 * still access the page and mark it dirty.
363 *
364 * No retry is needed in the atomic update path as the
365 * sole concern is dropping a Dirty bit, i.e. no other
366 * task can zap/remove the SPTE as mmu_lock is held for
367 * write. Marking the SPTE as a removed SPTE is not
368 * strictly necessary for the same reason, but using
 * the removed SPTE value keeps the shared/exclusive
370 * paths consistent and allows the handle_changed_spte()
371 * call below to hardcode the new value to REMOVED_SPTE.
372 *
373 * Note, even though dropping a Dirty bit is the only
374 * scenario where a non-atomic update could result in a
375 * functional bug, simply checking the Dirty bit isn't
376 * sufficient as a fast page fault could read the upper
377 * level SPTE before it is zapped, and then make this
378 * target SPTE writable, resume the guest, and set the
379 * Dirty bit between reading the SPTE above and writing
380 * it here.
381 */
382 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
383 REMOVED_SPTE, level);
384 }
385 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
386 old_spte, REMOVED_SPTE, level, shared);
387 }
388
389 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
390 }
391
392 /**
393 * handle_changed_spte - handle bookkeeping associated with an SPTE change
394 * @kvm: kvm instance
395 * @as_id: the address space of the paging structure the SPTE was a part of
396 * @gfn: the base GFN that was mapped by the SPTE
397 * @old_spte: The value of the SPTE before the change
398 * @new_spte: The value of the SPTE after the change
399 * @level: the level of the PT the SPTE is part of in the paging structure
400 * @shared: This operation may not be running under the exclusive use of
401 * the MMU lock and the operation must synchronize with other
402 * threads that might be modifying SPTEs.
403 *
404 * Handle bookkeeping that might result from the modification of a SPTE. Note,
405 * dirty logging updates are handled in common code, not here (see make_spte()
406 * and fast_pf_fix_direct_spte()).
407 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
411 {
412 bool was_present = is_shadow_present_pte(old_spte);
413 bool is_present = is_shadow_present_pte(new_spte);
414 bool was_leaf = was_present && is_last_spte(old_spte, level);
415 bool is_leaf = is_present && is_last_spte(new_spte, level);
416 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
417
418 WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
419 WARN_ON_ONCE(level < PG_LEVEL_4K);
420 WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
421
422 /*
423 * If this warning were to trigger it would indicate that there was a
424 * missing MMU notifier or a race with some notifier handler.
425 * A present, leaf SPTE should never be directly replaced with another
426 * present leaf SPTE pointing to a different PFN. A notifier handler
427 * should be zapping the SPTE before the main MM's page table is
428 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
429 * thread before replacement.
430 */
431 if (was_leaf && is_leaf && pfn_changed) {
432 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
433 "SPTE with another present leaf SPTE mapping a\n"
434 "different PFN!\n"
435 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
436 as_id, gfn, old_spte, new_spte, level);
437
438 /*
439 * Crash the host to prevent error propagation and guest data
440 * corruption.
441 */
442 BUG();
443 }
444
445 if (old_spte == new_spte)
446 return;
447
448 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
449
450 if (is_leaf)
451 check_spte_writable_invariants(new_spte);
452
453 /*
454 * The only times a SPTE should be changed from a non-present to
455 * non-present state is when an MMIO entry is installed/modified/
456 * removed. In that case, there is nothing to do here.
457 */
458 if (!was_present && !is_present) {
459 /*
460 * If this change does not involve a MMIO SPTE or removed SPTE,
461 * it is unexpected. Log the change, though it should not
462 * impact the guest since both the former and current SPTEs
463 * are nonpresent.
464 */
465 if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
466 !is_mmio_spte(new_spte) &&
467 !is_removed_spte(new_spte)))
468 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
469 "should not be replaced with another,\n"
470 "different nonpresent SPTE, unless one or both\n"
471 "are MMIO SPTEs, or the new SPTE is\n"
472 "a temporary removed SPTE.\n"
473 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
474 as_id, gfn, old_spte, new_spte, level);
475 return;
476 }
477
478 if (is_leaf != was_leaf)
479 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
480
481 if (was_leaf && is_dirty_spte(old_spte) &&
482 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
483 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
484
485 /*
486 * Recursively handle child PTs if the change removed a subtree from
487 * the paging structure. Note the WARN on the PFN changing without the
488 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
489 * pages are kernel allocations and should never be migrated.
490 */
491 if (was_present && !was_leaf &&
492 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
493 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
494
495 if (was_leaf && is_accessed_spte(old_spte) &&
496 (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
497 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
498 }
499
500 /*
501 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
502 * and handle the associated bookkeeping. Do not mark the page dirty
503 * in KVM's dirty bitmaps.
504 *
505 * If setting the SPTE fails because it has changed, iter->old_spte will be
506 * refreshed to the current value of the spte.
507 *
508 * @kvm: kvm instance
509 * @iter: a tdp_iter instance currently on the SPTE that should be set
510 * @new_spte: The value the SPTE should be set to
511 * Return:
512 * * 0 - If the SPTE was set.
513 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
514 * no side-effects other than setting iter->old_spte to the last
515 * known value of the spte.
516 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
520 {
521 u64 *sptep = rcu_dereference(iter->sptep);
522
523 /*
524 * The caller is responsible for ensuring the old SPTE is not a REMOVED
525 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
526 * and pre-checking before inserting a new SPTE is advantageous as it
527 * avoids unnecessary work.
528 */
529 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
530
531 lockdep_assert_held_read(&kvm->mmu_lock);
532
533 /*
534 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
535 * does not hold the mmu_lock. On failure, i.e. if a different logical
536 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
537 * the current value, so the caller operates on fresh data, e.g. if it
 * retries tdp_mmu_set_spte_atomic().
539 */
540 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
541 return -EBUSY;
542
543 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
544 new_spte, iter->level, true);
545
546 return 0;
547 }
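/*
 * Illustrative sketch: callers that race with page faults typically retry on
 * -EBUSY, re-reading the SPTE through the iterator, e.g. the pattern used by
 * wrprot_gfn_range() and clear_dirty_gfn_range() below:
 *
 *	retry:
 *		...
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 */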
548
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
551 {
552 int ret;
553
554 /*
555 * Freeze the SPTE by setting it to a special,
556 * non-present value. This will stop other threads from
557 * immediately installing a present entry in its place
558 * before the TLBs are flushed.
559 */
560 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
561 if (ret)
562 return ret;
563
564 kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
565
566 /*
567 * No other thread can overwrite the removed SPTE as they must either
568 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
569 * overwrite the special removed SPTE value. No bookkeeping is needed
570 * here since the SPTE is going from non-present to non-present. Use
571 * the raw write helper to avoid an unnecessary check on volatile bits.
572 */
573 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
574
575 return 0;
576 }
577
578
579 /*
580 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
581 * @kvm: KVM instance
582 * @as_id: Address space ID, i.e. regular vs. SMM
583 * @sptep: Pointer to the SPTE
584 * @old_spte: The current value of the SPTE
585 * @new_spte: The new value that will be set for the SPTE
586 * @gfn: The base GFN that was (or will be) mapped by the SPTE
587 * @level: The level _containing_ the SPTE (its parent PT's level)
588 *
589 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
591 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
594 {
595 lockdep_assert_held_write(&kvm->mmu_lock);
596
597 /*
598 * No thread should be using this function to set SPTEs to or from the
599 * temporary removed SPTE value.
600 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
601 * should be used. If operating under the MMU lock in write mode, the
602 * use of the removed SPTE should not be necessary.
603 */
604 WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
605
606 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
607
608 handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
609 return old_spte;
610 }
611
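/*
 * Convenience wrapper around tdp_mmu_set_spte() that pulls the address space
 * ID, SPTE pointer, old value, gfn and level from the iterator, and refreshes
 * iter->old_spte with the value that was actually replaced.
 */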
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
614 {
615 WARN_ON_ONCE(iter->yielded);
616 iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
617 iter->old_spte, new_spte,
618 iter->gfn, iter->level);
619 }
620
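/*
 * Convenience wrappers around for_each_tdp_pte(): walk every SPTE in a root,
 * only the present leaf SPTEs in a root, or every SPTE reachable from the
 * MMU's current root, over the GFN range [_start, _end).
 */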
621 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
622 for_each_tdp_pte(_iter, _root, _start, _end)
623
624 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
625 tdp_root_for_each_pte(_iter, _root, _start, _end) \
626 if (!is_shadow_present_pte(_iter.old_spte) || \
627 !is_last_spte(_iter.old_spte, _iter.level)) \
628 continue; \
629 else
630
631 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
632 for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
633
634 /*
635 * Yield if the MMU lock is contended or this thread needs to return control
636 * to the scheduler.
637 *
638 * If this function should yield and flush is set, it will perform a remote
639 * TLB flush before yielding.
640 *
641 * If this function yields, iter->yielded is set and the caller must skip to
642 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
643 * over the paging structures to allow the iterator to continue its traversal
644 * from the paging structure root.
645 *
646 * Returns true if this function yielded.
647 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
651 {
652 WARN_ON_ONCE(iter->yielded);
653
654 /* Ensure forward progress has been made before yielding. */
655 if (iter->next_last_level_gfn == iter->yielded_gfn)
656 return false;
657
658 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
659 if (flush)
660 kvm_flush_remote_tlbs(kvm);
661
662 rcu_read_unlock();
663
664 if (shared)
665 cond_resched_rwlock_read(&kvm->mmu_lock);
666 else
667 cond_resched_rwlock_write(&kvm->mmu_lock);
668
669 rcu_read_lock();
670
671 WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
672
673 iter->yielded = true;
674 }
675
676 return iter->yielded;
677 }
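/*
 * Illustrative sketch: long-running walks call this at the top of each
 * iteration and skip to the next SPTE when it yields, as the zap and split
 * paths below do:
 *
 *	for_each_tdp_pte_min_level(iter, root, level, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared))
 *			continue;
 *		...
 *	}
 */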
678
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
680 {
681 /*
682 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
683 * a gpa range that would exceed the max gfn, and KVM does not create
684 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
685 * the slow emulation path every time.
686 */
687 return kvm_mmu_max_gfn() + 1;
688 }
689
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
692 {
693 struct tdp_iter iter;
694
695 gfn_t end = tdp_mmu_max_gfn_exclusive();
696 gfn_t start = 0;
697
698 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
699 retry:
700 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
701 continue;
702
703 if (!is_shadow_present_pte(iter.old_spte))
704 continue;
705
706 if (iter.level > zap_level)
707 continue;
708
709 if (!shared)
710 tdp_mmu_iter_set_spte(kvm, &iter, 0);
711 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
712 goto retry;
713 }
714 }
715
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
718 {
719
720 /*
721 * The root must have an elevated refcount so that it's reachable via
722 * mmu_notifier callbacks, which allows this path to yield and drop
723 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
724 * must drop all references to relevant pages prior to completing the
725 * callback. Dropping mmu_lock with an unreachable root would result
726 * in zapping SPTEs after a relevant mmu_notifier callback completes
727 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
728 * dirty accessed bits to the SPTE's associated struct page.
729 */
730 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
731
732 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
733
734 rcu_read_lock();
735
736 /*
737 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
738 * split the zap into two passes. On the first pass, zap at the 1gb
739 * level, and then zap top-level SPs on the second pass. "1gb" is not
740 * arbitrary, as KVM must be able to zap a 1gb shadow page without
741 * inducing a stall to allow in-place replacement with a 1gb hugepage.
742 *
743 * Because zapping a SP recurses on its children, stepping down to
744 * PG_LEVEL_4K in the iterator itself is unnecessary.
745 */
746 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
747 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
748
749 rcu_read_unlock();
750 }
751
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
753 {
754 u64 old_spte;
755
756 /*
757 * This helper intentionally doesn't allow zapping a root shadow page,
758 * which doesn't have a parent page table and thus no associated entry.
759 */
760 if (WARN_ON_ONCE(!sp->ptep))
761 return false;
762
763 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
764 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
765 return false;
766
767 tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
768 sp->gfn, sp->role.level + 1);
769
770 return true;
771 }
772
773 /*
774 * If can_yield is true, will release the MMU lock and reschedule if the
775 * scheduler needs the CPU or there is contention on the MMU lock. If this
776 * function cannot yield, it will not release the MMU lock or reschedule and
777 * the caller must ensure it does not supply too large a GFN range, or the
778 * operation can cause a soft lockup.
779 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
782 {
783 struct tdp_iter iter;
784
785 end = min(end, tdp_mmu_max_gfn_exclusive());
786
787 lockdep_assert_held_write(&kvm->mmu_lock);
788
789 rcu_read_lock();
790
791 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
792 if (can_yield &&
793 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
794 flush = false;
795 continue;
796 }
797
798 if (!is_shadow_present_pte(iter.old_spte) ||
799 !is_last_spte(iter.old_spte, iter.level))
800 continue;
801
802 tdp_mmu_iter_set_spte(kvm, &iter, 0);
803 flush = true;
804 }
805
806 rcu_read_unlock();
807
808 /*
809 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
810 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
811 */
812 return flush;
813 }
814
815 /*
816 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
817 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
818 * more SPTEs were zapped since the MMU lock was last acquired.
819 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
821 {
822 struct kvm_mmu_page *root;
823
824 lockdep_assert_held_write(&kvm->mmu_lock);
825 for_each_tdp_mmu_root_yield_safe(kvm, root)
826 flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
827
828 return flush;
829 }
830
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
832 {
833 struct kvm_mmu_page *root;
834
835 /*
836 * Zap all roots, including invalid roots, as all SPTEs must be dropped
837 * before returning to the caller. Zap directly even if the root is
838 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
839 * all that expensive and mmu_lock is already held, which means the
840 * worker has yielded, i.e. flushing the work instead of zapping here
841 * isn't guaranteed to be any faster.
842 *
 * A TLB flush is unnecessary; KVM zaps everything if and only if the VM
844 * is being destroyed or the userspace VMM has exited. In both cases,
845 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
846 */
847 lockdep_assert_held_write(&kvm->mmu_lock);
848 for_each_tdp_mmu_root_yield_safe(kvm, root)
849 tdp_mmu_zap_root(kvm, root, false);
850 }
851
852 /*
853 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
854 * zap" completes.
855 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
857 {
858 struct kvm_mmu_page *root;
859
860 read_lock(&kvm->mmu_lock);
861
862 for_each_tdp_mmu_root_yield_safe(kvm, root) {
863 if (!root->tdp_mmu_scheduled_root_to_zap)
864 continue;
865
866 root->tdp_mmu_scheduled_root_to_zap = false;
867 KVM_BUG_ON(!root->role.invalid, kvm);
868
869 /*
870 * A TLB flush is not necessary as KVM performs a local TLB
871 * flush when allocating a new root (see kvm_mmu_load()), and
872 * when migrating a vCPU to a different pCPU. Note, the local
873 * TLB flush on reuse also invalidates paging-structure-cache
874 * entries, i.e. TLB entries for intermediate paging structures,
875 * that may be zapped, as such entries are associated with the
876 * ASID on both VMX and SVM.
877 */
878 tdp_mmu_zap_root(kvm, root, true);
879
880 /*
 * The reference needs to be put *after* zapping the root, as
 * the root must be reachable by mmu_notifiers while it's being
 * zapped.
884 */
885 kvm_tdp_mmu_put_root(kvm, root);
886 }
887
888 read_unlock(&kvm->mmu_lock);
889 }
890
891 /*
892 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
893 * is about to be zapped, e.g. in response to a memslots update. The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
895 * whereas invalidating roots must be done with mmu_lock held for write (unless
896 * the VM is being destroyed).
897 *
898 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
899 * See kvm_tdp_mmu_get_vcpu_root_hpa().
900 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
902 {
903 struct kvm_mmu_page *root;
904
905 /*
906 * mmu_lock must be held for write to ensure that a root doesn't become
907 * invalid while there are active readers (invalidating a root while
908 * there are active readers may or may not be problematic in practice,
909 * but it's uncharted territory and not supported).
910 *
911 * Waive the assertion if there are no users of @kvm, i.e. the VM is
912 * being destroyed after all references have been put, or if no vCPUs
913 * have been created (which means there are no roots), i.e. the VM is
914 * being destroyed in an error path of KVM_CREATE_VM.
915 */
916 if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
917 refcount_read(&kvm->users_count) && kvm->created_vcpus)
918 lockdep_assert_held_write(&kvm->mmu_lock);
919
920 /*
921 * As above, mmu_lock isn't held when destroying the VM! There can't
922 * be other references to @kvm, i.e. nothing else can invalidate roots
923 * or get/put references to roots.
924 */
925 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
926 /*
927 * Note, invalid roots can outlive a memslot update! Invalid
928 * roots must be *zapped* before the memslot update completes,
929 * but a different task can acquire a reference and keep the
 * root alive after it's been zapped.
931 */
932 if (!root->role.invalid) {
933 root->tdp_mmu_scheduled_root_to_zap = true;
934 root->role.invalid = true;
935 }
936 }
937 }
938
939 /*
940 * Installs a last-level SPTE to handle a TDP page fault.
941 * (NPT/EPT violation/misconfiguration)
942 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
946 {
947 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
948 u64 new_spte;
949 int ret = RET_PF_FIXED;
950 bool wrprot = false;
951
952 if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
953 return RET_PF_RETRY;
954
955 if (unlikely(!fault->slot))
956 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
957 else
958 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
959 fault->pfn, iter->old_spte, fault->prefetch, true,
960 fault->map_writable, &new_spte);
961
962 if (new_spte == iter->old_spte)
963 ret = RET_PF_SPURIOUS;
964 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
965 return RET_PF_RETRY;
966 else if (is_shadow_present_pte(iter->old_spte) &&
967 !is_last_spte(iter->old_spte, iter->level))
968 kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
969
970 /*
971 * If the page fault was caused by a write but the page is write
972 * protected, emulation is needed. If the emulation was skipped,
973 * the vCPU would have the same fault again.
974 */
975 if (wrprot) {
976 if (fault->write)
977 ret = RET_PF_EMULATE;
978 }
979
980 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
981 if (unlikely(is_mmio_spte(new_spte))) {
982 vcpu->stat.pf_mmio_spte_created++;
983 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
984 new_spte);
985 ret = RET_PF_EMULATE;
986 } else {
987 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
988 rcu_dereference(iter->sptep));
989 }
990
991 return ret;
992 }
993
994 /*
995 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
996 * provided page table.
997 *
998 * @kvm: kvm instance
999 * @iter: a tdp_iter instance currently on the SPTE that should be set
1000 * @sp: The new TDP page table to install.
1001 * @shared: This operation is running under the MMU lock in read mode.
1002 *
1003 * Returns: 0 if the new page table was installed. Non-0 if the page table
1004 * could not be installed (e.g. the atomic compare-exchange failed).
1005 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
1008 {
1009 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1010 int ret = 0;
1011
1012 if (shared) {
1013 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1014 if (ret)
1015 return ret;
1016 } else {
1017 tdp_mmu_iter_set_spte(kvm, iter, spte);
1018 }
1019
1020 tdp_account_mmu_page(kvm, sp);
1021
1022 return 0;
1023 }
1024
1025 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1026 struct kvm_mmu_page *sp, bool shared);
1027
1028 /*
1029 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1030 * page tables and SPTEs to translate the faulting guest physical address.
1031 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1033 {
1034 struct kvm_mmu *mmu = vcpu->arch.mmu;
1035 struct kvm *kvm = vcpu->kvm;
1036 struct tdp_iter iter;
1037 struct kvm_mmu_page *sp;
1038 int ret = RET_PF_RETRY;
1039
1040 kvm_mmu_hugepage_adjust(vcpu, fault);
1041
1042 trace_kvm_mmu_spte_requested(fault);
1043
1044 rcu_read_lock();
1045
1046 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1047 int r;
1048
1049 if (fault->nx_huge_page_workaround_enabled)
1050 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1051
1052 /*
1053 * If SPTE has been frozen by another thread, just give up and
1054 * retry, avoiding unnecessary page table allocation and free.
1055 */
1056 if (is_removed_spte(iter.old_spte))
1057 goto retry;
1058
1059 if (iter.level == fault->goal_level)
1060 goto map_target_level;
1061
1062 /* Step down into the lower level page table if it exists. */
1063 if (is_shadow_present_pte(iter.old_spte) &&
1064 !is_large_pte(iter.old_spte))
1065 continue;
1066
1067 /*
1068 * The SPTE is either non-present or points to a huge page that
1069 * needs to be split.
1070 */
1071 sp = tdp_mmu_alloc_sp(vcpu);
1072 tdp_mmu_init_child_sp(sp, &iter);
1073
1074 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1075
1076 if (is_shadow_present_pte(iter.old_spte))
1077 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1078 else
1079 r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1080
1081 /*
1082 * Force the guest to retry if installing an upper level SPTE
1083 * failed, e.g. because a different task modified the SPTE.
1084 */
1085 if (r) {
1086 tdp_mmu_free_sp(sp);
1087 goto retry;
1088 }
1089
1090 if (fault->huge_page_disallowed &&
1091 fault->req_level >= iter.level) {
1092 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1093 if (sp->nx_huge_page_disallowed)
1094 track_possible_nx_huge_page(kvm, sp);
1095 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1096 }
1097 }
1098
1099 /*
1100 * The walk aborted before reaching the target level, e.g. because the
1101 * iterator detected an upper level SPTE was frozen during traversal.
1102 */
1103 WARN_ON_ONCE(iter.level == fault->goal_level);
1104 goto retry;
1105
1106 map_target_level:
1107 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1108
1109 retry:
1110 rcu_read_unlock();
1111 return ret;
1112 }
1113
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
1116 {
1117 struct kvm_mmu_page *root;
1118
1119 __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
1120 flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1121 range->may_block, flush);
1122
1123 return flush;
1124 }
1125
1126 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1127 struct kvm_gfn_range *range);
1128
static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
1132 {
1133 struct kvm_mmu_page *root;
1134 struct tdp_iter iter;
1135 bool ret = false;
1136
1137 /*
1138 * Don't support rescheduling, none of the MMU notifiers that funnel
1139 * into this helper allow blocking; it'd be dead, wasteful code.
1140 */
1141 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1142 rcu_read_lock();
1143
1144 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1145 ret |= handler(kvm, &iter, range);
1146
1147 rcu_read_unlock();
1148 }
1149
1150 return ret;
1151 }
1152
1153 /*
1154 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1155 * if any of the GFNs in the range have been accessed.
1156 *
1157 * No need to mark the corresponding PFN as accessed as this call is coming
1158 * from the clear_young() or clear_flush_young() notifier, which uses the
1159 * return value to determine if the page has been accessed.
1160 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
1163 {
1164 u64 new_spte;
1165
1166 /* If we have a non-accessed entry we don't need to change the pte. */
1167 if (!is_accessed_spte(iter->old_spte))
1168 return false;
1169
1170 if (spte_ad_enabled(iter->old_spte)) {
1171 iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1172 iter->old_spte,
1173 shadow_accessed_mask,
1174 iter->level);
1175 new_spte = iter->old_spte & ~shadow_accessed_mask;
1176 } else {
1177 /*
1178 * Capture the dirty status of the page, so that it doesn't get
1179 * lost when the SPTE is marked for access tracking.
1180 */
1181 if (is_writable_pte(iter->old_spte))
1182 kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
1183
1184 new_spte = mark_spte_for_access_track(iter->old_spte);
1185 iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1186 iter->old_spte, new_spte,
1187 iter->level);
1188 }
1189
1190 trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1191 iter->old_spte, new_spte);
1192 return true;
1193 }
1194
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1196 {
1197 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1198 }
1199
static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
1202 {
1203 return is_accessed_spte(iter->old_spte);
1204 }
1205
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1207 {
1208 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1209 }
1210
static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
1213 {
1214 u64 new_spte;
1215
1216 /* Huge pages aren't expected to be modified without first being zapped. */
1217 WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
1218
1219 if (iter->level != PG_LEVEL_4K ||
1220 !is_shadow_present_pte(iter->old_spte))
1221 return false;
1222
1223 /*
1224 * Note, when changing a read-only SPTE, it's not strictly necessary to
1225 * zero the SPTE before setting the new PFN, but doing so preserves the
 * invariant that the PFN of a present leaf SPTE can never change.
1227 * See handle_changed_spte().
1228 */
1229 tdp_mmu_iter_set_spte(kvm, iter, 0);
1230
1231 if (!pte_write(range->arg.pte)) {
1232 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1233 pte_pfn(range->arg.pte));
1234
1235 tdp_mmu_iter_set_spte(kvm, iter, new_spte);
1236 }
1237
1238 return true;
1239 }
1240
1241 /*
1242 * Handle the changed_pte MMU notifier for the TDP MMU.
1243 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1244 * notifier.
1245 * Returns non-zero if a flush is needed before releasing the MMU lock.
1246 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1248 {
1249 /*
1250 * No need to handle the remote TLB flush under RCU protection, the
1251 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1252 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
1253 */
1254 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1255 }
1256
1257 /*
1258 * Remove write access from all SPTEs at or above min_level that map GFNs
1259 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1260 * be flushed.
1261 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
1264 {
1265 struct tdp_iter iter;
1266 u64 new_spte;
1267 bool spte_set = false;
1268
1269 rcu_read_lock();
1270
1271 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1272
1273 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1274 retry:
1275 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1276 continue;
1277
1278 if (!is_shadow_present_pte(iter.old_spte) ||
1279 !is_last_spte(iter.old_spte, iter.level) ||
1280 !(iter.old_spte & PT_WRITABLE_MASK))
1281 continue;
1282
1283 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1284
1285 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1286 goto retry;
1287
1288 spte_set = true;
1289 }
1290
1291 rcu_read_unlock();
1292 return spte_set;
1293 }
1294
1295 /*
1296 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1297 * only affect leaf SPTEs down to min_level.
1298 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1299 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
1302 {
1303 struct kvm_mmu_page *root;
1304 bool spte_set = false;
1305
1306 lockdep_assert_held_read(&kvm->mmu_lock);
1307
1308 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1309 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1310 slot->base_gfn + slot->npages, min_level);
1311
1312 return spte_set;
1313 }
1314
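/*
 * Allocate a shadow page for eager splitting straight from the slab and page
 * allocators rather than from the vCPU caches, which aren't available here;
 * @gfp controls whether the allocation is allowed to block.
 */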
static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1316 {
1317 struct kvm_mmu_page *sp;
1318
1319 gfp |= __GFP_ZERO;
1320
1321 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1322 if (!sp)
1323 return NULL;
1324
1325 sp->spt = (void *)__get_free_page(gfp);
1326 if (!sp->spt) {
1327 kmem_cache_free(mmu_page_header_cache, sp);
1328 return NULL;
1329 }
1330
1331 return sp;
1332 }
1333
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
							struct tdp_iter *iter,
							bool shared)
1337 {
1338 struct kvm_mmu_page *sp;
1339
1340 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1341
1342 /*
1343 * Since we are allocating while under the MMU lock we have to be
1344 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1345 * reclaim and to avoid making any filesystem callbacks (which can end
1346 * up invoking KVM MMU notifiers, resulting in a deadlock).
1347 *
1348 * If this allocation fails we drop the lock and retry with reclaim
1349 * allowed.
1350 */
1351 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1352 if (sp)
1353 return sp;
1354
1355 rcu_read_unlock();
1356
1357 if (shared)
1358 read_unlock(&kvm->mmu_lock);
1359 else
1360 write_unlock(&kvm->mmu_lock);
1361
1362 iter->yielded = true;
1363 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1364
1365 if (shared)
1366 read_lock(&kvm->mmu_lock);
1367 else
1368 write_lock(&kvm->mmu_lock);
1369
1370 rcu_read_lock();
1371
1372 return sp;
1373 }
1374
1375 /* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
1378 {
1379 const u64 huge_spte = iter->old_spte;
1380 const int level = iter->level;
1381 int ret, i;
1382
1383 /*
1384 * No need for atomics when writing to sp->spt since the page table has
1385 * not been linked in yet and thus is not reachable from any other CPU.
1386 */
1387 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1388 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1389
1390 /*
1391 * Replace the huge spte with a pointer to the populated lower level
1392 * page table. Since we are making this change without a TLB flush vCPUs
1393 * will see a mix of the split mappings and the original huge mapping,
1394 * depending on what's currently in their TLB. This is fine from a
1395 * correctness standpoint since the translation will be the same either
1396 * way.
1397 */
1398 ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1399 if (ret)
1400 goto out;
1401
1402 /*
 * tdp_mmu_link_sp() will handle subtracting the huge page we
1404 * are overwriting from the page stats. But we have to manually update
1405 * the page stats with the new present child pages.
1406 */
1407 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1408
1409 out:
1410 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1411 return ret;
1412 }
1413
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
1418 {
1419 struct kvm_mmu_page *sp = NULL;
1420 struct tdp_iter iter;
1421 int ret = 0;
1422
1423 rcu_read_lock();
1424
1425 /*
1426 * Traverse the page table splitting all huge pages above the target
1427 * level into one lower level. For example, if we encounter a 1GB page
1428 * we split it into 512 2MB pages.
1429 *
1430 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1431 * to visit an SPTE before ever visiting its children, which means we
1432 * will correctly recursively split huge pages that are more than one
1433 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1434 * and then splitting each of those to 512 4KB pages).
1435 */
1436 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1437 retry:
1438 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1439 continue;
1440
1441 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1442 continue;
1443
1444 if (!sp) {
1445 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1446 if (!sp) {
1447 ret = -ENOMEM;
1448 trace_kvm_mmu_split_huge_page(iter.gfn,
1449 iter.old_spte,
1450 iter.level, ret);
1451 break;
1452 }
1453
1454 if (iter.yielded)
1455 continue;
1456 }
1457
1458 tdp_mmu_init_child_sp(sp, &iter);
1459
1460 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1461 goto retry;
1462
1463 sp = NULL;
1464 }
1465
1466 rcu_read_unlock();
1467
1468 /*
1469 * It's possible to exit the loop having never used the last sp if, for
1470 * example, a vCPU doing HugePage NX splitting wins the race and
1471 * installs its own sp in place of the last sp we tried to split.
1472 */
1473 if (sp)
1474 tdp_mmu_free_sp(sp);
1475
1476 return ret;
1477 }
1478
1479
1480 /*
1481 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1482 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
1487 {
1488 struct kvm_mmu_page *root;
1489 int r = 0;
1490
1491 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1492 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1493 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1494 if (r) {
1495 kvm_tdp_mmu_put_root(kvm, root);
1496 break;
1497 }
1498 }
1499 }
1500
1501 /*
1502 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1503 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1504 * If AD bits are not enabled, this will require clearing the writable bit on
1505 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1506 * be flushed.
1507 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
1510 {
1511 u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
1512 struct tdp_iter iter;
1513 bool spte_set = false;
1514
1515 rcu_read_lock();
1516
1517 tdp_root_for_each_pte(iter, root, start, end) {
1518 retry:
1519 if (!is_shadow_present_pte(iter.old_spte) ||
1520 !is_last_spte(iter.old_spte, iter.level))
1521 continue;
1522
1523 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1524 continue;
1525
1526 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1527 spte_ad_need_write_protect(iter.old_spte));
1528
1529 if (!(iter.old_spte & dbit))
1530 continue;
1531
1532 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1533 goto retry;
1534
1535 spte_set = true;
1536 }
1537
1538 rcu_read_unlock();
1539 return spte_set;
1540 }
1541
1542 /*
1543 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1544 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1545 * If AD bits are not enabled, this will require clearing the writable bit on
1546 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1547 * be flushed.
1548 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
1551 {
1552 struct kvm_mmu_page *root;
1553 bool spte_set = false;
1554
1555 lockdep_assert_held_read(&kvm->mmu_lock);
1556 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1557 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1558 slot->base_gfn + slot->npages);
1559
1560 return spte_set;
1561 }
1562
1563 /*
1564 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1565 * set in mask, starting at gfn. The given memslot is expected to contain all
1566 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1567 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1568 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1569 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
1572 {
1573 u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1574 shadow_dirty_mask;
1575 struct tdp_iter iter;
1576
1577 lockdep_assert_held_write(&kvm->mmu_lock);
1578
1579 rcu_read_lock();
1580
	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
				spte_ad_need_write_protect(iter.old_spte));

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (!(iter.old_spte & dbit))
			continue;

		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
							iter.old_spte, dbit,
							iter.level);

		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
					       iter.old_spte,
					       iter.old_spte & ~dbit);
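		/*
		 * The SPTE was dirty (or writable, when dirty state is
		 * tracked via write-protection), i.e. the guest may have
		 * written the page, so propagate that to the backing pfn.
		 */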
		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

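	/*
	 * Note, this path runs with mmu_lock held for write (see the lockdep
	 * assertion in clear_dirty_pt_masked()), unlike
	 * kvm_tdp_mmu_clear_dirty_slot(), which runs under the read lock.
	 */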
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge. More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

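		/*
		 * Zap the non-leaf SPTE only if the range can actually be
		 * mapped at iter.level or above, per
		 * kvm_mmu_max_mapping_level(); otherwise the resulting faults
		 * would simply rebuild the existing page tables.
		 */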
		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
							      iter.gfn, PG_LEVEL_NUM);
		if (max_mapping_level < iter.level)
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

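	/*
	 * Walk every mapping of the single gfn at or above min_level; only
	 * the leaf SPTE that actually maps the gfn is write-protected,
	 * non-leaf and non-present entries are skipped.
	 */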
	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

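	/*
	 * sptes[] is indexed by page table level; entries from *root_level
	 * down to the returned (lowest reached) level are filled, e.g.
	 * sptes[PG_LEVEL_4K] holds the leaf SPTE when the walk reaches a 4K
	 * mapping. Levels below the lowest reached are left untouched.
	 */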
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last-level SPTE pointer of the shadow page walk for the given
 * gpa, and sets *spte to the SPTE value. This SPTE may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

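	/*
	 * The walk simply remembers the last entry visited, i.e. the lowest
	 * level SPTE reached for the gfn, which may be a non-present entry
	 * in an intermediate page table.
	 */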
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}