xref: /linux/arch/s390/kvm/gmap.c (revision 086aca1030cff9e4729785bcfafaf4b8c489a892)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Guest memory management for KVM/s390
4  *
5  * Copyright IBM Corp. 2008, 2020, 2024
6  *
7  *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
8  *               Martin Schwidefsky <schwidefsky@de.ibm.com>
9  *               David Hildenbrand <david@redhat.com>
10  *               Janosch Frank <frankja@linux.ibm.com>
11  */
12 
13 #include <linux/compiler.h>
14 #include <linux/kvm.h>
15 #include <linux/kvm_host.h>
16 #include <linux/pgtable.h>
17 #include <linux/pagemap.h>
18 #include <asm/lowcore.h>
19 #include <asm/uv.h>
20 #include <asm/gmap_helpers.h>
21 
22 #include "dat.h"
23 #include "gmap.h"
24 #include "kvm-s390.h"
25 #include "faultin.h"
26 
27 static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
28 {
29 	return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
30 }
31 
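/*
 * Map an address space limit (in guest frames) to the smallest DAT table
 * type that can cover it. A limit of 0 means "no limit" and selects a
 * region-first table.
 */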
32 static int gmap_limit_to_type(gfn_t limit)
33 {
34 	if (!limit)
35 		return TABLE_TYPE_REGION1;
36 	if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
37 		return TABLE_TYPE_SEGMENT;
38 	if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
39 		return TABLE_TYPE_REGION3;
40 	if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
41 		return TABLE_TYPE_REGION2;
42 	return TABLE_TYPE_REGION1;
43 }
44 
45 /**
46  * gmap_new() - Allocate and initialize a guest address space.
47  * @kvm: The kvm owning the guest.
48  * @limit: Maximum address of the gmap address space.
49  *
50  * Return: A guest address space structure.
51  */
52 struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
53 {
54 	struct crst_table *table;
55 	struct gmap *gmap;
56 	int type;
57 
58 	type = gmap_limit_to_type(limit);
59 
60 	gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
61 	if (!gmap)
62 		return NULL;
63 	INIT_LIST_HEAD(&gmap->children);
64 	INIT_LIST_HEAD(&gmap->list);
65 	INIT_LIST_HEAD(&gmap->scb_users);
66 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
67 	spin_lock_init(&gmap->children_lock);
68 	spin_lock_init(&gmap->host_to_rmap_lock);
69 	refcount_set(&gmap->refcount, 1);
70 
71 	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
72 	if (!table) {
73 		kfree(gmap);
74 		return NULL;
75 	}
76 
77 	gmap->asce.val = __pa(table);
78 	gmap->asce.dt = type;
79 	gmap->asce.tl = _ASCE_TABLE_LENGTH;
80 	gmap->asce.x = 1;
81 	gmap->asce.p = 1;
82 	gmap->asce.s = 1;
83 	gmap->kvm = kvm;
84 	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
85 
86 	return gmap;
87 }
88 
89 static void gmap_add_child(struct gmap *parent, struct gmap *child)
90 {
91 	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
92 	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
93 	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
94 	lockdep_assert_held(&parent->children_lock);
95 
96 	child->parent = parent;
97 
98 	if (is_ucontrol(parent))
99 		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
100 	else
101 		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
102 
103 	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
104 		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
105 	else
106 		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
107 
108 	if (kvm_is_ucontrol(parent->kvm))
109 		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
110 	list_add(&child->list, &parent->children);
111 }
112 
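/**
 * gmap_new_child() - Allocate a new gmap and register it as a child.
 * @parent: The parent gmap.
 * @limit: Maximum address (as a gfn) of the new gmap address space.
 *
 * Return: The new child gmap, or NULL if out of memory.
 */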
113 struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
114 {
115 	struct gmap *res;
116 
117 	lockdep_assert_not_held(&parent->children_lock);
118 	res = gmap_new(parent->kvm, limit);
119 	if (res) {
120 		scoped_guard(spinlock, &parent->children_lock)
121 			gmap_add_child(parent, res);
122 	}
123 	return res;
124 }
125 
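/**
 * gmap_set_limit() - Change the address space limit of an existing gmap.
 * @gmap: The gmap to act on.
 * @limit: The new maximum address (as a gfn), 0 for no limit.
 *
 * Return: 0 in case of success, a negative error code otherwise.
 */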
126 int gmap_set_limit(struct gmap *gmap, gfn_t limit)
127 {
128 	struct kvm_s390_mmu_cache *mc;
129 	int rc, type;
130 
131 	type = gmap_limit_to_type(limit);
132 
133 	mc = kvm_s390_new_mmu_cache();
134 	if (!mc)
135 		return -ENOMEM;
136 
137 	do {
138 		rc = kvm_s390_mmu_cache_topup(mc);
139 		if (rc)
140 			break;
141 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
142 			rc = dat_set_asce_limit(mc, &gmap->asce, type);
143 	} while (rc == -ENOMEM);
144 
145 	kvm_s390_free_mmu_cache(mc);
146 	return rc;
147 }
148 
149 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
150 {
151 	struct vsie_rmap *rmap, *rnext, *head;
152 	struct radix_tree_iter iter;
153 	unsigned long indices[16];
154 	unsigned long index;
155 	void __rcu **slot;
156 	int i, nr;
157 
158 	/* A radix tree is freed by deleting all of its entries */
159 	index = 0;
160 	do {
161 		nr = 0;
162 		radix_tree_for_each_slot(slot, root, &iter, index) {
163 			indices[nr] = iter.index;
164 			if (++nr == 16)
165 				break;
166 		}
167 		for (i = 0; i < nr; i++) {
168 			index = indices[i];
169 			head = radix_tree_delete(root, index);
170 			gmap_for_each_rmap_safe(rmap, rnext, head)
171 				kfree(rmap);
172 		}
173 	} while (nr > 0);
174 }
175 
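/**
 * gmap_remove_child() - Unlink a child gmap from its parent.
 * @child: The child gmap to remove.
 *
 * The child is marked as invalidated; it is not freed.
 *
 * Context: Called with the parent's children_lock held.
 */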
176 void gmap_remove_child(struct gmap *child)
177 {
178 	if (KVM_BUG_ON(!child->parent, child->kvm))
179 		return;
180 	lockdep_assert_held(&child->parent->children_lock);
181 
182 	list_del(&child->list);
183 	child->parent = NULL;
184 	child->invalidated = true;
185 }
186 
187 /**
188  * gmap_dispose() - Free a guest address space and all its DAT tables.
189  * @gmap: Pointer to the guest address space structure.
190  */
191 void gmap_dispose(struct gmap *gmap)
192 {
193 	/* The gmap must have been removed from the parent beforehand */
194 	KVM_BUG_ON(gmap->parent, gmap->kvm);
195 	/* All children of this gmap must have been removed beforehand */
196 	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
197 	/* No VSIE shadow block is allowed to use this gmap */
198 	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
199 	/* The ASCE must be valid */
200 	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
201 	/* The refcount must be 0 */
202 	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
203 
204 	/* Flush tlb of all gmaps */
205 	asce_flush_tlb(gmap->asce);
206 
207 	/* Free all DAT tables. */
208 	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
209 
210 	/* Free additional data for a shadow gmap */
211 	if (is_shadow(gmap))
212 		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
213 
214 	kfree(gmap);
215 }
216 
217 /**
218  * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
219  * @gmap: The gmap whose ASCE needs to be replaced.
220  *
221  * If the ASCE is a SEGMENT type ASCE, this function returns -EINVAL;
222  * replacing it would leave the pointers in the host_to_guest radix tree
223  * pointing to the wrong pages, causing use-after-free and memory corruption.
224  * If the allocation of the new top level page table fails, the ASCE is not
225  * replaced.
226  * In any case, the old ASCE is always removed from the gmap CRST list.
227  * Therefore the caller has to make sure to save a pointer to it
228  * beforehand, unless a leak is actually intended.
229  *
230  * Return: 0 in case of success, -EINVAL if the ASCE is a segment type ASCE,
231  *         -ENOMEM if running out of memory.
232  */
233 int s390_replace_asce(struct gmap *gmap)
234 {
235 	struct crst_table *table;
236 	union asce asce;
237 
238 	/* Replacing segment type ASCEs would cause serious issues */
239 	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
240 		return -EINVAL;
241 
242 	table = dat_alloc_crst_sleepable(0);
243 	if (!table)
244 		return -ENOMEM;
245 	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));
246 
247 	/* Set new table origin while preserving existing ASCE control bits */
248 	asce = gmap->asce;
249 	asce.rsto = virt_to_pfn(table);
250 	WRITE_ONCE(gmap->asce, asce);
251 
252 	return 0;
253 }
254 
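/*
 * Check whether the given gfn range overlaps the prefix pages of any vCPU
 * and, if so, request a prefix refresh for the affected vCPUs.
 *
 * Returns false if the gmap is a shadow, or if @hint is set and an affected
 * vCPU is currently running in SIE; returns true otherwise.
 */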
255 bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
256 {
257 	struct kvm *kvm = gmap->kvm;
258 	struct kvm_vcpu *vcpu;
259 	gfn_t prefix_gfn;
260 	unsigned long i;
261 
262 	if (is_shadow(gmap))
263 		return false;
264 	kvm_for_each_vcpu(i, vcpu, kvm) {
265 		/* Match against both prefix pages */
266 		prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
267 		if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
268 			if (hint && kvm_s390_is_in_sie(vcpu))
269 				return false;
270 			VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
271 				   gfn_to_gpa(gfn), gfn_to_gpa(end));
272 			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
273 		}
274 	}
275 	return true;
276 }
277 
278 struct clear_young_pte_priv {
279 	struct gmap *gmap;
280 	bool young;
281 };
282 
283 static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
284 {
285 	struct clear_young_pte_priv *p = walk->priv;
286 	union pgste pgste;
287 	union pte pte, new;
288 
289 	pte = READ_ONCE(*ptep);
290 
291 	if (!pte.s.pr || (!pte.s.y && pte.h.i))
292 		return 0;
293 
294 	pgste = pgste_get_lock(ptep);
295 	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
296 		new = pte;
297 		new.h.i = 1;
298 		new.s.y = 0;
299 		if ((new.s.d || !new.h.p) && !new.s.s)
300 			folio_set_dirty(pfn_folio(pte.h.pfra));
301 		new.s.d = 0;
302 		new.h.p = 1;
303 
304 		pgste.prefix_notif = 0;
305 		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
306 	}
307 	p->young = 1;
308 	pgste_set_unlock(ptep, pgste);
309 	return 0;
310 }
311 
312 static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
313 {
314 	struct clear_young_pte_priv *priv = walk->priv;
315 	union crste crste, new;
316 
317 	do {
318 		crste = READ_ONCE(*crstep);
319 
320 		if (!crste.h.fc)
321 			return 0;
322 		if (!crste.s.fc1.y && crste.h.i)
323 			return 0;
324 		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
325 			break;
326 
327 		new = crste;
328 		new.h.i = 1;
329 		new.s.fc1.y = 0;
330 		new.s.fc1.prefix_notif = 0;
331 		if (new.s.fc1.d || !new.h.p)
332 			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
333 		new.s.fc1.d = 0;
334 		new.h.p = 1;
335 	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
336 
337 	priv->young = 1;
338 	return 0;
339 }
340 
341 /**
342  * gmap_age_gfn() - Clear the young (referenced) state for a range of guest frames.
343  * @gmap: The guest gmap.
344  * @start: The first gfn to test.
345  * @end: The gfn after the last one to test.
346  *
347  * Context: Called with the kvm mmu write lock held.
348  * Return: 1 if any page in the given range was young, otherwise 0.
349  */
350 bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
351 {
352 	const struct dat_walk_ops ops = {
353 		.pte_entry = gmap_clear_young_pte,
354 		.pmd_entry = gmap_clear_young_crste,
355 		.pud_entry = gmap_clear_young_crste,
356 	};
357 	struct clear_young_pte_priv priv = {
358 		.gmap = gmap,
359 		.young = false,
360 	};
361 
362 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
363 
364 	return priv.young;
365 }
366 
367 struct gmap_unmap_priv {
368 	struct gmap *gmap;
369 	struct kvm_memory_slot *slot;
370 };
371 
372 static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
373 {
374 	struct gmap_unmap_priv *priv = w->priv;
375 	struct folio *folio = NULL;
376 	unsigned long vmaddr;
377 	union pgste pgste;
378 
379 	pgste = pgste_get_lock(ptep);
380 	if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
381 		vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
382 		gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
383 	}
384 	if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
385 		folio = pfn_folio(ptep->h.pfra);
386 	pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
387 	pgste_set_unlock(ptep, pgste);
388 	if (folio)
389 		uv_convert_from_secure_folio(folio);
390 
391 	return 0;
392 }
393 
394 static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
395 {
396 	struct gmap_unmap_priv *priv = walk->priv;
397 	struct folio *folio = NULL;
398 	union crste old = *crstep;
399 
400 	if (!old.h.fc)
401 		return 0;
402 
403 	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
404 		folio = phys_to_folio(crste_origin_large(old));
405 	/* No races should happen because kvm->mmu_lock is held in write mode */
406 	KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
407 		   priv->gmap->kvm);
408 	if (folio)
409 		uv_convert_from_secure_folio(folio);
410 
411 	return 0;
412 }
413 
414 /**
415  * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
416  * @gmap: The gmap to act on.
417  * @slot: The memslot in which the range is located.
418  * @start: The first gfn to unmap.
419  * @end: The gfn after the last one to unmap.
420  *
421  * Context: Called with the kvm mmu write lock held.
422  * Return: false
423  */
424 bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
425 {
426 	const struct dat_walk_ops ops = {
427 		.pte_entry = _gmap_unmap_pte,
428 		.pmd_entry = _gmap_unmap_crste,
429 		.pud_entry = _gmap_unmap_crste,
430 	};
431 	struct gmap_unmap_priv priv = {
432 		.gmap = gmap,
433 		.slot = slot,
434 	};
435 
436 	lockdep_assert_held_write(&gmap->kvm->mmu_lock);
437 
438 	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
439 	return false;
440 }
441 
442 static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
443 						  struct gmap *gmap)
444 {
445 	union pte pte = READ_ONCE(*ptep);
446 
447 	if (!pte.s.pr || (pte.h.p && !pte.s.sd))
448 		return pgste;
449 
450 	/*
451 	 * If this page contains one or more prefixes of vCPUs that are currently
452 	 * running, do not reset the protection, leave it marked as dirty.
453 	 */
454 	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
455 		pte.h.p = 1;
456 		pte.s.sd = 0;
457 		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
458 	}
459 
460 	mark_page_dirty(gmap->kvm, gfn);
461 
462 	return pgste;
463 }
464 
465 static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
466 					  struct dat_walk *walk)
467 {
468 	struct gmap *gmap = walk->priv;
469 	union pgste pgste;
470 
471 	pgste = pgste_get_lock(ptep);
472 	pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
473 	pgste_set_unlock(ptep, pgste);
474 	return 0;
475 }
476 
477 static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
478 					    struct dat_walk *walk)
479 {
480 	struct gmap *gmap = walk->priv;
481 	union crste crste, new;
482 
483 	if (fatal_signal_pending(current))
484 		return 1;
485 	do {
486 		crste = READ_ONCE(*table);
487 		if (!crste.h.fc)
488 			return 0;
489 		if (crste.h.p && !crste.s.fc1.sd)
490 			return 0;
491 
492 		/*
493 		 * If this large page contains one or more prefixes of vCPUs that are
494 		 * currently running, do not reset the protection, leave it marked as
495 		 * dirty.
496 		 */
497 		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
498 			break;
499 		new = crste;
500 		new.h.p = 1;
501 		new.s.fc1.sd = 0;
502 	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
503 
504 	for ( ; gfn < end; gfn++)
505 		mark_page_dirty(gmap->kvm, gfn);
506 
507 	return 0;
508 }
509 
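/**
 * gmap_sync_dirty_log() - Harvest dirty state for a range of guest frames.
 * @gmap: The gmap to act on.
 * @start: The first gfn to synchronize.
 * @end: The gfn after the last one to synchronize.
 *
 * Transfer the software-dirty state of the mappings in the given range into
 * the KVM dirty log and re-enable write protection where possible.
 *
 * Context: Called with the kvm mmu lock held.
 */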
510 void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
511 {
512 	const struct dat_walk_ops walk_ops = {
513 		.pte_entry = _pte_test_and_clear_softdirty,
514 		.pmd_entry = _crste_test_and_clear_softdirty,
515 		.pud_entry = _crste_test_and_clear_softdirty,
516 	};
517 
518 	lockdep_assert_held(&gmap->kvm->mmu_lock);
519 
520 	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
521 }
522 
523 static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
524 {
525 	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
526 
527 	/* Somehow the crste is not large anymore, let the slow path deal with it. */
528 	if (!oldcrste.h.fc)
529 		return 1;
530 
531 	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
532 	f->writable = oldcrste.s.fc1.w;
533 
534 	/* Appropriate permissions already (race with another handler), nothing to do. */
535 	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
536 		return 0;
537 
538 	if (!f->write_attempt || oldcrste.s.fc1.w) {
539 		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
540 		newcrste = oldcrste;
541 		newcrste.h.i = 0;
542 		newcrste.s.fc1.y = 1;
543 		if (f->write_attempt) {
544 			newcrste.h.p = 0;
545 			newcrste.s.fc1.d = 1;
546 			newcrste.s.fc1.sd = 1;
547 		}
548 		/* In case of races, let the slow path deal with it. */
549 		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
550 	}
551 	/* Trying to write on a read-only page, let the slow path deal with it. */
552 	return 1;
553 }
554 
555 static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
556 					struct guest_fault *f)
557 {
558 	union pte newpte, oldpte = READ_ONCE(*f->ptep);
559 
560 	f->pfn = oldpte.h.pfra;
561 	f->writable = oldpte.s.w;
562 
563 	/* Appropriate permissions already (race with another handler), nothing to do. */
564 	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
565 		return 0;
566 	/* Trying to write on a read-only page, let the slow path deal with it. */
567 	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
568 		return 1;
569 
570 	newpte = oldpte;
571 	newpte.h.i = 0;
572 	newpte.s.y = 1;
573 	if (f->write_attempt) {
574 		newpte.h.p = 0;
575 		newpte.s.d = 1;
576 		newpte.s.sd = 1;
577 	}
578 	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
579 
580 	return 0;
581 }
582 
583 /**
584  * gmap_try_fixup_minor() - Try to fix up a minor gmap fault.
585  * @gmap: The gmap whose fault needs to be resolved.
586  * @fault: Describes the fault that is being resolved.
587  *
588  * A minor fault is a fault that can be resolved quickly within gmap.
589  * The page is already mapped, the fault is only due to dirty/young tracking.
590  *
591  * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
592  *         not be resolved and needs to go through the slow path.
593  */
594 int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
595 {
596 	union pgste pgste;
597 	int rc;
598 
599 	lockdep_assert_held(&gmap->kvm->mmu_lock);
600 
601 	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
602 			    &fault->crstep, &fault->ptep);
603 	/* If a PTE or a leaf CRSTE could not be reached, slow path. */
604 	if (rc)
605 		return 1;
606 
607 	if (fault->ptep) {
608 		pgste = pgste_get_lock(fault->ptep);
609 		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
610 		if (!rc && fault->callback)
611 			fault->callback(fault);
612 		pgste_set_unlock(fault->ptep, pgste);
613 	} else {
614 		rc = gmap_handle_minor_crste_fault(gmap, fault);
615 		if (!rc && fault->callback)
616 			fault->callback(fault);
617 	}
618 	return rc;
619 }
620 
621 static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn)
622 {
623 	return false;
624 }
625 
626 static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
627 {
628 	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
629 }
630 
631 static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
632 		      struct guest_fault *f)
633 {
634 	union crste oldval, newval;
635 	union pte newpte, oldpte;
636 	union pgste pgste;
637 	int rc = 0;
638 
639 	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
640 			    &f->crstep, &f->ptep);
641 	if (rc == -ENOMEM)
642 		return rc;
643 	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
644 		return rc;
645 	if (rc)
646 		return -EAGAIN;
647 	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
648 		return -EINVAL;
649 
650 	if (f->ptep) {
651 		pgste = pgste_get_lock(f->ptep);
652 		oldpte = *f->ptep;
653 		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
654 		newpte.s.sd = oldpte.s.sd;
655 		oldpte.s.sd = 0;
656 		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
657 			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
658 			if (f->callback)
659 				f->callback(f);
660 		} else {
661 			rc = -EAGAIN;
662 		}
663 		pgste_set_unlock(f->ptep, pgste);
664 	} else {
665 		do {
666 			oldval = READ_ONCE(*f->crstep);
667 			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
668 					    f->write_attempt | oldval.s.fc1.d);
669 			newval.s.fc1.s = !f->page;
670 			newval.s.fc1.sd = oldval.s.fc1.sd;
671 			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
672 			    crste_origin_large(oldval) != crste_origin_large(newval))
673 				return -EAGAIN;
674 		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
675 		if (f->callback)
676 			f->callback(f);
677 	}
678 
679 	return rc;
680 }
681 
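/**
 * gmap_link() - Map a resolved guest fault into the gmap.
 * @mc: The memory cache to be used for allocations.
 * @gmap: The gmap to act on.
 * @f: The resolved guest fault to map.
 *
 * The mapping is created at the largest level allowed for the gmap and the
 * backing folio (4k page, 1M segment, or 2G region).
 *
 * Context: Called with the kvm mmu lock held.
 * Return: 0 in case of success, -EAGAIN if the caller has to retry,
 *         another negative error code otherwise.
 */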
682 int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
683 {
684 	unsigned int order;
685 	int level;
686 
687 	lockdep_assert_held(&gmap->kvm->mmu_lock);
688 
689 	level = TABLE_TYPE_PAGE_TABLE;
690 	if (f->page) {
691 		order = folio_order(page_folio(f->page));
692 		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f->gfn))
693 			level = TABLE_TYPE_REGION3;
694 		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
695 			level = TABLE_TYPE_SEGMENT;
696 	}
697 	return _gmap_link(mc, gmap, level, f);
698 }
699 
700 static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
701 			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
702 {
703 	union crste newcrste, oldcrste;
704 	struct page_table *pt;
705 	union crste *crstep;
706 	union pte *ptep;
707 	int rc;
708 
709 	if (force_alloc)
710 		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
711 				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
712 	else
713 		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
714 				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
715 	if (rc)
716 		return rc;
717 	if (!ptep) {
718 		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
719 		newcrste.h.i = 1;
720 		newcrste.h.fc0.tl = 1;
721 	} else {
722 		pt = pte_table_start(ptep);
723 		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
724 		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
725 	}
726 	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
727 			    &crstep, &ptep);
728 	if (rc)
729 		return rc;
730 	do {
731 		oldcrste = READ_ONCE(*crstep);
732 		if (oldcrste.val == newcrste.val)
733 			break;
734 	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
735 	return 0;
736 }
737 
738 static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
739 {
740 	union pte *ptep;
741 	int rc;
742 
743 	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
744 			    TABLE_TYPE_SEGMENT, crstepp, &ptep);
745 	if (rc || (!ptep && !crste_is_ucas(**crstepp)))
746 		return -EREMOTE;
747 	if (!ptep)
748 		return 1;
749 	*gaddr &= ~_SEGMENT_MASK;
750 	*gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
751 	return 0;
752 }
753 
754 /**
755  * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
756  * @mc: The memory cache to be used for allocations.
757  * @gmap: The per-vCPU gmap.
758  * @gaddr: Pointer to the address to be translated, will get overwritten with
759  *         the translated address in case of success.
760  * Translates the per-vCPU guest address into a fake guest address, which can
761  * then be used with the fake memslots that are identity mapping userspace.
762  * This allows ucontrol VMs to use the normal fault resolution path, like
763  * normal VMs.
764  *
765  * Return: %0 in case of success, %-EREMOTE if the address cannot be translated, another negative error code otherwise.
766  */
767 int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
768 {
769 	gpa_t translated_address;
770 	union crste *crstep;
771 	gfn_t gfn;
772 	int rc;
773 
774 	gfn = gpa_to_gfn(*gaddr);
775 
776 	scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
777 		rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
778 		if (rc <= 0)
779 			return rc;
780 	}
781 	do {
782 		scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
783 			rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
784 			if (rc <= 0)
785 				return rc;
786 			translated_address = (*gaddr & ~_SEGMENT_MASK) |
787 					     (crstep->val & _SEGMENT_MASK);
788 			rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
789 		}
790 		if (!rc) {
791 			*gaddr = translated_address;
792 			return 0;
793 		}
794 		if (rc != -ENOMEM)
795 			return -EREMOTE;
796 		rc = kvm_s390_mmu_cache_topup(mc);
797 		if (rc)
798 			return rc;
799 	} while (1);
800 	return 0;
801 }
802 
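/**
 * gmap_ucas_map() - Map segments of the parent address space into a ucontrol gmap.
 * @gmap: The per-vCPU (child) gmap.
 * @p_gfn: Start of the range in the parent gmap.
 * @c_gfn: Start of the range in the child gmap.
 * @count: Number of segments to map.
 *
 * Return: 0 in case of success, a negative error code otherwise.
 */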
803 int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
804 {
805 	struct kvm_s390_mmu_cache *mc;
806 	int rc;
807 
808 	mc = kvm_s390_new_mmu_cache();
809 	if (!mc)
810 		return -ENOMEM;
811 
812 	while (count) {
813 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
814 			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
815 		if (rc == -ENOMEM) {
816 			rc = kvm_s390_mmu_cache_topup(mc);
817 			if (rc)
818 				return rc;
819 			continue;
820 		}
821 		if (rc)
822 			return rc;
823 
824 		count--;
825 		c_gfn += _PAGE_ENTRIES;
826 		p_gfn += _PAGE_ENTRIES;
827 	}
828 	return rc;
829 }
830 
831 static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
832 {
833 	union crste *crstep;
834 	union pte *ptep;
835 	int rc;
836 
837 	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
838 	if (rc)
839 		return;
840 	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
841 		;
842 }
843 
844 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
845 {
846 	guard(read_lock)(&gmap->kvm->mmu_lock);
847 
848 	for ( ; count; count--, c_gfn += _PAGE_ENTRIES)
849 		gmap_ucas_unmap_one(gmap, c_gfn);
850 }
851 
852 static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
853 {
854 	struct gmap *gmap = walk->priv;
855 	union crste crste, newcrste;
856 
857 	crste = READ_ONCE(*crstep);
858 	newcrste = _CRSTE_EMPTY(crste.h.tt);
859 
860 	while (crste_leaf(crste)) {
861 		if (crste_prefix(crste))
862 			gmap_unmap_prefix(gmap, gfn, next);
863 		if (crste.s.fc1.vsie_notif)
864 			gmap_handle_vsie_unshadow_event(gmap, gfn);
865 		if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce))
866 			break;
867 		crste = READ_ONCE(*crstep);
868 	}
869 
870 	if (need_resched())
871 		return next;
872 
873 	return 0;
874 }
875 
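/**
 * gmap_split_huge_pages() - Remove all huge page mappings from a gmap.
 * @gmap: The gmap to act on.
 *
 * All leaf segment and region entries are invalidated, notifying prefix and
 * vsie users, so that subsequent faults map normal 4k pages.
 */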
876 void gmap_split_huge_pages(struct gmap *gmap)
877 {
878 	const struct dat_walk_ops ops = {
879 		.pmd_entry = _gmap_split_crste,
880 		.pud_entry = _gmap_split_crste,
881 	};
882 	gfn_t start = 0;
883 
884 	do {
885 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
886 			start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
887 						    &ops, DAT_WALK_IGN_HOLES, gmap);
888 		cond_resched();
889 	} while (start);
890 }
891 
892 static int _gmap_enable_skeys(struct gmap *gmap)
893 {
894 	gfn_t start = 0;
895 	int rc;
896 
897 	if (uses_skeys(gmap))
898 		return 0;
899 
900 	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
901 	rc = gmap_helper_disable_cow_sharing();
902 	if (rc) {
903 		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
904 		return rc;
905 	}
906 
907 	do {
908 		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
909 			start = dat_reset_skeys(gmap->asce, start);
910 		cond_resched();
911 	} while (start);
912 	return 0;
913 }
914 
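/**
 * gmap_enable_skeys() - Enable storage key support for a guest.
 * @gmap: The gmap to act on.
 *
 * Disables COW sharing of memory and resets the guest storage keys.
 *
 * Return: 0 in case of success, a negative error code otherwise.
 */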
915 int gmap_enable_skeys(struct gmap *gmap)
916 {
917 	int rc;
918 
919 	mmap_write_lock(gmap->kvm->mm);
920 	rc = _gmap_enable_skeys(gmap);
921 	mmap_write_unlock(gmap->kvm->mm);
922 	return rc;
923 }
924 
925 static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
926 {
927 	if (!ptep->s.pr)
928 		return 0;
929 	__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
930 	if (need_resched())
931 		return next;
932 	return 0;
933 }
934 
935 static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
936 {
937 	phys_addr_t origin, cur, end;
938 
939 	if (!crstep->h.fc || !crstep->s.fc1.pr)
940 		return 0;
941 
942 	origin = crste_origin_large(*crstep);
943 	cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
944 	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
945 	for ( ; cur < end; cur += PAGE_SIZE)
946 		__kvm_s390_pv_destroy_page(phys_to_page(cur));
947 	if (need_resched())
948 		return next;
949 	return 0;
950 }
951 
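/**
 * gmap_pv_destroy_range() - Destroy the secure pages backing a range of gfns.
 * @gmap: The gmap to act on.
 * @start: The first gfn to process.
 * @end: The gfn after the last one to process.
 * @interruptible: If true, stop when a fatal signal is pending.
 *
 * Return: 0 in case of success, -EINTR if interrupted by a fatal signal.
 */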
952 int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
953 {
954 	const struct dat_walk_ops ops = {
955 		.pte_entry = _destroy_pages_pte,
956 		.pmd_entry = _destroy_pages_crste,
957 		.pud_entry = _destroy_pages_crste,
958 	};
959 
960 	do {
961 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
962 			start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
963 						    DAT_WALK_IGN_HOLES, NULL);
964 		if (interruptible && fatal_signal_pending(current))
965 			return -EINTR;
966 		cond_resched();
967 	} while (start && start < end);
968 	return 0;
969 }
970 
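/**
 * gmap_insert_rmap() - Add a reverse mapping entry to a shadow gmap.
 * @sg: The shadow gmap.
 * @p_gfn: The gfn in the parent gmap used as key.
 * @r_gfn: The gfn in the shadow gmap to record.
 * @level: The DAT table level of the shadowed entry.
 *
 * Duplicate entries are not added twice.
 *
 * Context: Called with the host_to_rmap_lock held.
 * Return: 0 in case of success, a negative error code otherwise.
 */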
971 int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
972 {
973 	struct vsie_rmap *rmap __free(kvfree) = NULL;
974 	struct vsie_rmap *temp;
975 	void __rcu **slot;
976 	int rc = 0;
977 
978 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
979 	lockdep_assert_held(&sg->host_to_rmap_lock);
980 
981 	rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
982 	if (!rmap)
983 		return -ENOMEM;
984 
985 	rmap->r_gfn = r_gfn;
986 	rmap->level = level;
987 	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
988 	if (slot) {
989 		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
990 		for (temp = rmap->next; temp; temp = temp->next) {
991 			if (temp->val == rmap->val)
992 				return 0;
993 		}
994 		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
995 	} else {
996 		rmap->next = NULL;
997 		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
998 		if (rc)
999 			return rc;
1000 	}
1001 	rmap = NULL;
1002 
1003 	return 0;
1004 }
1005 
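/**
 * gmap_protect_rmap() - Write-protect a parent gfn and record a reverse mapping.
 * @mc: The memory cache to be used for allocations.
 * @sg: The shadow gmap.
 * @p_gfn: The gfn in the parent gmap to protect.
 * @r_gfn: The gfn in the shadow gmap that depends on @p_gfn.
 * @pfn: The host pfn backing @p_gfn, used if it is not mapped yet.
 * @level: The DAT table level of the shadowed entry.
 * @wr: Whether the mapping is writable.
 *
 * Context: Called with the parent's children_lock held.
 * Return: 0 in case of success, -EAGAIN if the caller has to retry,
 *         another negative error code otherwise.
 */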
1006 int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
1007 		      kvm_pfn_t pfn, int level, bool wr)
1008 {
1009 	union crste *crstep;
1010 	union pgste pgste;
1011 	union pte *ptep;
1012 	union pte pte;
1013 	int flags, rc;
1014 
1015 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1016 	lockdep_assert_held(&sg->parent->children_lock);
1017 
1018 	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
1019 	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
1020 			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
1021 	if (rc)
1022 		return rc;
1023 	if (level <= TABLE_TYPE_REGION1) {
1024 		scoped_guard(spinlock, &sg->host_to_rmap_lock)
1025 			rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
1026 	}
1027 	if (rc)
1028 		return rc;
1029 
1030 	if (!pgste_get_trylock(ptep, &pgste))
1031 		return -EAGAIN;
1032 	pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
1033 	pte.h.p = 1;
1034 	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
1035 	pgste.vsie_notif = 1;
1036 	pgste_set_unlock(ptep, pgste);
1037 
1038 	return 0;
1039 }
1040 
1041 static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
1042 {
1043 	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
1044 	if (need_resched())
1045 		return next;
1046 	return 0;
1047 }
1048 
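/**
 * gmap_set_cmma_all_dirty() - Mark the CMMA state of all guest pages as dirty.
 * @gmap: The gmap to act on.
 *
 * Sets the CMMA-dirty bit in the PGSTE of every page table entry, so that
 * the complete CMMA state gets transferred, e.g. when starting migration.
 */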
1049 void gmap_set_cmma_all_dirty(struct gmap *gmap)
1050 {
1051 	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
1052 	gfn_t gfn = 0;
1053 
1054 	do {
1055 		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
1056 			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
1057 						  DAT_WALK_IGN_HOLES, NULL);
1058 		cond_resched();
1059 	} while (gfn);
1060 }
1061 
1062 static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
1063 {
1064 	unsigned long align = PAGE_SIZE;
1065 	gpa_t gaddr = gfn_to_gpa(r_gfn);
1066 	union crste *crstep;
1067 	union crste crste;
1068 	union pte *ptep;
1069 
1070 	if (level > TABLE_TYPE_PAGE_TABLE)
1071 		align = 1UL << (11 * level + _SEGMENT_SHIFT);
1072 	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
1073 	sg->invalidated = true;
1074 	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
1075 		return;
1076 	if (ptep) {
1077 		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
1078 			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
1079 		return;
1080 	}
1081 
1082 	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
1083 	if (crste_leaf(crste) || crste.h.i)
1084 		return;
1085 	if (is_pmd(crste))
1086 		dat_free_pt(dereference_pmd(crste.pmd));
1087 	else
1088 		dat_free_level(dereference_crste(crste), true);
1089 }
1090 
1091 static void gmap_unshadow(struct gmap *sg)
1092 {
1093 	struct gmap_cache *gmap_cache, *next;
1094 
1095 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1096 	KVM_BUG_ON(!sg->parent, sg->kvm);
1097 
1098 	lockdep_assert_held(&sg->parent->children_lock);
1099 
1100 	gmap_remove_child(sg);
1101 	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
1102 
1103 	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
1104 		gmap_cache->gmap = NULL;
1105 		list_del(&gmap_cache->list);
1106 	}
1107 
1108 	gmap_put(sg);
1109 }
1110 
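/*
 * Handle a change to the parent gmap page at @gfn that may be in use by
 * shadow page tables: shadow gmaps whose guest ASCE top-level table contains
 * @gfn are unshadowed completely; for all others, only the shadow table
 * entries recorded in the rmap for @gfn are removed.
 *
 * Context: Called with the parent's children_lock held.
 */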
1111 void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
1112 {
1113 	struct vsie_rmap *rmap, *rnext, *head;
1114 	struct gmap *sg, *next;
1115 	gfn_t start, end;
1116 
1117 	list_for_each_entry_safe(sg, next, &parent->children, list) {
1118 		start = sg->guest_asce.rsto;
1119 		end = start + sg->guest_asce.tl + 1;
1120 		if (!sg->guest_asce.r && gfn >= start && gfn < end) {
1121 			gmap_unshadow(sg);
1122 			continue;
1123 		}
1124 		scoped_guard(spinlock, &sg->host_to_rmap_lock)
1125 			head = radix_tree_delete(&sg->host_to_rmap, gfn);
1126 		gmap_for_each_rmap_safe(rmap, rnext, head)
1127 			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
1128 	}
1129 }
1130 
1131 /**
1132  * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
1133  * @parent: Pointer to the parent gmap.
1134  * @asce: ASCE for which the shadow table is created.
1135  * @edat_level: Edat level to be used for the shadow translation.
1136  *
1137  * Context: Called with parent->children_lock held.
1138  *
1139  * Return: The pointer to a gmap if a shadow table with the given asce and
1140  * edat_level is already available, otherwise NULL.
1142  */
1143 static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
1144 {
1145 	struct gmap *sg;
1146 
1147 	lockdep_assert_held(&parent->children_lock);
1148 	list_for_each_entry(sg, &parent->children, list) {
1149 		if (!gmap_is_shadow_valid(sg, asce, edat_level))
1150 			continue;
1151 		return sg;
1152 	}
1153 	return NULL;
1154 }
1155 
1156 #define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)
1157 struct gmap_protect_asce_top_level {
1158 	unsigned long seq;
1159 	struct guest_fault f[CRST_TABLE_PAGES];
1160 };
1161 
1162 static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1163 						struct gmap_protect_asce_top_level *context)
1164 {
1165 	struct gmap *parent;
1166 	int rc, i;
1167 
1168 	guard(write_lock)(&sg->kvm->mmu_lock);
1169 
1170 	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
1171 		return -EAGAIN;
1172 
1173 	parent = READ_ONCE(sg->parent);
1174 	if (!parent)
1175 		return -EAGAIN;
1176 	scoped_guard(spinlock, &parent->children_lock) {
1177 		if (READ_ONCE(sg->parent) != parent)
1178 			return -EAGAIN;
1179 		sg->invalidated = false;
1180 		for (i = 0; i < CRST_TABLE_PAGES; i++) {
1181 			if (!context->f[i].valid)
1182 				continue;
1183 			rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
1184 					       TABLE_TYPE_REGION1 + 1, context->f[i].writable);
1185 			if (rc)
1186 				return rc;
1187 		}
1188 		gmap_add_child(sg->parent, sg);
1189 	}
1190 
1191 	kvm_s390_release_faultin_array(sg->kvm, context->f, false);
1192 	return 0;
1193 }
1194 
1195 static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
1196 					       struct gmap_protect_asce_top_level *context)
1197 {
1198 	int rc;
1199 
1200 	if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
1201 		return -EAGAIN;
1202 	do {
1203 		rc = kvm_s390_mmu_cache_topup(mc);
1204 		if (rc)
1205 			return rc;
1206 		rc = radix_tree_preload(GFP_KERNEL);
1207 		if (rc)
1208 			return rc;
1209 		rc = __gmap_protect_asce_top_level(mc, sg, context);
1210 		radix_tree_preload_end();
1211 	} while (rc == -ENOMEM);
1212 
1213 	return rc;
1214 }
1215 
1216 static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
1217 {
1218 	struct gmap_protect_asce_top_level context = {};
1219 	union asce asce = sg->guest_asce;
1220 	int rc;
1221 
1222 	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
1223 
1224 	context.seq = sg->kvm->mmu_invalidate_seq;
1225 	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
1226 	smp_rmb();
1227 
1228 	rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
1229 	if (rc > 0)
1230 		rc = -EFAULT;
1231 	if (!rc)
1232 		rc = _gmap_protect_asce_top_level(mc, sg, &context);
1233 	if (rc)
1234 		kvm_s390_release_faultin_array(sg->kvm, context.f, true);
1235 	return rc;
1236 }
1237 
1238 /**
1239  * gmap_create_shadow() - Create/find a shadow guest address space.
1240  * @mc: The cache to use to allocate dat tables.
1241  * @parent: Pointer to the parent gmap.
1242  * @asce: ASCE for which the shadow table is created.
1243  * @edat_level: Edat level to be used for the shadow translation.
1244  *
1245  * The pages of the top level page table referred to by the asce parameter
1246  * will be set to read-only and marked in the PGSTEs of the kvm process.
1247  * The shadow table will be removed automatically on any change to the
1248  * PTE mapping for the source table.
1249  *
1250  * The shadow gmap is returned with one extra reference.
1251  *
1252  * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
1253  * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
1254  * parent gmap table could not be protected.
1255  */
1256 struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
1257 				union asce asce, int edat_level)
1258 {
1259 	struct gmap *sg, *new;
1260 	int rc;
1261 
1262 	if (WARN_ON(!parent))
1263 		return ERR_PTR(-EINVAL);
1264 
1265 	scoped_guard(spinlock, &parent->children_lock) {
1266 		sg = gmap_find_shadow(parent, asce, edat_level);
1267 		if (sg) {
1268 			gmap_get(sg);
1269 			return sg;
1270 		}
1271 	}
1272 	/* Create a new shadow gmap. */
1273 	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
1274 	if (!new)
1275 		return ERR_PTR(-ENOMEM);
1276 	new->guest_asce = asce;
1277 	new->edat_level = edat_level;
1278 	set_bit(GMAP_FLAG_SHADOW, &new->flags);
1279 
1280 	scoped_guard(spinlock, &parent->children_lock) {
1281 		/* Recheck if another CPU created the same shadow. */
1282 		sg = gmap_find_shadow(parent, asce, edat_level);
1283 		if (sg) {
1284 			gmap_put(new);
1285 			gmap_get(sg);
1286 			return sg;
1287 		}
1288 		if (asce.r) {
1289 			/* Only allow one real-space gmap shadow. */
1290 			list_for_each_entry(sg, &parent->children, list) {
1291 				if (sg->guest_asce.r) {
1292 					scoped_guard(write_lock, &parent->kvm->mmu_lock)
1293 						gmap_unshadow(sg);
1294 					break;
1295 				}
1296 			}
1297 			gmap_add_child(parent, new);
1298 			/* Nothing to protect, return right away. */
1299 			gmap_get(new);
1300 			return new;
1301 		}
1302 	}
1303 
1304 	gmap_get(new);
1305 	new->parent = parent;
1306 	/* Protect while inserting, protects against invalidation races. */
1307 	rc = gmap_protect_asce_top_level(mc, new);
1308 	if (rc) {
1309 		new->parent = NULL;
1310 		gmap_put(new);
1311 		gmap_put(new);
1312 		return ERR_PTR(rc);
1313 	}
1314 	return new;
1315 }
1316