1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  KVM guest address space mapping code
4  *
5  *    Copyright IBM Corp. 2007, 2020
6  *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7  *		 David Hildenbrand <david@redhat.com>
8  *		 Janosch Frank <frankja@linux.vnet.ibm.com>
9  */
10 
11 #include <linux/cpufeature.h>
12 #include <linux/kernel.h>
13 #include <linux/pagewalk.h>
14 #include <linux/swap.h>
15 #include <linux/smp.h>
16 #include <linux/spinlock.h>
17 #include <linux/slab.h>
18 #include <linux/swapops.h>
19 #include <linux/ksm.h>
20 #include <linux/mman.h>
21 #include <linux/pgtable.h>
22 #include <asm/page-states.h>
23 #include <asm/pgalloc.h>
24 #include <asm/machine.h>
25 #include <asm/gmap.h>
26 #include <asm/page.h>
27 #include <asm/tlb.h>
28 
29 /*
30  * The address is saved in a radix tree directly; NULL would be ambiguous,
31  * since 0 is a valid address, and NULL is returned when nothing was found.
32  * The lower bits are ignored by all users of the macro, so it can be used
33  * to distinguish a valid address 0 from a NULL.
34  */
35 #define VALID_GADDR_FLAG 1
36 #define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
37 #define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
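
/*
 * Illustrative sketch (not part of the original source): because of the
 * flag bit, a guest address of 0 stored in the host_to_guest radix tree
 * still reads back as a valid entry instead of looking like NULL:
 *
 *	unsigned long entry = MAKE_VALID_GADDR(0);	(== 0x1, not NULL)
 *
 *	if (IS_GADDR_VALID(entry))
 *		gaddr = entry & HPAGE_MASK;		(yields 0 again)
 */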
38 
39 #define GMAP_SHADOW_FAKE_TABLE 1ULL
40 
static struct page *gmap_alloc_crst(void)
42 {
43 	struct page *page;
44 
45 	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
46 	if (!page)
47 		return NULL;
48 	__arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
49 	return page;
50 }
51 
52 /**
53  * gmap_alloc - allocate and initialize a guest address space
54  * @limit: maximum address of the gmap address space
55  *
56  * Returns a guest address space structure.
57  */
struct gmap *gmap_alloc(unsigned long limit)
59 {
60 	struct gmap *gmap;
61 	struct page *page;
62 	unsigned long *table;
63 	unsigned long etype, atype;
64 
65 	if (limit < _REGION3_SIZE) {
66 		limit = _REGION3_SIZE - 1;
67 		atype = _ASCE_TYPE_SEGMENT;
68 		etype = _SEGMENT_ENTRY_EMPTY;
69 	} else if (limit < _REGION2_SIZE) {
70 		limit = _REGION2_SIZE - 1;
71 		atype = _ASCE_TYPE_REGION3;
72 		etype = _REGION3_ENTRY_EMPTY;
73 	} else if (limit < _REGION1_SIZE) {
74 		limit = _REGION1_SIZE - 1;
75 		atype = _ASCE_TYPE_REGION2;
76 		etype = _REGION2_ENTRY_EMPTY;
77 	} else {
78 		limit = -1UL;
79 		atype = _ASCE_TYPE_REGION1;
80 		etype = _REGION1_ENTRY_EMPTY;
81 	}
82 	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
83 	if (!gmap)
84 		goto out;
85 	INIT_LIST_HEAD(&gmap->children);
86 	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
87 	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
88 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
89 	spin_lock_init(&gmap->guest_table_lock);
90 	spin_lock_init(&gmap->shadow_lock);
91 	refcount_set(&gmap->ref_count, 1);
92 	page = gmap_alloc_crst();
93 	if (!page)
94 		goto out_free;
95 	table = page_to_virt(page);
96 	crst_table_init(table, etype);
97 	gmap->table = table;
98 	gmap->asce = atype | _ASCE_TABLE_LENGTH |
99 		_ASCE_USER_BITS | __pa(table);
100 	gmap->asce_end = limit;
101 	return gmap;
102 
103 out_free:
104 	kfree(gmap);
105 out:
106 	return NULL;
107 }
108 EXPORT_SYMBOL_GPL(gmap_alloc);
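
/*
 * Reading aid (illustrative summary, assuming the usual s390 region and
 * segment sizes): the @limit checks above select the smallest table type
 * that can cover the requested address space.
 *
 *	limit < 2 GiB	-> segment table ASCE
 *	limit < 4 TiB	-> region-3 table ASCE
 *	limit < 8 PiB	-> region-2 table ASCE
 *	otherwise	-> region-1 table ASCE, asce_end = -1UL
 */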
109 
110 /**
111  * gmap_create - create a guest address space
112  * @mm: pointer to the parent mm_struct
113  * @limit: maximum size of the gmap address space
114  *
115  * Returns a guest address space structure.
116  */
struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
118 {
119 	struct gmap *gmap;
120 	unsigned long gmap_asce;
121 
122 	gmap = gmap_alloc(limit);
123 	if (!gmap)
124 		return NULL;
125 	gmap->mm = mm;
126 	spin_lock(&mm->context.lock);
127 	list_add_rcu(&gmap->list, &mm->context.gmap_list);
128 	if (list_is_singular(&mm->context.gmap_list))
129 		gmap_asce = gmap->asce;
130 	else
131 		gmap_asce = -1UL;
132 	WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
133 	spin_unlock(&mm->context.lock);
134 	return gmap;
135 }
136 EXPORT_SYMBOL_GPL(gmap_create);
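
/*
 * Usage sketch (illustrative only, not taken from an in-tree caller): a
 * hypervisor typically creates one gmap per guest and tears it down with
 * gmap_remove(); the 4 TiB limit below is an arbitrary example value.
 *
 *	struct gmap *g = gmap_create(current->mm, (1UL << 42) - 1);
 *
 *	if (!g)
 *		return -ENOMEM;
 *	...
 *	gmap_remove(g);		(unlinks from the mm and drops the reference)
 */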
137 
static void gmap_flush_tlb(struct gmap *gmap)
139 {
140 	if (cpu_has_idte())
141 		__tlb_flush_idte(gmap->asce);
142 	else
143 		__tlb_flush_global();
144 }
145 
static void gmap_radix_tree_free(struct radix_tree_root *root)
147 {
148 	struct radix_tree_iter iter;
149 	unsigned long indices[16];
150 	unsigned long index;
151 	void __rcu **slot;
152 	int i, nr;
153 
154 	/* A radix tree is freed by deleting all of its entries */
155 	index = 0;
156 	do {
157 		nr = 0;
158 		radix_tree_for_each_slot(slot, root, &iter, index) {
159 			indices[nr] = iter.index;
160 			if (++nr == 16)
161 				break;
162 		}
163 		for (i = 0; i < nr; i++) {
164 			index = indices[i];
165 			radix_tree_delete(root, index);
166 		}
167 	} while (nr > 0);
168 }
169 
static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
171 {
172 	struct gmap_rmap *rmap, *rnext, *head;
173 	struct radix_tree_iter iter;
174 	unsigned long indices[16];
175 	unsigned long index;
176 	void __rcu **slot;
177 	int i, nr;
178 
179 	/* A radix tree is freed by deleting all of its entries */
180 	index = 0;
181 	do {
182 		nr = 0;
183 		radix_tree_for_each_slot(slot, root, &iter, index) {
184 			indices[nr] = iter.index;
185 			if (++nr == 16)
186 				break;
187 		}
188 		for (i = 0; i < nr; i++) {
189 			index = indices[i];
190 			head = radix_tree_delete(root, index);
191 			gmap_for_each_rmap_safe(rmap, rnext, head)
192 				kfree(rmap);
193 		}
194 	} while (nr > 0);
195 }
196 
static void gmap_free_crst(unsigned long *table, bool free_ptes)
198 {
199 	bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
200 	int i;
201 
202 	if (is_segment) {
203 		if (!free_ptes)
204 			goto out;
205 		for (i = 0; i < _CRST_ENTRIES; i++)
206 			if (!(table[i] & _SEGMENT_ENTRY_INVALID))
207 				page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
208 	} else {
209 		for (i = 0; i < _CRST_ENTRIES; i++)
210 			if (!(table[i] & _REGION_ENTRY_INVALID))
211 				gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
212 	}
213 
214 out:
215 	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
216 }
217 
218 /**
219  * gmap_free - free a guest address space
220  * @gmap: pointer to the guest address space structure
221  *
222  * No locks required. There are no references to this gmap anymore.
223  */
void gmap_free(struct gmap *gmap)
225 {
226 	/* Flush tlb of all gmaps (if not already done for shadows) */
227 	if (!(gmap_is_shadow(gmap) && gmap->removed))
228 		gmap_flush_tlb(gmap);
229 	/* Free all segment & region tables. */
230 	gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
231 
232 	gmap_radix_tree_free(&gmap->guest_to_host);
233 	gmap_radix_tree_free(&gmap->host_to_guest);
234 
235 	/* Free additional data for a shadow gmap */
236 	if (gmap_is_shadow(gmap)) {
237 		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
238 		/* Release reference to the parent */
239 		gmap_put(gmap->parent);
240 	}
241 
242 	kfree(gmap);
243 }
244 EXPORT_SYMBOL_GPL(gmap_free);
245 
246 /**
247  * gmap_get - increase reference counter for guest address space
248  * @gmap: pointer to the guest address space structure
249  *
250  * Returns the gmap pointer
251  */
struct gmap *gmap_get(struct gmap *gmap)
253 {
254 	refcount_inc(&gmap->ref_count);
255 	return gmap;
256 }
257 EXPORT_SYMBOL_GPL(gmap_get);
258 
259 /**
260  * gmap_put - decrease reference counter for guest address space
261  * @gmap: pointer to the guest address space structure
262  *
263  * If the reference counter reaches zero the guest address space is freed.
264  */
void gmap_put(struct gmap *gmap)
266 {
267 	if (refcount_dec_and_test(&gmap->ref_count))
268 		gmap_free(gmap);
269 }
270 EXPORT_SYMBOL_GPL(gmap_put);
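
/*
 * Reference counting sketch (illustrative): code that keeps a gmap beyond
 * the creator's lifetime pairs gmap_get() with gmap_put(); the last
 * gmap_put() ends up in gmap_free().
 *
 *	struct gmap *g = gmap_get(gmap);	(ref_count++)
 *	...
 *	gmap_put(g);				(ref_count--, freed on zero)
 */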
271 
272 /**
273  * gmap_remove - remove a guest address space but do not free it yet
274  * @gmap: pointer to the guest address space structure
275  */
void gmap_remove(struct gmap *gmap)
277 {
278 	struct gmap *sg, *next;
279 	unsigned long gmap_asce;
280 
281 	/* Remove all shadow gmaps linked to this gmap */
282 	if (!list_empty(&gmap->children)) {
283 		spin_lock(&gmap->shadow_lock);
284 		list_for_each_entry_safe(sg, next, &gmap->children, list) {
285 			list_del(&sg->list);
286 			gmap_put(sg);
287 		}
288 		spin_unlock(&gmap->shadow_lock);
289 	}
	/* Remove gmap from the per-mm list */
291 	spin_lock(&gmap->mm->context.lock);
292 	list_del_rcu(&gmap->list);
293 	if (list_empty(&gmap->mm->context.gmap_list))
294 		gmap_asce = 0;
295 	else if (list_is_singular(&gmap->mm->context.gmap_list))
296 		gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
297 					     struct gmap, list)->asce;
298 	else
299 		gmap_asce = -1UL;
300 	WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
301 	spin_unlock(&gmap->mm->context.lock);
302 	synchronize_rcu();
303 	/* Put reference */
304 	gmap_put(gmap);
305 }
306 EXPORT_SYMBOL_GPL(gmap_remove);
307 
308 /*
309  * gmap_alloc_table is assumed to be called with mmap_lock held
310  */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
312 			    unsigned long init, unsigned long gaddr)
313 {
314 	struct page *page;
315 	unsigned long *new;
316 
	/* since we don't free the gmap table until gmap_free we can unlock */
318 	page = gmap_alloc_crst();
319 	if (!page)
320 		return -ENOMEM;
321 	new = page_to_virt(page);
322 	crst_table_init(new, init);
323 	spin_lock(&gmap->guest_table_lock);
324 	if (*table & _REGION_ENTRY_INVALID) {
325 		*table = __pa(new) | _REGION_ENTRY_LENGTH |
326 			(*table & _REGION_ENTRY_TYPE_MASK);
327 		page = NULL;
328 	}
329 	spin_unlock(&gmap->guest_table_lock);
330 	if (page)
331 		__free_pages(page, CRST_ALLOC_ORDER);
332 	return 0;
333 }
334 
static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
336 {
337 	return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
338 }
339 
static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
341 {
342 	return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
343 }
344 
static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
346 				       unsigned long *gaddr)
347 {
348 	*gaddr = host_to_guest_delete(gmap, vmaddr);
349 	if (IS_GADDR_VALID(*gaddr))
350 		return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
351 	return NULL;
352 }
353 
354 /**
355  * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
356  * @gmap: pointer to the guest address space structure
357  * @vmaddr: address in the host process address space
358  *
359  * Returns 1 if a TLB flush is required
360  */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
362 {
363 	unsigned long gaddr;
364 	int flush = 0;
365 	pmd_t *pmdp;
366 
367 	BUG_ON(gmap_is_shadow(gmap));
368 	spin_lock(&gmap->guest_table_lock);
369 
370 	pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
371 	if (pmdp) {
372 		flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
373 		*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
374 	}
375 
376 	spin_unlock(&gmap->guest_table_lock);
377 	return flush;
378 }
379 
380 /**
381  * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
382  * @gmap: pointer to the guest address space structure
383  * @gaddr: address in the guest address space
384  *
385  * Returns 1 if a TLB flush is required
386  */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
388 {
389 	unsigned long vmaddr;
390 
391 	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
392 						   gaddr >> PMD_SHIFT);
393 	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
394 }
395 
396 /**
397  * gmap_unmap_segment - unmap segment from the guest address space
398  * @gmap: pointer to the guest address space structure
399  * @to: address in the guest address space
400  * @len: length of the memory area to unmap
401  *
402  * Returns 0 if the unmap succeeded, -EINVAL if not.
403  */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
405 {
406 	unsigned long off;
407 	int flush;
408 
409 	BUG_ON(gmap_is_shadow(gmap));
410 	if ((to | len) & (PMD_SIZE - 1))
411 		return -EINVAL;
412 	if (len == 0 || to + len < to)
413 		return -EINVAL;
414 
415 	flush = 0;
416 	mmap_write_lock(gmap->mm);
417 	for (off = 0; off < len; off += PMD_SIZE)
418 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
419 	mmap_write_unlock(gmap->mm);
420 	if (flush)
421 		gmap_flush_tlb(gmap);
422 	return 0;
423 }
424 EXPORT_SYMBOL_GPL(gmap_unmap_segment);
425 
426 /**
427  * gmap_map_segment - map a segment to the guest address space
428  * @gmap: pointer to the guest address space structure
429  * @from: source address in the parent address space
430  * @to: target address in the guest address space
431  * @len: length of the memory area to map
432  *
433  * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
434  */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
436 		     unsigned long to, unsigned long len)
437 {
438 	unsigned long off;
439 	int flush;
440 
441 	BUG_ON(gmap_is_shadow(gmap));
442 	if ((from | to | len) & (PMD_SIZE - 1))
443 		return -EINVAL;
444 	if (len == 0 || from + len < from || to + len < to ||
445 	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
446 		return -EINVAL;
447 
448 	flush = 0;
449 	mmap_write_lock(gmap->mm);
450 	for (off = 0; off < len; off += PMD_SIZE) {
451 		/* Remove old translation */
452 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
453 		/* Store new translation */
454 		if (radix_tree_insert(&gmap->guest_to_host,
455 				      (to + off) >> PMD_SHIFT,
456 				      (void *) from + off))
457 			break;
458 	}
459 	mmap_write_unlock(gmap->mm);
460 	if (flush)
461 		gmap_flush_tlb(gmap);
462 	if (off >= len)
463 		return 0;
464 	gmap_unmap_segment(gmap, to, len);
465 	return -ENOMEM;
466 }
467 EXPORT_SYMBOL_GPL(gmap_map_segment);
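
/*
 * Usage sketch (illustrative, hypothetical addresses): map 16 MiB starting
 * at host address 0x40000000 to guest address 0. All values must be
 * aligned to the 1 MiB segment size.
 *
 *	rc = gmap_map_segment(gmap, 0x40000000UL, 0UL, 16UL << 20);
 *
 * -EINVAL is returned for misaligned or overflowing ranges, -ENOMEM if the
 * radix tree insertion fails (the partial mapping is rolled back).
 */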
468 
469 /**
470  * __gmap_translate - translate a guest address to a user space address
471  * @gmap: pointer to guest mapping meta data structure
472  * @gaddr: guest address
473  *
474  * Returns user space address which corresponds to the guest address or
475  * -EFAULT if no such mapping exists.
476  * This function does not establish potentially missing page table entries.
477  * The mmap_lock of the mm that belongs to the address space must be held
478  * when this function gets called.
479  *
480  * Note: Can also be called for shadow gmaps.
481  */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
483 {
484 	unsigned long vmaddr;
485 
486 	vmaddr = (unsigned long)
487 		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
488 	/* Note: guest_to_host is empty for a shadow gmap */
489 	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
490 }
491 EXPORT_SYMBOL_GPL(__gmap_translate);
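
/*
 * Illustrative example: with the mapping from the sketch above in place
 * (host 0x40000000 -> guest 0), translating guest address 0x123456 yields
 * host address 0x40123456: the segment base comes from the radix tree and
 * the bits below PMD_MASK are carried over. An unmapped guest address
 * returns -EFAULT instead.
 */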
492 
493 /**
494  * gmap_unlink - disconnect a page table from the gmap shadow tables
495  * @mm: pointer to the parent mm_struct
496  * @table: pointer to the host page table
497  * @vmaddr: vm address associated with the host page table
498  */
void gmap_unlink(struct mm_struct *mm, unsigned long *table,
500 		 unsigned long vmaddr)
501 {
502 	struct gmap *gmap;
503 	int flush;
504 
505 	rcu_read_lock();
506 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
507 		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
508 		if (flush)
509 			gmap_flush_tlb(gmap);
510 	}
511 	rcu_read_unlock();
512 }
513 
514 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
515 			   unsigned long gaddr);
516 
517 /**
518  * __gmap_link - set up shadow page tables to connect a host to a guest address
519  * @gmap: pointer to guest mapping meta data structure
520  * @gaddr: guest address
521  * @vmaddr: vm address
522  *
523  * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
524  * if the vm address is already mapped to a different guest segment.
525  * The mmap_lock of the mm that belongs to the address space must be held
526  * when this function gets called.
527  */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
529 {
530 	struct mm_struct *mm;
531 	unsigned long *table;
532 	spinlock_t *ptl;
533 	pgd_t *pgd;
534 	p4d_t *p4d;
535 	pud_t *pud;
536 	pmd_t *pmd;
537 	u64 unprot;
538 	int rc;
539 
540 	BUG_ON(gmap_is_shadow(gmap));
541 	/* Create higher level tables in the gmap page table */
542 	table = gmap->table;
543 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
544 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
545 		if ((*table & _REGION_ENTRY_INVALID) &&
546 		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
547 				     gaddr & _REGION1_MASK))
548 			return -ENOMEM;
549 		table = __va(*table & _REGION_ENTRY_ORIGIN);
550 	}
551 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
552 		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
553 		if ((*table & _REGION_ENTRY_INVALID) &&
554 		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
555 				     gaddr & _REGION2_MASK))
556 			return -ENOMEM;
557 		table = __va(*table & _REGION_ENTRY_ORIGIN);
558 	}
559 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
560 		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
561 		if ((*table & _REGION_ENTRY_INVALID) &&
562 		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
563 				     gaddr & _REGION3_MASK))
564 			return -ENOMEM;
565 		table = __va(*table & _REGION_ENTRY_ORIGIN);
566 	}
567 	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
568 	/* Walk the parent mm page table */
569 	mm = gmap->mm;
570 	pgd = pgd_offset(mm, vmaddr);
571 	VM_BUG_ON(pgd_none(*pgd));
572 	p4d = p4d_offset(pgd, vmaddr);
573 	VM_BUG_ON(p4d_none(*p4d));
574 	pud = pud_offset(p4d, vmaddr);
575 	VM_BUG_ON(pud_none(*pud));
576 	/* large puds cannot yet be handled */
577 	if (pud_leaf(*pud))
578 		return -EFAULT;
579 	pmd = pmd_offset(pud, vmaddr);
580 	VM_BUG_ON(pmd_none(*pmd));
581 	/* Are we allowed to use huge pages? */
582 	if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
583 		return -EFAULT;
584 	/* Link gmap segment table entry location to page table. */
585 	rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
586 	if (rc)
587 		return rc;
588 	ptl = pmd_lock(mm, pmd);
589 	spin_lock(&gmap->guest_table_lock);
590 	if (*table == _SEGMENT_ENTRY_EMPTY) {
591 		rc = radix_tree_insert(&gmap->host_to_guest,
592 				       vmaddr >> PMD_SHIFT,
593 				       (void *)MAKE_VALID_GADDR(gaddr));
594 		if (!rc) {
595 			if (pmd_leaf(*pmd)) {
596 				*table = (pmd_val(*pmd) &
597 					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
598 					| _SEGMENT_ENTRY_GMAP_UC
599 					| _SEGMENT_ENTRY;
600 			} else
601 				*table = pmd_val(*pmd) &
602 					_SEGMENT_ENTRY_HARDWARE_BITS;
603 		}
604 	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
605 		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
606 		unprot = (u64)*table;
607 		unprot &= ~_SEGMENT_ENTRY_PROTECT;
608 		unprot |= _SEGMENT_ENTRY_GMAP_UC;
609 		gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
610 	}
611 	spin_unlock(&gmap->guest_table_lock);
612 	spin_unlock(ptl);
613 	radix_tree_preload_end();
614 	return rc;
615 }
616 EXPORT_SYMBOL(__gmap_link);
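
/*
 * Illustrative sketch of the usual "translate, fault in, link" sequence
 * (simplified; real callers such as gmap_pte_op_fixup() below and the KVM
 * fault path add locking and retry handling):
 *
 *	vmaddr = __gmap_translate(gmap, gaddr);
 *	if (IS_ERR_VALUE(vmaddr))
 *		return vmaddr;
 *	if (fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked))
 *		return -EFAULT;
 *	rc = __gmap_link(gmap, gaddr, vmaddr);
 */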
617 
618 /*
619  * this function is assumed to be called with mmap_lock held
620  */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
622 {
623 	struct vm_area_struct *vma;
624 	unsigned long vmaddr;
625 	spinlock_t *ptl;
626 	pte_t *ptep;
627 
628 	/* Find the vm address for the guest address */
629 	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
630 						   gaddr >> PMD_SHIFT);
631 	if (vmaddr) {
632 		vmaddr |= gaddr & ~PMD_MASK;
633 
634 		vma = vma_lookup(gmap->mm, vmaddr);
635 		if (!vma || is_vm_hugetlb_page(vma))
636 			return;
637 
638 		/* Get pointer to the page table entry */
639 		ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
640 		if (likely(ptep)) {
641 			ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
642 			pte_unmap_unlock(ptep, ptl);
643 		}
644 	}
645 }
646 EXPORT_SYMBOL_GPL(__gmap_zap);
647 
void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
649 {
650 	unsigned long gaddr, vmaddr, size;
651 	struct vm_area_struct *vma;
652 
653 	mmap_read_lock(gmap->mm);
654 	for (gaddr = from; gaddr < to;
655 	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
656 		/* Find the vm address for the guest address */
657 		vmaddr = (unsigned long)
658 			radix_tree_lookup(&gmap->guest_to_host,
659 					  gaddr >> PMD_SHIFT);
660 		if (!vmaddr)
661 			continue;
662 		vmaddr |= gaddr & ~PMD_MASK;
663 		/* Find vma in the parent mm */
664 		vma = find_vma(gmap->mm, vmaddr);
665 		if (!vma)
666 			continue;
667 		/*
668 		 * We do not discard pages that are backed by
669 		 * hugetlbfs, so we don't have to refault them.
670 		 */
671 		if (is_vm_hugetlb_page(vma))
672 			continue;
673 		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
674 		zap_page_range_single(vma, vmaddr, size, NULL);
675 	}
676 	mmap_read_unlock(gmap->mm);
677 }
678 EXPORT_SYMBOL_GPL(gmap_discard);
679 
680 static LIST_HEAD(gmap_notifier_list);
681 static DEFINE_SPINLOCK(gmap_notifier_lock);
682 
683 /**
684  * gmap_register_pte_notifier - register a pte invalidation callback
685  * @nb: pointer to the gmap notifier block
686  */
void gmap_register_pte_notifier(struct gmap_notifier *nb)
688 {
689 	spin_lock(&gmap_notifier_lock);
690 	list_add_rcu(&nb->list, &gmap_notifier_list);
691 	spin_unlock(&gmap_notifier_lock);
692 }
693 EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
694 
695 /**
696  * gmap_unregister_pte_notifier - remove a pte invalidation callback
697  * @nb: pointer to the gmap notifier block
698  */
void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
700 {
701 	spin_lock(&gmap_notifier_lock);
702 	list_del_rcu(&nb->list);
703 	spin_unlock(&gmap_notifier_lock);
704 	synchronize_rcu();
705 }
706 EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
707 
708 /**
709  * gmap_call_notifier - call all registered invalidation callbacks
710  * @gmap: pointer to guest mapping meta data structure
711  * @start: start virtual address in the guest address space
712  * @end: end virtual address in the guest address space
713  */
static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
715 			       unsigned long end)
716 {
717 	struct gmap_notifier *nb;
718 
719 	list_for_each_entry(nb, &gmap_notifier_list, list)
720 		nb->notifier_call(gmap, start, end);
721 }
722 
723 /**
724  * gmap_table_walk - walk the gmap page tables
725  * @gmap: pointer to guest mapping meta data structure
726  * @gaddr: virtual address in the guest address space
727  * @level: page table level to stop at
728  *
729  * Returns a table entry pointer for the given guest address and @level
 * @level=0 : returns a pointer to a page table entry (or NULL)
731  * @level=1 : returns a pointer to a segment table entry (or NULL)
732  * @level=2 : returns a pointer to a region-3 table entry (or NULL)
733  * @level=3 : returns a pointer to a region-2 table entry (or NULL)
734  * @level=4 : returns a pointer to a region-1 table entry (or NULL)
735  *
736  * Returns NULL if the gmap page tables could not be walked to the
737  * requested level.
738  *
739  * Note: Can also be called for shadow gmaps.
740  */
unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
742 {
743 	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
744 	unsigned long *table = gmap->table;
745 
746 	if (gmap_is_shadow(gmap) && gmap->removed)
747 		return NULL;
748 
749 	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
750 		return NULL;
751 
752 	if (asce_type != _ASCE_TYPE_REGION1 &&
753 	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
754 		return NULL;
755 
756 	switch (asce_type) {
757 	case _ASCE_TYPE_REGION1:
758 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
759 		if (level == 4)
760 			break;
761 		if (*table & _REGION_ENTRY_INVALID)
762 			return NULL;
763 		table = __va(*table & _REGION_ENTRY_ORIGIN);
764 		fallthrough;
765 	case _ASCE_TYPE_REGION2:
766 		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
767 		if (level == 3)
768 			break;
769 		if (*table & _REGION_ENTRY_INVALID)
770 			return NULL;
771 		table = __va(*table & _REGION_ENTRY_ORIGIN);
772 		fallthrough;
773 	case _ASCE_TYPE_REGION3:
774 		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
775 		if (level == 2)
776 			break;
777 		if (*table & _REGION_ENTRY_INVALID)
778 			return NULL;
779 		table = __va(*table & _REGION_ENTRY_ORIGIN);
780 		fallthrough;
781 	case _ASCE_TYPE_SEGMENT:
782 		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
783 		if (level == 1)
784 			break;
785 		if (*table & _REGION_ENTRY_INVALID)
786 			return NULL;
787 		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
788 		table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
789 	}
790 	return table;
791 }
792 EXPORT_SYMBOL(gmap_table_walk);
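
/*
 * Illustrative examples for the @level parameter (assuming a gmap created
 * with a 4 TiB limit, i.e. a region-3 table at the top):
 *
 *	gmap_table_walk(gmap, gaddr, 1)	-> segment table entry for gaddr
 *	gmap_table_walk(gmap, gaddr, 0)	-> page table entry for gaddr
 *	gmap_table_walk(gmap, gaddr, 4)	-> NULL (level exceeds the ASCE type
 *					   and triggers the WARN_ON_ONCE above)
 */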
793 
794 /**
795  * gmap_pte_op_walk - walk the gmap page table, get the page table lock
796  *		      and return the pte pointer
797  * @gmap: pointer to guest mapping meta data structure
798  * @gaddr: virtual address in the guest address space
799  * @ptl: pointer to the spinlock pointer
800  *
801  * Returns a pointer to the locked pte for a guest address, or NULL
802  */
static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
804 			       spinlock_t **ptl)
805 {
806 	unsigned long *table;
807 
808 	BUG_ON(gmap_is_shadow(gmap));
809 	/* Walk the gmap page table, lock and get pte pointer */
810 	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
811 	if (!table || *table & _SEGMENT_ENTRY_INVALID)
812 		return NULL;
813 	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
814 }
815 
816 /**
817  * gmap_pte_op_fixup - force a page in and connect the gmap page table
818  * @gmap: pointer to guest mapping meta data structure
819  * @gaddr: virtual address in the guest address space
820  * @vmaddr: address in the host process address space
821  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
822  *
823  * Returns 0 if the caller can retry __gmap_translate (might fail again),
824  * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
825  * up or connecting the gmap page table.
826  */
static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
828 			     unsigned long vmaddr, int prot)
829 {
830 	struct mm_struct *mm = gmap->mm;
831 	unsigned int fault_flags;
832 	bool unlocked = false;
833 
834 	BUG_ON(gmap_is_shadow(gmap));
835 	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
836 	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
837 		return -EFAULT;
838 	if (unlocked)
839 		/* lost mmap_lock, caller has to retry __gmap_translate */
840 		return 0;
841 	/* Connect the page tables */
842 	return __gmap_link(gmap, gaddr, vmaddr);
843 }
844 
845 /**
846  * gmap_pte_op_end - release the page table lock
847  * @ptep: pointer to the locked pte
848  * @ptl: pointer to the page table spinlock
849  */
static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
851 {
852 	pte_unmap_unlock(ptep, ptl);
853 }
854 
855 /**
856  * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
857  *		      and return the pmd pointer
858  * @gmap: pointer to guest mapping meta data structure
859  * @gaddr: virtual address in the guest address space
860  *
861  * Returns a pointer to the pmd for a guest address, or NULL
862  */
static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
864 {
865 	pmd_t *pmdp;
866 
867 	BUG_ON(gmap_is_shadow(gmap));
868 	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
869 	if (!pmdp)
870 		return NULL;
871 
872 	/* without huge pages, there is no need to take the table lock */
873 	if (!gmap->mm->context.allow_gmap_hpage_1m)
874 		return pmd_none(*pmdp) ? NULL : pmdp;
875 
876 	spin_lock(&gmap->guest_table_lock);
877 	if (pmd_none(*pmdp)) {
878 		spin_unlock(&gmap->guest_table_lock);
879 		return NULL;
880 	}
881 
882 	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
883 	if (!pmd_leaf(*pmdp))
884 		spin_unlock(&gmap->guest_table_lock);
885 	return pmdp;
886 }
887 
888 /**
889  * gmap_pmd_op_end - release the guest_table_lock if needed
890  * @gmap: pointer to the guest mapping meta data structure
891  * @pmdp: pointer to the pmd
892  */
static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
894 {
895 	if (pmd_leaf(*pmdp))
896 		spin_unlock(&gmap->guest_table_lock);
897 }
898 
899 /*
900  * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
901  * @pmdp: pointer to the pmd to be protected
902  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
903  * @bits: notification bits to set
904  *
905  * Returns:
906  * 0 if successfully protected
907  * -EAGAIN if a fixup is needed
908  * -EINVAL if unsupported notifier bits have been specified
909  *
910  * Expected to be called with sg->mm->mmap_lock in read and
911  * guest_table_lock held.
912  */
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
914 			    pmd_t *pmdp, int prot, unsigned long bits)
915 {
916 	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
917 	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
918 	pmd_t new = *pmdp;
919 
920 	/* Fixup needed */
921 	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
922 		return -EAGAIN;
923 
924 	if (prot == PROT_NONE && !pmd_i) {
925 		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
926 		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
927 	}
928 
929 	if (prot == PROT_READ && !pmd_p) {
930 		new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
931 		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
932 		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
933 	}
934 
935 	if (bits & GMAP_NOTIFY_MPROT)
936 		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
937 
938 	/* Shadow GMAP protection needs split PMDs */
939 	if (bits & GMAP_NOTIFY_SHADOW)
940 		return -EINVAL;
941 
942 	return 0;
943 }
944 
945 /*
946  * gmap_protect_pte - remove access rights to memory and set pgste bits
947  * @gmap: pointer to guest mapping meta data structure
948  * @gaddr: virtual address in the guest address space
949  * @pmdp: pointer to the pmd associated with the pte
950  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
951  * @bits: notification bits to set
952  *
953  * Returns 0 if successfully protected, -ENOMEM if out of memory and
954  * -EAGAIN if a fixup is needed.
955  *
956  * Expected to be called with sg->mm->mmap_lock in read
957  */
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
959 			    pmd_t *pmdp, int prot, unsigned long bits)
960 {
961 	int rc;
962 	pte_t *ptep;
963 	spinlock_t *ptl;
964 	unsigned long pbits = 0;
965 
966 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
967 		return -EAGAIN;
968 
969 	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
970 	if (!ptep)
971 		return -ENOMEM;
972 
973 	pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
974 	pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
975 	/* Protect and unlock. */
976 	rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
977 	gmap_pte_op_end(ptep, ptl);
978 	return rc;
979 }
980 
981 /*
 * gmap_protect_one - remove access rights to memory and set pgste bits
983  * @gmap: pointer to guest mapping meta data structure
984  * @gaddr: virtual address in the guest address space
986  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
987  * @bits: pgste notification bits to set
988  *
989  * Returns:
990  *   PAGE_SIZE if a small page was successfully protected;
991  *   HPAGE_SIZE if a large page was successfully protected;
992  *   -ENOMEM if out of memory;
993  *   -EFAULT if gaddr is invalid (or mapping for shadows is missing);
994  *   -EAGAIN if the guest mapping is missing and should be fixed by the caller.
995  *
996  * Context: Called with sg->mm->mmap_lock in read.
997  */
int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
999 {
1000 	pmd_t *pmdp;
1001 	int rc = 0;
1002 
1003 	BUG_ON(gmap_is_shadow(gmap));
1004 
1005 	pmdp = gmap_pmd_op_walk(gmap, gaddr);
1006 	if (!pmdp)
1007 		return -EAGAIN;
1008 
1009 	if (!pmd_leaf(*pmdp)) {
1010 		rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
1011 		if (!rc)
1012 			rc = PAGE_SIZE;
1013 	} else {
1014 		rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
1015 		if (!rc)
1016 			rc = HPAGE_SIZE;
1017 	}
1018 	gmap_pmd_op_end(gmap, pmdp);
1019 
1020 	return rc;
1021 }
1022 EXPORT_SYMBOL_GPL(gmap_protect_one);
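
/*
 * Caller loop sketch (illustrative, not copied from an in-tree user):
 * protect a range read-only and fault missing mappings in on -EAGAIN.
 *
 *	while (len) {
 *		rc = gmap_protect_one(gmap, gaddr, PROT_READ, GMAP_NOTIFY_MPROT);
 *		if (rc == -EAGAIN) {
 *			(translate gaddr, fixup_user_fault(), then retry)
 *			continue;
 *		}
 *		if (rc < 0)
 *			return rc;
 *		gaddr += rc;		(PAGE_SIZE or HPAGE_SIZE)
 *		len -= min(len, (unsigned long)rc);
 *	}
 */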
1023 
1024 /**
1025  * gmap_read_table - get an unsigned long value from a guest page table using
1026  *                   absolute addressing, without marking the page referenced.
1027  * @gmap: pointer to guest mapping meta data structure
1028  * @gaddr: virtual address in the guest address space
1029  * @val: pointer to the unsigned long value to return
1030  *
1031  * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
1032  * if reading using the virtual address failed. -EINVAL if called on a gmap
1033  * shadow.
1034  *
1035  * Called with gmap->mm->mmap_lock in read.
1036  */
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
1038 {
1039 	unsigned long address, vmaddr;
1040 	spinlock_t *ptl;
1041 	pte_t *ptep, pte;
1042 	int rc;
1043 
1044 	if (gmap_is_shadow(gmap))
1045 		return -EINVAL;
1046 
1047 	while (1) {
1048 		rc = -EAGAIN;
1049 		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
1050 		if (ptep) {
1051 			pte = *ptep;
1052 			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1053 				address = pte_val(pte) & PAGE_MASK;
1054 				address += gaddr & ~PAGE_MASK;
1055 				*val = *(unsigned long *)__va(address);
1056 				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
1057 				/* Do *NOT* clear the _PAGE_INVALID bit! */
1058 				rc = 0;
1059 			}
1060 			gmap_pte_op_end(ptep, ptl);
1061 		}
1062 		if (!rc)
1063 			break;
1064 		vmaddr = __gmap_translate(gmap, gaddr);
1065 		if (IS_ERR_VALUE(vmaddr)) {
1066 			rc = vmaddr;
1067 			break;
1068 		}
1069 		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1070 		if (rc)
1071 			break;
1072 	}
1073 	return rc;
1074 }
1075 EXPORT_SYMBOL_GPL(gmap_read_table);
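
/*
 * Usage sketch (illustrative): read one 8-byte guest value while holding
 * the mmap_lock as required by the function above.
 *
 *	unsigned long val;
 *
 *	mmap_read_lock(gmap->mm);
 *	rc = gmap_read_table(gmap, gaddr, &val);
 *	mmap_read_unlock(gmap->mm);
 */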
1076 
1077 /**
1078  * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1079  * @sg: pointer to the shadow guest address space structure
1080  * @vmaddr: vm address associated with the rmap
1081  * @rmap: pointer to the rmap structure
1082  *
1083  * Called with the sg->guest_table_lock
1084  */
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1086 				    struct gmap_rmap *rmap)
1087 {
1088 	struct gmap_rmap *temp;
1089 	void __rcu **slot;
1090 
1091 	BUG_ON(!gmap_is_shadow(sg));
1092 	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1093 	if (slot) {
1094 		rmap->next = radix_tree_deref_slot_protected(slot,
1095 							&sg->guest_table_lock);
1096 		for (temp = rmap->next; temp; temp = temp->next) {
1097 			if (temp->raddr == rmap->raddr) {
1098 				kfree(rmap);
1099 				return;
1100 			}
1101 		}
1102 		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1103 	} else {
1104 		rmap->next = NULL;
1105 		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1106 				  rmap);
1107 	}
1108 }
1109 
1110 /**
1111  * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1112  * @sg: pointer to the shadow guest address space structure
1113  * @raddr: rmap address in the shadow gmap
1114  * @paddr: address in the parent guest address space
1115  * @len: length of the memory area to protect
1116  *
1117  * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1118  * if out of memory and -EFAULT if paddr is invalid.
1119  */
static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1121 			     unsigned long paddr, unsigned long len)
1122 {
1123 	struct gmap *parent;
1124 	struct gmap_rmap *rmap;
1125 	unsigned long vmaddr;
1126 	spinlock_t *ptl;
1127 	pte_t *ptep;
1128 	int rc;
1129 
1130 	BUG_ON(!gmap_is_shadow(sg));
1131 	parent = sg->parent;
1132 	while (len) {
1133 		vmaddr = __gmap_translate(parent, paddr);
1134 		if (IS_ERR_VALUE(vmaddr))
1135 			return vmaddr;
1136 		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1137 		if (!rmap)
1138 			return -ENOMEM;
1139 		rmap->raddr = raddr;
1140 		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1141 		if (rc) {
1142 			kfree(rmap);
1143 			return rc;
1144 		}
1145 		rc = -EAGAIN;
1146 		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1147 		if (ptep) {
1148 			spin_lock(&sg->guest_table_lock);
1149 			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1150 					     PGSTE_VSIE_BIT);
1151 			if (!rc)
1152 				gmap_insert_rmap(sg, vmaddr, rmap);
1153 			spin_unlock(&sg->guest_table_lock);
1154 			gmap_pte_op_end(ptep, ptl);
1155 		}
1156 		radix_tree_preload_end();
1157 		if (rc) {
1158 			kfree(rmap);
1159 			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
1160 			if (rc)
1161 				return rc;
1162 			continue;
1163 		}
1164 		paddr += PAGE_SIZE;
1165 		len -= PAGE_SIZE;
1166 	}
1167 	return 0;
1168 }
1169 
1170 #define _SHADOW_RMAP_MASK	0x7
1171 #define _SHADOW_RMAP_REGION1	0x5
1172 #define _SHADOW_RMAP_REGION2	0x4
1173 #define _SHADOW_RMAP_REGION3	0x3
1174 #define _SHADOW_RMAP_SEGMENT	0x2
1175 #define _SHADOW_RMAP_PGTABLE	0x1
1176 
1177 /**
1178  * gmap_idte_one - invalidate a single region or segment table entry
1179  * @asce: region or segment table *origin* + table-type bits
1180  * @vaddr: virtual address to identify the table entry to flush
1181  *
1182  * The invalid bit of a single region or segment table entry is set
1183  * and the associated TLB entries depending on the entry are flushed.
1184  * The table-type of the @asce identifies the portion of the @vaddr
1185  * that is used as the invalidation index.
1186  */
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1188 {
1189 	asm volatile(
1190 		"	idte	%0,0,%1"
1191 		: : "a" (asce), "a" (vaddr) : "cc", "memory");
1192 }
1193 
1194 /**
1195  * gmap_unshadow_page - remove a page from a shadow page table
1196  * @sg: pointer to the shadow guest address space structure
1197  * @raddr: rmap address in the shadow guest address space
1198  *
1199  * Called with the sg->guest_table_lock
1200  */
static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1202 {
1203 	unsigned long *table;
1204 
1205 	BUG_ON(!gmap_is_shadow(sg));
1206 	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1207 	if (!table || *table & _PAGE_INVALID)
1208 		return;
1209 	gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
1210 	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1211 }
1212 
1213 /**
1214  * __gmap_unshadow_pgt - remove all entries from a shadow page table
1215  * @sg: pointer to the shadow guest address space structure
1216  * @raddr: rmap address in the shadow guest address space
1217  * @pgt: pointer to the start of a shadow page table
1218  *
1219  * Called with the sg->guest_table_lock
1220  */
static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1222 				unsigned long *pgt)
1223 {
1224 	int i;
1225 
1226 	BUG_ON(!gmap_is_shadow(sg));
1227 	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
1228 		pgt[i] = _PAGE_INVALID;
1229 }
1230 
1231 /**
1232  * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1233  * @sg: pointer to the shadow guest address space structure
1234  * @raddr: address in the shadow guest address space
1235  *
1236  * Called with the sg->guest_table_lock
1237  */
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1239 {
1240 	unsigned long *ste;
1241 	phys_addr_t sto, pgt;
1242 	struct ptdesc *ptdesc;
1243 
1244 	BUG_ON(!gmap_is_shadow(sg));
1245 	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1246 	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1247 		return;
1248 	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1249 	sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1250 	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1251 	pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
1252 	*ste = _SEGMENT_ENTRY_EMPTY;
1253 	__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1254 	/* Free page table */
1255 	ptdesc = page_ptdesc(phys_to_page(pgt));
1256 	page_table_free_pgste(ptdesc);
1257 }
1258 
1259 /**
1260  * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1261  * @sg: pointer to the shadow guest address space structure
1262  * @raddr: rmap address in the shadow guest address space
1263  * @sgt: pointer to the start of a shadow segment table
1264  *
1265  * Called with the sg->guest_table_lock
1266  */
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1268 				unsigned long *sgt)
1269 {
1270 	struct ptdesc *ptdesc;
1271 	phys_addr_t pgt;
1272 	int i;
1273 
1274 	BUG_ON(!gmap_is_shadow(sg));
1275 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1276 		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1277 			continue;
1278 		pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
1279 		sgt[i] = _SEGMENT_ENTRY_EMPTY;
1280 		__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1281 		/* Free page table */
1282 		ptdesc = page_ptdesc(phys_to_page(pgt));
1283 		page_table_free_pgste(ptdesc);
1284 	}
1285 }
1286 
1287 /**
1288  * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1289  * @sg: pointer to the shadow guest address space structure
1290  * @raddr: rmap address in the shadow guest address space
1291  *
1292  * Called with the shadow->guest_table_lock
1293  */
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1295 {
1296 	unsigned long r3o, *r3e;
1297 	phys_addr_t sgt;
1298 	struct page *page;
1299 
1300 	BUG_ON(!gmap_is_shadow(sg));
1301 	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1302 	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1303 		return;
1304 	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1305 	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1306 	gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
1307 	sgt = *r3e & _REGION_ENTRY_ORIGIN;
1308 	*r3e = _REGION3_ENTRY_EMPTY;
1309 	__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1310 	/* Free segment table */
1311 	page = phys_to_page(sgt);
1312 	__free_pages(page, CRST_ALLOC_ORDER);
1313 }
1314 
1315 /**
1316  * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1317  * @sg: pointer to the shadow guest address space structure
1318  * @raddr: address in the shadow guest address space
1319  * @r3t: pointer to the start of a shadow region-3 table
1320  *
1321  * Called with the sg->guest_table_lock
1322  */
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1324 				unsigned long *r3t)
1325 {
1326 	struct page *page;
1327 	phys_addr_t sgt;
1328 	int i;
1329 
1330 	BUG_ON(!gmap_is_shadow(sg));
1331 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1332 		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1333 			continue;
1334 		sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
1335 		r3t[i] = _REGION3_ENTRY_EMPTY;
1336 		__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1337 		/* Free segment table */
1338 		page = phys_to_page(sgt);
1339 		__free_pages(page, CRST_ALLOC_ORDER);
1340 	}
1341 }
1342 
1343 /**
1344  * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1345  * @sg: pointer to the shadow guest address space structure
1346  * @raddr: rmap address in the shadow guest address space
1347  *
1348  * Called with the sg->guest_table_lock
1349  */
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1351 {
1352 	unsigned long r2o, *r2e;
1353 	phys_addr_t r3t;
1354 	struct page *page;
1355 
1356 	BUG_ON(!gmap_is_shadow(sg));
1357 	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1358 	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1359 		return;
1360 	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1361 	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1362 	gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
1363 	r3t = *r2e & _REGION_ENTRY_ORIGIN;
1364 	*r2e = _REGION2_ENTRY_EMPTY;
1365 	__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1366 	/* Free region 3 table */
1367 	page = phys_to_page(r3t);
1368 	__free_pages(page, CRST_ALLOC_ORDER);
1369 }
1370 
1371 /**
1372  * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1373  * @sg: pointer to the shadow guest address space structure
1374  * @raddr: rmap address in the shadow guest address space
1375  * @r2t: pointer to the start of a shadow region-2 table
1376  *
1377  * Called with the sg->guest_table_lock
1378  */
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1380 				unsigned long *r2t)
1381 {
1382 	phys_addr_t r3t;
1383 	struct page *page;
1384 	int i;
1385 
1386 	BUG_ON(!gmap_is_shadow(sg));
1387 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1388 		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1389 			continue;
1390 		r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
1391 		r2t[i] = _REGION2_ENTRY_EMPTY;
1392 		__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1393 		/* Free region 3 table */
1394 		page = phys_to_page(r3t);
1395 		__free_pages(page, CRST_ALLOC_ORDER);
1396 	}
1397 }
1398 
1399 /**
1400  * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1401  * @sg: pointer to the shadow guest address space structure
1402  * @raddr: rmap address in the shadow guest address space
1403  *
1404  * Called with the sg->guest_table_lock
1405  */
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1407 {
1408 	unsigned long r1o, *r1e;
1409 	struct page *page;
1410 	phys_addr_t r2t;
1411 
1412 	BUG_ON(!gmap_is_shadow(sg));
1413 	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1414 	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1415 		return;
1416 	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1417 	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1418 	gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
1419 	r2t = *r1e & _REGION_ENTRY_ORIGIN;
1420 	*r1e = _REGION1_ENTRY_EMPTY;
1421 	__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1422 	/* Free region 2 table */
1423 	page = phys_to_page(r2t);
1424 	__free_pages(page, CRST_ALLOC_ORDER);
1425 }
1426 
1427 /**
1428  * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1429  * @sg: pointer to the shadow guest address space structure
1430  * @raddr: rmap address in the shadow guest address space
1431  * @r1t: pointer to the start of a shadow region-1 table
1432  *
1433  * Called with the shadow->guest_table_lock
1434  */
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1436 				unsigned long *r1t)
1437 {
1438 	unsigned long asce;
1439 	struct page *page;
1440 	phys_addr_t r2t;
1441 	int i;
1442 
1443 	BUG_ON(!gmap_is_shadow(sg));
1444 	asce = __pa(r1t) | _ASCE_TYPE_REGION1;
1445 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1446 		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1447 			continue;
1448 		r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
1449 		__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1450 		/* Clear entry and flush translation r1t -> r2t */
1451 		gmap_idte_one(asce, raddr);
1452 		r1t[i] = _REGION1_ENTRY_EMPTY;
1453 		/* Free region 2 table */
1454 		page = phys_to_page(r2t);
1455 		__free_pages(page, CRST_ALLOC_ORDER);
1456 	}
1457 }
1458 
1459 /**
1460  * gmap_unshadow - remove a shadow page table completely
1461  * @sg: pointer to the shadow guest address space structure
1462  *
1463  * Called with sg->guest_table_lock
1464  */
void gmap_unshadow(struct gmap *sg)
1466 {
1467 	unsigned long *table;
1468 
1469 	BUG_ON(!gmap_is_shadow(sg));
1470 	if (sg->removed)
1471 		return;
1472 	sg->removed = 1;
1473 	gmap_call_notifier(sg, 0, -1UL);
1474 	gmap_flush_tlb(sg);
1475 	table = __va(sg->asce & _ASCE_ORIGIN);
1476 	switch (sg->asce & _ASCE_TYPE_MASK) {
1477 	case _ASCE_TYPE_REGION1:
1478 		__gmap_unshadow_r1t(sg, 0, table);
1479 		break;
1480 	case _ASCE_TYPE_REGION2:
1481 		__gmap_unshadow_r2t(sg, 0, table);
1482 		break;
1483 	case _ASCE_TYPE_REGION3:
1484 		__gmap_unshadow_r3t(sg, 0, table);
1485 		break;
1486 	case _ASCE_TYPE_SEGMENT:
1487 		__gmap_unshadow_sgt(sg, 0, table);
1488 		break;
1489 	}
1490 }
1491 EXPORT_SYMBOL(gmap_unshadow);
1492 
1493 /**
1494  * gmap_shadow_r2t - create an empty shadow region 2 table
1495  * @sg: pointer to the shadow guest address space structure
1496  * @saddr: faulting address in the shadow gmap
1497  * @r2t: parent gmap address of the region 2 table to get shadowed
1498  * @fake: r2t references contiguous guest memory block, not a r2t
1499  *
1500  * The r2t parameter specifies the address of the source table. The
1501  * four pages of the source table are made read-only in the parent gmap
1502  * address space. A write to the source table area @r2t will automatically
1503  * remove the shadow r2 table and all of its descendants.
1504  *
1505  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1506  * shadow table structure is incomplete, -ENOMEM if out of memory and
1507  * -EFAULT if an address in the parent gmap could not be resolved.
1508  *
1509  * Called with sg->mm->mmap_lock in read.
1510  */
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1512 		    int fake)
1513 {
1514 	unsigned long raddr, origin, offset, len;
1515 	unsigned long *table;
1516 	phys_addr_t s_r2t;
1517 	struct page *page;
1518 	int rc;
1519 
1520 	BUG_ON(!gmap_is_shadow(sg));
1521 	/* Allocate a shadow region second table */
1522 	page = gmap_alloc_crst();
1523 	if (!page)
1524 		return -ENOMEM;
1525 	s_r2t = page_to_phys(page);
1526 	/* Install shadow region second table */
1527 	spin_lock(&sg->guest_table_lock);
1528 	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1529 	if (!table) {
1530 		rc = -EAGAIN;		/* Race with unshadow */
1531 		goto out_free;
1532 	}
1533 	if (!(*table & _REGION_ENTRY_INVALID)) {
1534 		rc = 0;			/* Already established */
1535 		goto out_free;
1536 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1537 		rc = -EAGAIN;		/* Race with shadow */
1538 		goto out_free;
1539 	}
1540 	crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
1541 	/* mark as invalid as long as the parent table is not protected */
1542 	*table = s_r2t | _REGION_ENTRY_LENGTH |
1543 		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1544 	if (sg->edat_level >= 1)
1545 		*table |= (r2t & _REGION_ENTRY_PROTECT);
1546 	if (fake) {
1547 		/* nothing to protect for fake tables */
1548 		*table &= ~_REGION_ENTRY_INVALID;
1549 		spin_unlock(&sg->guest_table_lock);
1550 		return 0;
1551 	}
1552 	spin_unlock(&sg->guest_table_lock);
1553 	/* Make r2t read-only in parent gmap page table */
1554 	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1555 	origin = r2t & _REGION_ENTRY_ORIGIN;
1556 	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1557 	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1558 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1559 	spin_lock(&sg->guest_table_lock);
1560 	if (!rc) {
1561 		table = gmap_table_walk(sg, saddr, 4);
1562 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
1563 			rc = -EAGAIN;		/* Race with unshadow */
1564 		else
1565 			*table &= ~_REGION_ENTRY_INVALID;
1566 	} else {
1567 		gmap_unshadow_r2t(sg, raddr);
1568 	}
1569 	spin_unlock(&sg->guest_table_lock);
1570 	return rc;
1571 out_free:
1572 	spin_unlock(&sg->guest_table_lock);
1573 	__free_pages(page, CRST_ALLOC_ORDER);
1574 	return rc;
1575 }
1576 EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
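
/*
 * Summary of the shadow-table creation protocol used above (the same
 * pattern repeats for the lower table levels):
 *
 *	1. allocate the shadow table
 *	2. install it with _REGION_ENTRY_INVALID still set
 *	3. drop guest_table_lock and write-protect the parent table via
 *	   gmap_protect_rmap()
 *	4. retake the lock; clear the invalid bit unless an unshadow raced,
 *	   in which case -EAGAIN is returned
 */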
1577 
1578 /**
1579  * gmap_shadow_r3t - create a shadow region 3 table
1580  * @sg: pointer to the shadow guest address space structure
1581  * @saddr: faulting address in the shadow gmap
1582  * @r3t: parent gmap address of the region 3 table to get shadowed
1583  * @fake: r3t references contiguous guest memory block, not a r3t
1584  *
1585  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1586  * shadow table structure is incomplete, -ENOMEM if out of memory and
1587  * -EFAULT if an address in the parent gmap could not be resolved.
1588  *
1589  * Called with sg->mm->mmap_lock in read.
1590  */
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1592 		    int fake)
1593 {
1594 	unsigned long raddr, origin, offset, len;
1595 	unsigned long *table;
1596 	phys_addr_t s_r3t;
1597 	struct page *page;
1598 	int rc;
1599 
1600 	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region third table */
1602 	page = gmap_alloc_crst();
1603 	if (!page)
1604 		return -ENOMEM;
1605 	s_r3t = page_to_phys(page);
	/* Install shadow region third table */
1607 	spin_lock(&sg->guest_table_lock);
1608 	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1609 	if (!table) {
1610 		rc = -EAGAIN;		/* Race with unshadow */
1611 		goto out_free;
1612 	}
1613 	if (!(*table & _REGION_ENTRY_INVALID)) {
1614 		rc = 0;			/* Already established */
1615 		goto out_free;
1616 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1617 		rc = -EAGAIN;		/* Race with shadow */
1618 		goto out_free;
1619 	}
1620 	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
1621 	/* mark as invalid as long as the parent table is not protected */
1622 	*table = s_r3t | _REGION_ENTRY_LENGTH |
1623 		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1624 	if (sg->edat_level >= 1)
1625 		*table |= (r3t & _REGION_ENTRY_PROTECT);
1626 	if (fake) {
1627 		/* nothing to protect for fake tables */
1628 		*table &= ~_REGION_ENTRY_INVALID;
1629 		spin_unlock(&sg->guest_table_lock);
1630 		return 0;
1631 	}
1632 	spin_unlock(&sg->guest_table_lock);
1633 	/* Make r3t read-only in parent gmap page table */
1634 	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1635 	origin = r3t & _REGION_ENTRY_ORIGIN;
1636 	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1637 	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1638 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1639 	spin_lock(&sg->guest_table_lock);
1640 	if (!rc) {
1641 		table = gmap_table_walk(sg, saddr, 3);
1642 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
1643 			rc = -EAGAIN;		/* Race with unshadow */
1644 		else
1645 			*table &= ~_REGION_ENTRY_INVALID;
1646 	} else {
1647 		gmap_unshadow_r3t(sg, raddr);
1648 	}
1649 	spin_unlock(&sg->guest_table_lock);
1650 	return rc;
1651 out_free:
1652 	spin_unlock(&sg->guest_table_lock);
1653 	__free_pages(page, CRST_ALLOC_ORDER);
1654 	return rc;
1655 }
1656 EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1657 
1658 /**
1659  * gmap_shadow_sgt - create a shadow segment table
1660  * @sg: pointer to the shadow guest address space structure
1661  * @saddr: faulting address in the shadow gmap
1662  * @sgt: parent gmap address of the segment table to get shadowed
1663  * @fake: sgt references contiguous guest memory block, not a sgt
1664  *
1665  * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1666  * shadow table structure is incomplete, -ENOMEM if out of memory and
1667  * -EFAULT if an address in the parent gmap could not be resolved.
1668  *
1669  * Called with sg->mm->mmap_lock in read.
1670  */
1671 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1672 		    int fake)
1673 {
1674 	unsigned long raddr, origin, offset, len;
1675 	unsigned long *table;
1676 	phys_addr_t s_sgt;
1677 	struct page *page;
1678 	int rc;
1679 
1680 	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1681 	/* Allocate a shadow segment table */
1682 	page = gmap_alloc_crst();
1683 	if (!page)
1684 		return -ENOMEM;
1685 	s_sgt = page_to_phys(page);
1686 	/* Install shadow segment table */
1687 	spin_lock(&sg->guest_table_lock);
1688 	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1689 	if (!table) {
1690 		rc = -EAGAIN;		/* Race with unshadow */
1691 		goto out_free;
1692 	}
1693 	if (!(*table & _REGION_ENTRY_INVALID)) {
1694 		rc = 0;			/* Already established */
1695 		goto out_free;
1696 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1697 		rc = -EAGAIN;		/* Race with shadow */
1698 		goto out_free;
1699 	}
1700 	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
1701 	/* mark as invalid as long as the parent table is not protected */
1702 	*table = s_sgt | _REGION_ENTRY_LENGTH |
1703 		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1704 	if (sg->edat_level >= 1)
1705 		*table |= sgt & _REGION_ENTRY_PROTECT;
1706 	if (fake) {
1707 		/* nothing to protect for fake tables */
1708 		*table &= ~_REGION_ENTRY_INVALID;
1709 		spin_unlock(&sg->guest_table_lock);
1710 		return 0;
1711 	}
1712 	spin_unlock(&sg->guest_table_lock);
1713 	/* Make sgt read-only in parent gmap page table */
1714 	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1715 	origin = sgt & _REGION_ENTRY_ORIGIN;
1716 	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1717 	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1718 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1719 	spin_lock(&sg->guest_table_lock);
1720 	if (!rc) {
1721 		table = gmap_table_walk(sg, saddr, 2);
1722 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
1723 			rc = -EAGAIN;		/* Race with unshadow */
1724 		else
1725 			*table &= ~_REGION_ENTRY_INVALID;
1726 	} else {
1727 		gmap_unshadow_sgt(sg, raddr);
1728 	}
1729 	spin_unlock(&sg->guest_table_lock);
1730 	return rc;
1731 out_free:
1732 	spin_unlock(&sg->guest_table_lock);
1733 	__free_pages(page, CRST_ALLOC_ORDER);
1734 	return rc;
1735 }
1736 EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1737 
1738 static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
1739 {
1740 	unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
1741 
1742 	pgstes += _PAGE_ENTRIES;
1743 
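	/*
	 * The PGSTEs follow the 256 PTEs in the same 4 KB page. Spread the
	 * parent page table address over the ST2 software field of the
	 * first four PGSTEs, 16 bits per entry, most significant bits first.
	 */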
1744 	pgstes[0] &= ~PGSTE_ST2_MASK;
1745 	pgstes[1] &= ~PGSTE_ST2_MASK;
1746 	pgstes[2] &= ~PGSTE_ST2_MASK;
1747 	pgstes[3] &= ~PGSTE_ST2_MASK;
1748 
1749 	pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
1750 	pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
1751 	pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
1752 	pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
1753 }
1754 
1755 /**
1756  * gmap_shadow_pgt - instantiate a shadow page table
1757  * @sg: pointer to the shadow guest address space structure
1758  * @saddr: faulting address in the shadow gmap
1759  * @pgt: parent gmap address of the page table to get shadowed
1760  * @fake: pgt references contiguous guest memory block, not a pgtable
1761  *
1762  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1763  * shadow table structure is incomplete, -ENOMEM if out of memory and
1764  * -EFAULT if an address in the parent gmap could not be resolved.
1765  *
1766  * Called with sg->mm->mmap_lock in read.
1767  */
1768 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
1769 		    int fake)
1770 {
1771 	unsigned long raddr, origin;
1772 	unsigned long *table;
1773 	struct ptdesc *ptdesc;
1774 	phys_addr_t s_pgt;
1775 	int rc;
1776 
1777 	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
1778 	/* Allocate a shadow page table */
1779 	ptdesc = page_table_alloc_pgste(sg->mm);
1780 	if (!ptdesc)
1781 		return -ENOMEM;
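	/*
	 * Record the parent address of the shadowed page table (plus the
	 * fake marker) in the PGSTEs of the shadow page table, so it can
	 * be looked up again later.
	 */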
1782 	origin = pgt & _SEGMENT_ENTRY_ORIGIN;
1783 	if (fake)
1784 		origin |= GMAP_SHADOW_FAKE_TABLE;
1785 	gmap_pgste_set_pgt_addr(ptdesc, origin);
1786 	s_pgt = page_to_phys(ptdesc_page(ptdesc));
1787 	/* Install shadow page table */
1788 	spin_lock(&sg->guest_table_lock);
1789 	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1790 	if (!table) {
1791 		rc = -EAGAIN;		/* Race with unshadow */
1792 		goto out_free;
1793 	}
1794 	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
1795 		rc = 0;			/* Already established */
1796 		goto out_free;
1797 	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
1798 		rc = -EAGAIN;		/* Race with shadow */
1799 		goto out_free;
1800 	}
1801 	/* mark as invalid as long as the parent table is not protected */
1802 	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
1803 		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
1804 	if (fake) {
1805 		/* nothing to protect for fake tables */
1806 		*table &= ~_SEGMENT_ENTRY_INVALID;
1807 		spin_unlock(&sg->guest_table_lock);
1808 		return 0;
1809 	}
1810 	spin_unlock(&sg->guest_table_lock);
1811 	/* Make pgt read-only in parent gmap page table (not the pgste) */
1812 	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
1813 	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
1814 	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
1815 	spin_lock(&sg->guest_table_lock);
1816 	if (!rc) {
1817 		table = gmap_table_walk(sg, saddr, 1);
1818 		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
1819 			rc = -EAGAIN;		/* Race with unshadow */
1820 		else
1821 			*table &= ~_SEGMENT_ENTRY_INVALID;
1822 	} else {
1823 		gmap_unshadow_pgt(sg, raddr);
1824 	}
1825 	spin_unlock(&sg->guest_table_lock);
1826 	return rc;
1827 out_free:
1828 	spin_unlock(&sg->guest_table_lock);
1829 	page_table_free_pgste(ptdesc);
1830 	return rc;
1832 }
1833 EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
1834 
1835 /**
1836  * gmap_shadow_page - create a shadow page mapping
1837  * @sg: pointer to the shadow guest address space structure
1838  * @saddr: faulting address in the shadow gmap
1839  * @pte: pte in parent gmap address space to get shadowed
1840  *
1841  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1842  * shadow table structure is incomplete, -ENOMEM if out of memory and
1843  * -EFAULT if an address in the parent gmap could not be resolved.
1844  *
1845  * Called with sg->mm->mmap_lock in read.
1846  */
1847 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
1848 {
1849 	struct gmap *parent;
1850 	struct gmap_rmap *rmap;
1851 	unsigned long vmaddr, paddr;
1852 	spinlock_t *ptl;
1853 	pte_t *sptep, *tptep;
1854 	int prot;
1855 	int rc;
1856 
1857 	BUG_ON(!gmap_is_shadow(sg));
1858 	parent = sg->parent;
1859 	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
1860 
1861 	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1862 	if (!rmap)
1863 		return -ENOMEM;
1864 	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
1865 
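	/*
	 * Translate the parent guest address of the page, walk to the
	 * corresponding pte in the parent gmap and shadow it into the
	 * shadow page table. If the parent pte is not yet mapped with the
	 * required protection, resolve that and retry.
	 */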
1866 	while (1) {
1867 		paddr = pte_val(pte) & PAGE_MASK;
1868 		vmaddr = __gmap_translate(parent, paddr);
1869 		if (IS_ERR_VALUE(vmaddr)) {
1870 			rc = vmaddr;
1871 			break;
1872 		}
1873 		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1874 		if (rc)
1875 			break;
1876 		rc = -EAGAIN;
1877 		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
1878 		if (sptep) {
1879 			spin_lock(&sg->guest_table_lock);
1880 			/* Get page table pointer */
1881 			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
1882 			if (!tptep) {
1883 				spin_unlock(&sg->guest_table_lock);
1884 				gmap_pte_op_end(sptep, ptl);
1885 				radix_tree_preload_end();
1886 				break;
1887 			}
1888 			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
1889 			if (rc > 0) {
1890 				/* Success and a new mapping */
1891 				gmap_insert_rmap(sg, vmaddr, rmap);
1892 				rmap = NULL;
1893 				rc = 0;
1894 			}
1895 			gmap_pte_op_end(sptep, ptl);
1896 			spin_unlock(&sg->guest_table_lock);
1897 		}
1898 		radix_tree_preload_end();
1899 		if (!rc)
1900 			break;
1901 		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
1902 		if (rc)
1903 			break;
1904 	}
1905 	kfree(rmap);
1906 	return rc;
1907 }
1908 EXPORT_SYMBOL_GPL(gmap_shadow_page);
1909 
1910 /*
1911  * gmap_shadow_notify - handle notifications for shadow gmap
1912  *
1913  * Called with sg->parent->shadow_lock.
1914  */
1915 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
1916 			       unsigned long gaddr)
1917 {
1918 	struct gmap_rmap *rmap, *rnext, *head;
1919 	unsigned long start, end, bits, raddr;
1920 
1921 	BUG_ON(!gmap_is_shadow(sg));
1922 
1923 	spin_lock(&sg->guest_table_lock);
1924 	if (sg->removed) {
1925 		spin_unlock(&sg->guest_table_lock);
1926 		return;
1927 	}
1928 	/* Check for top level table */
1929 	start = sg->orig_asce & _ASCE_ORIGIN;
1930 	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
1931 	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
1932 	    gaddr < end) {
1933 		/* The complete shadow table has to go */
1934 		gmap_unshadow(sg);
1935 		spin_unlock(&sg->guest_table_lock);
1936 		list_del(&sg->list);
1937 		gmap_put(sg);
1938 		return;
1939 	}
1940 	/* Remove the page table tree for one specific entry */
1941 	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1942 	gmap_for_each_rmap_safe(rmap, rnext, head) {
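		/*
		 * The low bits of each rmap address encode which level of
		 * shadow table references the invalidated parent page;
		 * remove the matching shadow subtree.
		 */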
1943 		bits = rmap->raddr & _SHADOW_RMAP_MASK;
1944 		raddr = rmap->raddr ^ bits;
1945 		switch (bits) {
1946 		case _SHADOW_RMAP_REGION1:
1947 			gmap_unshadow_r2t(sg, raddr);
1948 			break;
1949 		case _SHADOW_RMAP_REGION2:
1950 			gmap_unshadow_r3t(sg, raddr);
1951 			break;
1952 		case _SHADOW_RMAP_REGION3:
1953 			gmap_unshadow_sgt(sg, raddr);
1954 			break;
1955 		case _SHADOW_RMAP_SEGMENT:
1956 			gmap_unshadow_pgt(sg, raddr);
1957 			break;
1958 		case _SHADOW_RMAP_PGTABLE:
1959 			gmap_unshadow_page(sg, raddr);
1960 			break;
1961 		}
1962 		kfree(rmap);
1963 	}
1964 	spin_unlock(&sg->guest_table_lock);
1965 }
1966 
1967 /**
1968  * ptep_notify - call all invalidation callbacks for a specific pte.
1969  * @mm: pointer to the process mm_struct
1970  * @vmaddr: virtual address in the process address space
1971  * @pte: pointer to the page table entry
1972  * @bits: bits from the pgste that caused the notify call
1973  *
1974  * This function is assumed to be called with the page table lock held
1975  * for the pte to notify.
1976  */
1977 void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
1978 		 pte_t *pte, unsigned long bits)
1979 {
1980 	unsigned long offset, gaddr = 0;
1981 	struct gmap *gmap, *sg, *next;
1982 
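	/*
	 * Convert the pte's byte offset within its 2 KB page table (256
	 * entries) into the guest address offset within the 1 MB segment
	 * that the page table maps.
	 */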
1983 	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
1984 	offset = offset * (PAGE_SIZE / sizeof(pte_t));
1985 	rcu_read_lock();
1986 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
1987 		spin_lock(&gmap->guest_table_lock);
1988 		gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
1989 		spin_unlock(&gmap->guest_table_lock);
1990 		if (!IS_GADDR_VALID(gaddr))
1991 			continue;
1992 
1993 		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
1994 			spin_lock(&gmap->shadow_lock);
1995 			list_for_each_entry_safe(sg, next,
1996 						 &gmap->children, list)
1997 				gmap_shadow_notify(sg, vmaddr, gaddr);
1998 			spin_unlock(&gmap->shadow_lock);
1999 		}
2000 		if (bits & PGSTE_IN_BIT)
2001 			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
2002 	}
2003 	rcu_read_unlock();
2004 }
2005 EXPORT_SYMBOL_GPL(ptep_notify);
2006 
2007 static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
2008 			     unsigned long gaddr)
2009 {
2010 	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
2011 	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
2012 }
2013 
2014 /**
2015  * gmap_pmdp_xchg - exchange a gmap pmd with another
2016  * @gmap: pointer to the guest address space structure
2017  * @pmdp: pointer to the pmd entry
2018  * @new: replacement entry
2019  * @gaddr: the affected guest address
2020  *
2021  * This function is assumed to be called with the guest_table_lock
2022  * held.
2023  */
2024 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
2025 			   unsigned long gaddr)
2026 {
2027 	gaddr &= HPAGE_MASK;
2028 	pmdp_notify_gmap(gmap, pmdp, gaddr);
2029 	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
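	/*
	 * Flush the TLB for the old entry: use a guest-ASCE scoped IDTE
	 * where the machine supports it, a plain IDTE otherwise, and fall
	 * back to CSP on machines without IDTE.
	 */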
2030 	if (machine_has_tlb_guest())
2031 		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
2032 			    IDTE_GLOBAL);
2033 	else if (cpu_has_idte())
2034 		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
2035 	else
2036 		__pmdp_csp(pmdp);
2037 	set_pmd(pmdp, new);
2038 }
2039 
2040 static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
2041 			    int purge)
2042 {
2043 	pmd_t *pmdp;
2044 	struct gmap *gmap;
2045 	unsigned long gaddr;
2046 
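	/*
	 * For every gmap of this mm, remove the host->guest link for this
	 * address, notify listeners and clear the segment table entry,
	 * optionally purging the TLB entry with CSP.
	 */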
2047 	rcu_read_lock();
2048 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2049 		spin_lock(&gmap->guest_table_lock);
2050 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2051 		if (pmdp) {
2052 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2053 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2054 						   _SEGMENT_ENTRY_GMAP_UC |
2055 						   _SEGMENT_ENTRY));
2056 			if (purge)
2057 				__pmdp_csp(pmdp);
2058 			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
2059 		}
2060 		spin_unlock(&gmap->guest_table_lock);
2061 	}
2062 	rcu_read_unlock();
2063 }
2064 
2065 /**
2066  * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
2067  *                        flushing
2068  * @mm: pointer to the process mm_struct
2069  * @vmaddr: virtual address in the process address space
2070  */
2071 void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
2072 {
2073 	gmap_pmdp_clear(mm, vmaddr, 0);
2074 }
2075 EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2076 
2077 /**
2078  * gmap_pmdp_csp - csp all affected guest pmd entries
2079  * @mm: pointer to the process mm_struct
2080  * @vmaddr: virtual address in the process address space
2081  */
2082 void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
2083 {
2084 	gmap_pmdp_clear(mm, vmaddr, 1);
2085 }
2086 EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
2087 
2088 /**
2089  * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
2090  * @mm: pointer to the process mm_struct
2091  * @vmaddr: virtual address in the process address space
2092  */
2093 void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2094 {
2095 	unsigned long gaddr;
2096 	struct gmap *gmap;
2097 	pmd_t *pmdp;
2098 
2099 	rcu_read_lock();
2100 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2101 		spin_lock(&gmap->guest_table_lock);
2102 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2103 		if (pmdp) {
2104 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2105 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2106 						   _SEGMENT_ENTRY_GMAP_UC |
2107 						   _SEGMENT_ENTRY));
2108 			if (machine_has_tlb_guest())
2109 				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2110 					    gmap->asce, IDTE_LOCAL);
2111 			else if (cpu_has_idte())
2112 				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2113 			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2114 		}
2115 		spin_unlock(&gmap->guest_table_lock);
2116 	}
2117 	rcu_read_unlock();
2118 }
2119 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2120 
2121 /**
2122  * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
2123  * @mm: pointer to the process mm_struct
2124  * @vmaddr: virtual address in the process address space
2125  */
2126 void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2127 {
2128 	unsigned long gaddr;
2129 	struct gmap *gmap;
2130 	pmd_t *pmdp;
2131 
2132 	rcu_read_lock();
2133 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2134 		spin_lock(&gmap->guest_table_lock);
2135 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2136 		if (pmdp) {
2137 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2138 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2139 						   _SEGMENT_ENTRY_GMAP_UC |
2140 						   _SEGMENT_ENTRY));
2141 			if (machine_has_tlb_guest())
2142 				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2143 					    gmap->asce, IDTE_GLOBAL);
2144 			else if (cpu_has_idte())
2145 				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2146 			else
2147 				__pmdp_csp(pmdp);
2148 			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2149 		}
2150 		spin_unlock(&gmap->guest_table_lock);
2151 	}
2152 	rcu_read_unlock();
2153 }
2154 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2155 
2156 /**
2157  * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2158  * @gmap: pointer to guest address space
2159  * @pmdp: pointer to the pmd to be tested
2160  * @gaddr: virtual address in the guest address space
2161  *
2162  * This function is assumed to be called with the guest_table_lock
2163  * held.
2164  */
2165 static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2166 					  unsigned long gaddr)
2167 {
2168 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2169 		return false;
2170 
2171 	/* Memory that is already protected and did not change is clean */
2172 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2173 	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2174 		return false;
2175 
2176 	/* Clear UC indication and reset protection */
2177 	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
2178 	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
2179 	return true;
2180 }
2181 
2182 /**
2183  * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2184  * @gmap: pointer to guest address space
2185  * @bitmap: dirty bitmap for this pmd
2186  * @gaddr: virtual address in the guest address space
2187  * @vmaddr: virtual address in the host address space
2188  *
2189  * This function is assumed to be called with the guest_table_lock
2190  * held.
2191  */
2192 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2193 			     unsigned long gaddr, unsigned long vmaddr)
2194 {
2195 	int i;
2196 	pmd_t *pmdp;
2197 	pte_t *ptep;
2198 	spinlock_t *ptl;
2199 
2200 	pmdp = gmap_pmd_op_walk(gmap, gaddr);
2201 	if (!pmdp)
2202 		return;
2203 
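	/*
	 * A leaf pmd carries the dirty state for the whole 1 MB segment,
	 * so all 256 bits are set at once; otherwise the dirty (UC) state
	 * of each individual pte is tested and cleared.
	 */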
2204 	if (pmd_leaf(*pmdp)) {
2205 		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2206 			bitmap_fill(bitmap, _PAGE_ENTRIES);
2207 	} else {
2208 		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2209 			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2210 			if (!ptep)
2211 				continue;
2212 			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2213 				set_bit(i, bitmap);
2214 			pte_unmap_unlock(ptep, ptl);
2215 		}
2216 	}
2217 	gmap_pmd_op_end(gmap, pmdp);
2218 }
2219 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
2220 
2221 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2222 static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2223 				    unsigned long end, struct mm_walk *walk)
2224 {
2225 	struct vm_area_struct *vma = walk->vma;
2226 
2227 	split_huge_pmd(vma, pmd, addr);
2228 	return 0;
2229 }
2230 
2231 static const struct mm_walk_ops thp_split_walk_ops = {
2232 	.pmd_entry	= thp_split_walk_pmd_entry,
2233 	.walk_lock	= PGWALK_WRLOCK_VERIFY,
2234 };
2235 
2236 static inline void thp_split_mm(struct mm_struct *mm)
2237 {
2238 	struct vm_area_struct *vma;
2239 	VMA_ITERATOR(vmi, mm, 0);
2240 
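	/*
	 * Mark every VMA as no-THP and split existing huge mappings so that
	 * only 4 KB ptes remain; mm->def_flags below keeps future mappings
	 * from using THP as well.
	 */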
2241 	for_each_vma(vmi, vma) {
2242 		vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
2243 		walk_page_vma(vma, &thp_split_walk_ops, NULL);
2244 	}
2245 	mm->def_flags |= VM_NOHUGEPAGE;
2246 }
2247 #else
2248 static inline void thp_split_mm(struct mm_struct *mm)
2249 {
2250 }
2251 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2252 
2253 /*
2254  * Switch on pgstes for the userspace process (for kvm)
2255  */
2256 int s390_enable_sie(void)
2257 {
2258 	struct mm_struct *mm = current->mm;
2259 
2260 	/* Do we have pgstes? if yes, we are done */
2261 	if (mm_has_pgste(mm))
2262 		return 0;
2263 	mmap_write_lock(mm);
2264 	mm->context.has_pgste = 1;
2265 	/* split thp mappings and disable thp for future mappings */
2266 	thp_split_mm(mm);
2267 	mmap_write_unlock(mm);
2268 	return 0;
2269 }
2270 EXPORT_SYMBOL_GPL(s390_enable_sie);
2271 
2272 static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
2273 				   unsigned long end, struct mm_walk *walk)
2274 {
2275 	unsigned long *found_addr = walk->private;
2276 
2277 	/* Return 1 if the page is a zeropage. */
2278 	if (is_zero_pfn(pte_pfn(*pte))) {
2279 		/*
2280 		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
2281 		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
2282 		 * currently only works in COW mappings, which is also where
2283 		 * mm_forbids_zeropage() is checked.
2284 		 */
2285 		if (!is_cow_mapping(walk->vma->vm_flags))
2286 			return -EFAULT;
2287 
2288 		*found_addr = addr;
2289 		return 1;
2290 	}
2291 	return 0;
2292 }
2293 
2294 static const struct mm_walk_ops find_zeropage_ops = {
2295 	.pte_entry	= find_zeropage_pte_entry,
2296 	.walk_lock	= PGWALK_WRLOCK,
2297 };
2298 
2299 /*
2300  * Unshare all shared zeropages, replacing them by anonymous pages. Note that
2301  * we cannot simply zap all shared zeropages, because this could later
2302  * trigger unexpected userfaultfd missing events.
2303  *
2304  * This must be called after mm->context.allow_cow_sharing was
2305  * set to 0, to avoid future mappings of shared zeropages.
2306  *
2307  * mm contracts with s390 that, even if mm were to remove a page table
2308  * while racing with walk_page_range_vma() (so that pte_offset_map_lock()
2309  * would fail), it will never insert a page table containing empty zero
2310  * pages once mm_forbids_zeropage(mm), i.e.
2311  * mm->context.allow_cow_sharing, is set to 0.
2312  */
2313 static int __s390_unshare_zeropages(struct mm_struct *mm)
2314 {
2315 	struct vm_area_struct *vma;
2316 	VMA_ITERATOR(vmi, mm, 0);
2317 	unsigned long addr;
2318 	vm_fault_t fault;
2319 	int rc;
2320 
2321 	for_each_vma(vmi, vma) {
2322 		/*
2323 		 * We could only look at COW mappings, but it's more future
2324 		 * proof to catch unexpected zeropages in other mappings and
2325 		 * fail.
2326 		 */
2327 		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
2328 			continue;
2329 		addr = vma->vm_start;
2330 
2331 retry:
2332 		rc = walk_page_range_vma(vma, addr, vma->vm_end,
2333 					 &find_zeropage_ops, &addr);
2334 		if (rc < 0)
2335 			return rc;
2336 		else if (!rc)
2337 			continue;
2338 
2339 		/* addr was updated by find_zeropage_pte_entry() */
2340 		fault = handle_mm_fault(vma, addr,
2341 					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
2342 					NULL);
2343 		if (fault & VM_FAULT_OOM)
2344 			return -ENOMEM;
2345 		/*
2346 		 * See break_ksm(): even after handle_mm_fault() returned 0, we
2347 		 * must start the lookup from the current address, because
2348 		 * handle_mm_fault() may back out if there's any difficulty.
2349 		 *
2350 		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
2351 		 * maybe they could trigger in the future on concurrent
2352 		 * truncation. In that case, the shared zeropage would be gone
2353 		 * and we can simply retry and make progress.
2354 		 */
2355 		cond_resched();
2356 		goto retry;
2357 	}
2358 
2359 	return 0;
2360 }
2361 
2362 static int __s390_disable_cow_sharing(struct mm_struct *mm)
2363 {
2364 	int rc;
2365 
2366 	if (!mm->context.allow_cow_sharing)
2367 		return 0;
2368 
2369 	mm->context.allow_cow_sharing = 0;
2370 
2371 	/* Replace all shared zeropages by anonymous pages. */
2372 	rc = __s390_unshare_zeropages(mm);
2373 	/*
2374 	 * Make sure to disable KSM (if enabled for the whole process or
2375 	 * individual VMAs). Note that nothing currently hinders user space
2376 	 * from re-enabling it.
2377 	 */
2378 	if (!rc)
2379 		rc = ksm_disable(mm);
2380 	if (rc)
2381 		mm->context.allow_cow_sharing = 1;
2382 	return rc;
2383 }
2384 
2385 /*
2386  * Disable most COW-sharing of memory pages for the whole process:
2387  * (1) Disable KSM and unmerge/unshare any KSM pages.
2388  * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
2389  *
2390  * Note that we currently don't bother with COW-shared pages that are shared
2391  * with parent/child processes due to fork().
2392  */
2393 int s390_disable_cow_sharing(void)
2394 {
2395 	int rc;
2396 
2397 	mmap_write_lock(current->mm);
2398 	rc = __s390_disable_cow_sharing(current->mm);
2399 	mmap_write_unlock(current->mm);
2400 	return rc;
2401 }
2402 EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
2403 
2404 /*
2405  * Enable storage key handling from now on and initialize the storage
2406  * keys with the default key.
2407  */
2408 static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2409 				  unsigned long next, struct mm_walk *walk)
2410 {
2411 	/* Clear storage key */
2412 	ptep_zap_key(walk->mm, addr, pte);
2413 	return 0;
2414 }
2415 
2416 /*
2417  * Give a chance to schedule after setting the storage key on 256 pages.
2418  * We only hold the mm lock (an rwsem) and the kvm srcu;
2419  * both can sleep.
2420  */
2421 static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
2422 				  unsigned long next, struct mm_walk *walk)
2423 {
2424 	cond_resched();
2425 	return 0;
2426 }
2427 
2428 static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2429 				      unsigned long hmask, unsigned long next,
2430 				      struct mm_walk *walk)
2431 {
2432 	pmd_t *pmd = (pmd_t *)pte;
2433 	unsigned long start, end;
2434 	struct folio *folio = page_folio(pmd_page(*pmd));
2435 
2436 	/*
2437 	 * The write check makes sure we do not set a key on shared
2438 	 * memory. This is needed as the walker does not differentiate
2439 	 * between actual guest memory and the process executable or
2440 	 * shared libraries.
2441 	 */
2442 	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2443 	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2444 		return 0;
2445 
2446 	start = pmd_val(*pmd) & HPAGE_MASK;
2447 	end = start + HPAGE_SIZE;
2448 	__storage_key_init_range(start, end);
2449 	set_bit(PG_arch_1, &folio->flags);
2450 	cond_resched();
2451 	return 0;
2452 }
2453 
2454 static const struct mm_walk_ops enable_skey_walk_ops = {
2455 	.hugetlb_entry		= __s390_enable_skey_hugetlb,
2456 	.pte_entry		= __s390_enable_skey_pte,
2457 	.pmd_entry		= __s390_enable_skey_pmd,
2458 	.walk_lock		= PGWALK_WRLOCK,
2459 };
2460 
2461 int s390_enable_skey(void)
2462 {
2463 	struct mm_struct *mm = current->mm;
2464 	int rc = 0;
2465 
2466 	mmap_write_lock(mm);
2467 	if (mm_uses_skeys(mm))
2468 		goto out_up;
2469 
2470 	mm->context.uses_skeys = 1;
2471 	rc = __s390_disable_cow_sharing(mm);
2472 	if (rc) {
2473 		mm->context.uses_skeys = 0;
2474 		goto out_up;
2475 	}
2476 	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2477 
2478 out_up:
2479 	mmap_write_unlock(mm);
2480 	return rc;
2481 }
2482 EXPORT_SYMBOL_GPL(s390_enable_skey);
2483 
2484 /*
2485  * Reset CMMA state, make all pages stable again.
2486  */
2487 static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2488 			     unsigned long next, struct mm_walk *walk)
2489 {
2490 	ptep_zap_unused(walk->mm, addr, pte, 1);
2491 	return 0;
2492 }
2493 
2494 static const struct mm_walk_ops reset_cmma_walk_ops = {
2495 	.pte_entry		= __s390_reset_cmma,
2496 	.walk_lock		= PGWALK_WRLOCK,
2497 };
2498 
2499 void s390_reset_cmma(struct mm_struct *mm)
2500 {
2501 	mmap_write_lock(mm);
2502 	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2503 	mmap_write_unlock(mm);
2504 }
2505 EXPORT_SYMBOL_GPL(s390_reset_cmma);
2506 
2507 #define GATHER_GET_PAGES 32
2508 
2509 struct reset_walk_state {
2510 	unsigned long next;
2511 	unsigned long count;
2512 	unsigned long pfns[GATHER_GET_PAGES];
2513 };
2514 
2515 static int s390_gather_pages(pte_t *ptep, unsigned long addr,
2516 			     unsigned long next, struct mm_walk *walk)
2517 {
2518 	struct reset_walk_state *p = walk->private;
2519 	pte_t pte = READ_ONCE(*ptep);
2520 
2521 	if (pte_present(pte)) {
2522 		/* we have a reference from the mapping, take an extra one */
2523 		get_page(phys_to_page(pte_val(pte)));
2524 		p->pfns[p->count] = phys_to_pfn(pte_val(pte));
2525 		p->next = next;
2526 		p->count++;
2527 	}
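	/* A non-zero return value stops the page walk once the array is full */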
2528 	return p->count >= GATHER_GET_PAGES;
2529 }
2530 
2531 static const struct mm_walk_ops gather_pages_ops = {
2532 	.pte_entry = s390_gather_pages,
2533 	.walk_lock = PGWALK_RDLOCK,
2534 };
2535 
2536 /*
2537  * Call the Destroy secure page UVC on each page in the given array of PFNs.
2538  * Each page needs to have an extra reference, which will be released here.
2539  */
2540 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
2541 {
2542 	struct folio *folio;
2543 	unsigned long i;
2544 
2545 	for (i = 0; i < count; i++) {
2546 		folio = pfn_folio(pfns[i]);
2547 		/* we always have an extra reference */
2548 		uv_destroy_folio(folio);
2549 		/* get rid of the extra reference */
2550 		folio_put(folio);
2551 		cond_resched();
2552 	}
2553 }
2554 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
2555 
2556 /**
2557  * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
2558  * in the given range of the given address space.
2559  * @mm: the mm to operate on
2560  * @start: the start of the range
2561  * @end: the end of the range
2562  * @interruptible: if not 0, stop when a fatal signal is received
2563  *
2564  * Walk the given range of the given address space and call the destroy
2565  * secure page UVC on each page. Optionally exit early if a fatal signal is
2566  * pending.
2567  *
2568  * Return: 0 on success, -EINTR if the function stopped before completing
2569  */
2570 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
2571 			    unsigned long end, bool interruptible)
2572 {
2573 	struct reset_walk_state state = { .next = start };
2574 	int r = 1;
2575 
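	/*
	 * Gather up to GATHER_GET_PAGES present pages at a time under the
	 * mmap read lock, then destroy them with the lock dropped; stop
	 * when the walk reaches the end of the range or fails.
	 */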
2576 	while (r > 0) {
2577 		state.count = 0;
2578 		mmap_read_lock(mm);
2579 		r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
2580 		mmap_read_unlock(mm);
2581 		cond_resched();
2582 		s390_uv_destroy_pfns(state.count, state.pfns);
2583 		if (interruptible && fatal_signal_pending(current))
2584 			return -EINTR;
2585 	}
2586 	return 0;
2587 }
2588 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
2589 
2590 /**
2591  * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
2592  * @gmap: the gmap whose ASCE needs to be replaced
2593  *
2594  * If the ASCE is of SEGMENT type, this function returns -EINVAL; replacing
2595  * such an ASCE would leave the pointers in the host_to_guest radix tree
2596  * pointing to the wrong pages, causing use-after-free and memory corruption.
2597  * If the allocation of the new top level page table fails, the ASCE is not
2598  * replaced.
2599  * In any case, the old ASCE is always removed from the gmap CRST list.
2600  * Therefore the caller has to make sure to save a pointer to it
2601  * beforehand, unless a leak is actually intended.
2602  */
2603 int s390_replace_asce(struct gmap *gmap)
2604 {
2605 	unsigned long asce;
2606 	struct page *page;
2607 	void *table;
2608 
2609 	/* Replacing segment type ASCEs would cause serious issues */
2610 	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
2611 		return -EINVAL;
2612 
2613 	page = gmap_alloc_crst();
2614 	if (!page)
2615 		return -ENOMEM;
2616 	table = page_to_virt(page);
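	/* Copy the complete top-level (CRST) table into the replacement */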
2617 	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
2618 
2619 	/* Set new table origin while preserving existing ASCE control bits */
2620 	asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
2621 	WRITE_ONCE(gmap->asce, asce);
2622 	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
2623 	WRITE_ONCE(gmap->table, table);
2624 
2625 	return 0;
2626 }
2627 EXPORT_SYMBOL_GPL(s390_replace_asce);
2628