1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  KVM guest address space mapping code
4  *
5  *    Copyright IBM Corp. 2007, 2020
6  *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7  *		 David Hildenbrand <david@redhat.com>
8  *		 Janosch Frank <frankja@linux.vnet.ibm.com>
9  */
10 
11 #include <linux/cpufeature.h>
12 #include <linux/kernel.h>
13 #include <linux/pagewalk.h>
14 #include <linux/swap.h>
15 #include <linux/smp.h>
16 #include <linux/spinlock.h>
17 #include <linux/slab.h>
18 #include <linux/swapops.h>
19 #include <linux/ksm.h>
20 #include <linux/mman.h>
21 #include <linux/pgtable.h>
22 #include <asm/page-states.h>
23 #include <asm/pgalloc.h>
24 #include <asm/machine.h>
25 #include <asm/gmap_helpers.h>
26 #include <asm/gmap.h>
27 #include <asm/page.h>
28 
29 /*
 * The guest address is stored in the radix tree directly; a plain 0 would be
 * ambiguous, since 0 is a valid guest address but is also what a lookup
 * returns when nothing was found. The lower bits are ignored by all users of
 * the macro, so the flag can be used to distinguish the valid address 0 from
 * an empty (NULL) slot.
34  */
35 #define VALID_GADDR_FLAG 1
36 #define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
37 #define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
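
/*
 * For example, guest address 0 is stored as MAKE_VALID_GADDR(0) == 0x1: a
 * lookup that returns 0x1 means "guest address 0 is mapped", while a plain
 * NULL/0 return still means "no entry present".
 */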
38 
39 #define GMAP_SHADOW_FAKE_TABLE 1ULL
40 
41 static struct page *gmap_alloc_crst(void)
42 {
43 	struct page *page;
44 
45 	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
46 	if (!page)
47 		return NULL;
48 	__arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
49 	return page;
50 }
51 
52 /**
53  * gmap_alloc - allocate and initialize a guest address space
54  * @limit: maximum address of the gmap address space
55  *
 * Returns a guest address space structure, or NULL if out of memory.
57  */
58 struct gmap *gmap_alloc(unsigned long limit)
59 {
60 	struct gmap *gmap;
61 	struct page *page;
62 	unsigned long *table;
63 	unsigned long etype, atype;
64 
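	/*
	 * Pick the smallest table type that covers @limit: below 2 GiB a
	 * segment table is sufficient, below 4 TiB a region-3 table, below
	 * 8 PiB a region-2 table, otherwise a full region-1 table.
	 */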
65 	if (limit < _REGION3_SIZE) {
66 		limit = _REGION3_SIZE - 1;
67 		atype = _ASCE_TYPE_SEGMENT;
68 		etype = _SEGMENT_ENTRY_EMPTY;
69 	} else if (limit < _REGION2_SIZE) {
70 		limit = _REGION2_SIZE - 1;
71 		atype = _ASCE_TYPE_REGION3;
72 		etype = _REGION3_ENTRY_EMPTY;
73 	} else if (limit < _REGION1_SIZE) {
74 		limit = _REGION1_SIZE - 1;
75 		atype = _ASCE_TYPE_REGION2;
76 		etype = _REGION2_ENTRY_EMPTY;
77 	} else {
78 		limit = -1UL;
79 		atype = _ASCE_TYPE_REGION1;
80 		etype = _REGION1_ENTRY_EMPTY;
81 	}
82 	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
83 	if (!gmap)
84 		goto out;
85 	INIT_LIST_HEAD(&gmap->children);
86 	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
87 	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
88 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
89 	spin_lock_init(&gmap->guest_table_lock);
90 	spin_lock_init(&gmap->shadow_lock);
91 	refcount_set(&gmap->ref_count, 1);
92 	page = gmap_alloc_crst();
93 	if (!page)
94 		goto out_free;
95 	table = page_to_virt(page);
96 	crst_table_init(table, etype);
97 	gmap->table = table;
98 	gmap->asce = atype | _ASCE_TABLE_LENGTH |
99 		_ASCE_USER_BITS | __pa(table);
100 	gmap->asce_end = limit;
101 	return gmap;
102 
103 out_free:
104 	kfree(gmap);
105 out:
106 	return NULL;
107 }
108 EXPORT_SYMBOL_GPL(gmap_alloc);
109 
110 /**
111  * gmap_create - create a guest address space
112  * @mm: pointer to the parent mm_struct
113  * @limit: maximum size of the gmap address space
114  *
 * Returns a guest address space structure, or NULL if out of memory.
116  */
117 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
118 {
119 	struct gmap *gmap;
120 	unsigned long gmap_asce;
121 
122 	gmap = gmap_alloc(limit);
123 	if (!gmap)
124 		return NULL;
125 	gmap->mm = mm;
126 	spin_lock(&mm->context.lock);
127 	list_add_rcu(&gmap->list, &mm->context.gmap_list);
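	/*
	 * With a single gmap attached, the mm can use that gmap's ASCE
	 * directly; -1UL marks the "more than one gmap" case, which
	 * presumably forces the TLB flushing code to flush everything.
	 */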
128 	if (list_is_singular(&mm->context.gmap_list))
129 		gmap_asce = gmap->asce;
130 	else
131 		gmap_asce = -1UL;
132 	WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
133 	spin_unlock(&mm->context.lock);
134 	return gmap;
135 }
136 EXPORT_SYMBOL_GPL(gmap_create);
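
/*
 * Typical use (illustrative sketch only): create a 2 GiB guest address
 * space for the current process:
 *
 *	gmap = gmap_create(current->mm, (1UL << 31) - 1);
 *	if (!gmap)
 *		return -ENOMEM;
 */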
137 
138 static void gmap_flush_tlb(struct gmap *gmap)
139 {
140 	if (cpu_has_idte())
141 		__tlb_flush_idte(gmap->asce);
142 	else
143 		__tlb_flush_global();
144 }
145 
146 static void gmap_radix_tree_free(struct radix_tree_root *root)
147 {
148 	struct radix_tree_iter iter;
149 	unsigned long indices[16];
150 	unsigned long index;
151 	void __rcu **slot;
152 	int i, nr;
153 
154 	/* A radix tree is freed by deleting all of its entries */
155 	index = 0;
156 	do {
157 		nr = 0;
158 		radix_tree_for_each_slot(slot, root, &iter, index) {
159 			indices[nr] = iter.index;
160 			if (++nr == 16)
161 				break;
162 		}
163 		for (i = 0; i < nr; i++) {
164 			index = indices[i];
165 			radix_tree_delete(root, index);
166 		}
167 	} while (nr > 0);
168 }
169 
170 static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
171 {
172 	struct gmap_rmap *rmap, *rnext, *head;
173 	struct radix_tree_iter iter;
174 	unsigned long indices[16];
175 	unsigned long index;
176 	void __rcu **slot;
177 	int i, nr;
178 
179 	/* A radix tree is freed by deleting all of its entries */
180 	index = 0;
181 	do {
182 		nr = 0;
183 		radix_tree_for_each_slot(slot, root, &iter, index) {
184 			indices[nr] = iter.index;
185 			if (++nr == 16)
186 				break;
187 		}
188 		for (i = 0; i < nr; i++) {
189 			index = indices[i];
190 			head = radix_tree_delete(root, index);
191 			gmap_for_each_rmap_safe(rmap, rnext, head)
192 				kfree(rmap);
193 		}
194 	} while (nr > 0);
195 }
196 
197 static void gmap_free_crst(unsigned long *table, bool free_ptes)
198 {
199 	bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
200 	int i;
201 
202 	if (is_segment) {
203 		if (!free_ptes)
204 			goto out;
205 		for (i = 0; i < _CRST_ENTRIES; i++)
206 			if (!(table[i] & _SEGMENT_ENTRY_INVALID))
207 				page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
208 	} else {
209 		for (i = 0; i < _CRST_ENTRIES; i++)
210 			if (!(table[i] & _REGION_ENTRY_INVALID))
211 				gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
212 	}
213 
214 out:
215 	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
216 }
217 
218 /**
219  * gmap_free - free a guest address space
220  * @gmap: pointer to the guest address space structure
221  *
222  * No locks required. There are no references to this gmap anymore.
223  */
224 void gmap_free(struct gmap *gmap)
225 {
226 	/* Flush tlb of all gmaps (if not already done for shadows) */
227 	if (!(gmap_is_shadow(gmap) && gmap->removed))
228 		gmap_flush_tlb(gmap);
229 	/* Free all segment & region tables. */
230 	gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
231 
232 	gmap_radix_tree_free(&gmap->guest_to_host);
233 	gmap_radix_tree_free(&gmap->host_to_guest);
234 
235 	/* Free additional data for a shadow gmap */
236 	if (gmap_is_shadow(gmap)) {
237 		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
238 		/* Release reference to the parent */
239 		gmap_put(gmap->parent);
240 	}
241 
242 	kfree(gmap);
243 }
244 EXPORT_SYMBOL_GPL(gmap_free);
245 
246 /**
247  * gmap_get - increase reference counter for guest address space
248  * @gmap: pointer to the guest address space structure
249  *
250  * Returns the gmap pointer
251  */
252 struct gmap *gmap_get(struct gmap *gmap)
253 {
254 	refcount_inc(&gmap->ref_count);
255 	return gmap;
256 }
257 EXPORT_SYMBOL_GPL(gmap_get);
258 
259 /**
260  * gmap_put - decrease reference counter for guest address space
261  * @gmap: pointer to the guest address space structure
262  *
263  * If the reference counter reaches zero the guest address space is freed.
264  */
265 void gmap_put(struct gmap *gmap)
266 {
267 	if (refcount_dec_and_test(&gmap->ref_count))
268 		gmap_free(gmap);
269 }
270 EXPORT_SYMBOL_GPL(gmap_put);
271 
272 /**
273  * gmap_remove - remove a guest address space but do not free it yet
274  * @gmap: pointer to the guest address space structure
275  */
276 void gmap_remove(struct gmap *gmap)
277 {
278 	struct gmap *sg, *next;
279 	unsigned long gmap_asce;
280 
281 	/* Remove all shadow gmaps linked to this gmap */
282 	if (!list_empty(&gmap->children)) {
283 		spin_lock(&gmap->shadow_lock);
284 		list_for_each_entry_safe(sg, next, &gmap->children, list) {
285 			list_del(&sg->list);
286 			gmap_put(sg);
287 		}
288 		spin_unlock(&gmap->shadow_lock);
289 	}
	/* Remove gmap from the per-mm list */
291 	spin_lock(&gmap->mm->context.lock);
292 	list_del_rcu(&gmap->list);
293 	if (list_empty(&gmap->mm->context.gmap_list))
294 		gmap_asce = 0;
295 	else if (list_is_singular(&gmap->mm->context.gmap_list))
296 		gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
297 					     struct gmap, list)->asce;
298 	else
299 		gmap_asce = -1UL;
300 	WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
301 	spin_unlock(&gmap->mm->context.lock);
302 	synchronize_rcu();
303 	/* Put reference */
304 	gmap_put(gmap);
305 }
306 EXPORT_SYMBOL_GPL(gmap_remove);
307 
308 /*
309  * gmap_alloc_table is assumed to be called with mmap_lock held
310  */
311 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
312 			    unsigned long init, unsigned long gaddr)
313 {
314 	struct page *page;
315 	unsigned long *new;
316 
	/* since we don't free the gmap table until gmap_free() we can unlock */
318 	page = gmap_alloc_crst();
319 	if (!page)
320 		return -ENOMEM;
321 	new = page_to_virt(page);
322 	crst_table_init(new, init);
323 	spin_lock(&gmap->guest_table_lock);
324 	if (*table & _REGION_ENTRY_INVALID) {
325 		*table = __pa(new) | _REGION_ENTRY_LENGTH |
326 			(*table & _REGION_ENTRY_TYPE_MASK);
327 		page = NULL;
328 	}
329 	spin_unlock(&gmap->guest_table_lock);
330 	if (page)
331 		__free_pages(page, CRST_ALLOC_ORDER);
332 	return 0;
333 }
334 
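/*
 * The host_to_guest radix tree is keyed by the host PMD index
 * (vmaddr >> PMD_SHIFT); the stored value is the guest segment address with
 * VALID_GADDR_FLAG set, so that a mapping at guest address 0 can be told
 * apart from an empty slot.
 */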
335 static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
336 {
337 	return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
338 }
339 
340 static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
341 {
342 	return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
343 }
344 
345 static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
346 				       unsigned long *gaddr)
347 {
348 	*gaddr = host_to_guest_delete(gmap, vmaddr);
349 	if (IS_GADDR_VALID(*gaddr))
350 		return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
351 	return NULL;
352 }
353 
354 /**
355  * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
356  * @gmap: pointer to the guest address space structure
357  * @vmaddr: address in the host process address space
358  *
359  * Returns 1 if a TLB flush is required
360  */
361 static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
362 {
363 	unsigned long gaddr;
364 	int flush = 0;
365 	pmd_t *pmdp;
366 
367 	BUG_ON(gmap_is_shadow(gmap));
368 	spin_lock(&gmap->guest_table_lock);
369 
370 	pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
371 	if (pmdp) {
372 		flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
373 		*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
374 	}
375 
376 	spin_unlock(&gmap->guest_table_lock);
377 	return flush;
378 }
379 
380 /**
381  * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
382  * @gmap: pointer to the guest address space structure
383  * @gaddr: address in the guest address space
384  *
385  * Returns 1 if a TLB flush is required
386  */
387 static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
388 {
389 	unsigned long vmaddr;
390 
391 	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
392 						   gaddr >> PMD_SHIFT);
393 	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
394 }
395 
396 /**
397  * gmap_unmap_segment - unmap segment from the guest address space
398  * @gmap: pointer to the guest address space structure
399  * @to: address in the guest address space
400  * @len: length of the memory area to unmap
401  *
402  * Returns 0 if the unmap succeeded, -EINVAL if not.
403  */
404 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
405 {
406 	unsigned long off;
407 	int flush;
408 
409 	BUG_ON(gmap_is_shadow(gmap));
410 	if ((to | len) & (PMD_SIZE - 1))
411 		return -EINVAL;
412 	if (len == 0 || to + len < to)
413 		return -EINVAL;
414 
415 	flush = 0;
416 	mmap_write_lock(gmap->mm);
417 	for (off = 0; off < len; off += PMD_SIZE)
418 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
419 	mmap_write_unlock(gmap->mm);
420 	if (flush)
421 		gmap_flush_tlb(gmap);
422 	return 0;
423 }
424 EXPORT_SYMBOL_GPL(gmap_unmap_segment);
425 
426 /**
427  * gmap_map_segment - map a segment to the guest address space
428  * @gmap: pointer to the guest address space structure
429  * @from: source address in the parent address space
430  * @to: target address in the guest address space
431  * @len: length of the memory area to map
432  *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
434  */
435 int gmap_map_segment(struct gmap *gmap, unsigned long from,
436 		     unsigned long to, unsigned long len)
437 {
438 	unsigned long off;
439 	int flush;
440 
441 	BUG_ON(gmap_is_shadow(gmap));
442 	if ((from | to | len) & (PMD_SIZE - 1))
443 		return -EINVAL;
444 	if (len == 0 || from + len < from || to + len < to ||
445 	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
446 		return -EINVAL;
447 
448 	flush = 0;
449 	mmap_write_lock(gmap->mm);
450 	for (off = 0; off < len; off += PMD_SIZE) {
451 		/* Remove old translation */
452 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
453 		/* Store new translation */
454 		if (radix_tree_insert(&gmap->guest_to_host,
455 				      (to + off) >> PMD_SHIFT,
456 				      (void *) from + off))
457 			break;
458 	}
459 	mmap_write_unlock(gmap->mm);
460 	if (flush)
461 		gmap_flush_tlb(gmap);
462 	if (off >= len)
463 		return 0;
464 	gmap_unmap_segment(gmap, to, len);
465 	return -ENOMEM;
466 }
467 EXPORT_SYMBOL_GPL(gmap_map_segment);
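
/*
 * Typical use (illustrative sketch; hva, gra and size are placeholders):
 * make size bytes of the host mapping at hva appear at guest address gra,
 * all three 1 MB (PMD) aligned:
 *
 *	rc = gmap_map_segment(gmap, hva, gra, size);
 */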
468 
469 /**
470  * __gmap_translate - translate a guest address to a user space address
471  * @gmap: pointer to guest mapping meta data structure
472  * @gaddr: guest address
473  *
474  * Returns user space address which corresponds to the guest address or
475  * -EFAULT if no such mapping exists.
476  * This function does not establish potentially missing page table entries.
477  * The mmap_lock of the mm that belongs to the address space must be held
478  * when this function gets called.
479  *
480  * Note: Can also be called for shadow gmaps.
481  */
482 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
483 {
484 	unsigned long vmaddr;
485 
486 	vmaddr = (unsigned long)
487 		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
488 	/* Note: guest_to_host is empty for a shadow gmap */
489 	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
490 }
491 EXPORT_SYMBOL_GPL(__gmap_translate);
492 
493 /**
494  * gmap_unlink - disconnect a page table from the gmap shadow tables
495  * @mm: pointer to the parent mm_struct
496  * @table: pointer to the host page table
497  * @vmaddr: vm address associated with the host page table
498  */
499 void gmap_unlink(struct mm_struct *mm, unsigned long *table,
500 		 unsigned long vmaddr)
501 {
502 	struct gmap *gmap;
503 	int flush;
504 
505 	rcu_read_lock();
506 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
507 		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
508 		if (flush)
509 			gmap_flush_tlb(gmap);
510 	}
511 	rcu_read_unlock();
512 }
513 
514 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
515 			   unsigned long gaddr);
516 
517 /**
518  * __gmap_link - set up shadow page tables to connect a host to a guest address
519  * @gmap: pointer to guest mapping meta data structure
520  * @gaddr: guest address
521  * @vmaddr: vm address
522  *
523  * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
524  * if the vm address is already mapped to a different guest segment.
525  * The mmap_lock of the mm that belongs to the address space must be held
526  * when this function gets called.
527  */
528 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
529 {
530 	struct mm_struct *mm;
531 	unsigned long *table;
532 	spinlock_t *ptl;
533 	pgd_t *pgd;
534 	p4d_t *p4d;
535 	pud_t *pud;
536 	pmd_t *pmd;
537 	u64 unprot;
538 	int rc;
539 
540 	BUG_ON(gmap_is_shadow(gmap));
541 	/* Create higher level tables in the gmap page table */
542 	table = gmap->table;
543 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
544 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
545 		if ((*table & _REGION_ENTRY_INVALID) &&
546 		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
547 				     gaddr & _REGION1_MASK))
548 			return -ENOMEM;
549 		table = __va(*table & _REGION_ENTRY_ORIGIN);
550 	}
551 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
552 		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
553 		if ((*table & _REGION_ENTRY_INVALID) &&
554 		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
555 				     gaddr & _REGION2_MASK))
556 			return -ENOMEM;
557 		table = __va(*table & _REGION_ENTRY_ORIGIN);
558 	}
559 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
560 		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
561 		if ((*table & _REGION_ENTRY_INVALID) &&
562 		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
563 				     gaddr & _REGION3_MASK))
564 			return -ENOMEM;
565 		table = __va(*table & _REGION_ENTRY_ORIGIN);
566 	}
567 	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
568 	/* Walk the parent mm page table */
569 	mm = gmap->mm;
570 	pgd = pgd_offset(mm, vmaddr);
571 	VM_BUG_ON(pgd_none(*pgd));
572 	p4d = p4d_offset(pgd, vmaddr);
573 	VM_BUG_ON(p4d_none(*p4d));
574 	pud = pud_offset(p4d, vmaddr);
575 	VM_BUG_ON(pud_none(*pud));
576 	/* large puds cannot yet be handled */
577 	if (pud_leaf(*pud))
578 		return -EFAULT;
579 	pmd = pmd_offset(pud, vmaddr);
580 	VM_BUG_ON(pmd_none(*pmd));
581 	/* Are we allowed to use huge pages? */
582 	if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
583 		return -EFAULT;
584 	/* Link gmap segment table entry location to page table. */
585 	rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
586 	if (rc)
587 		return rc;
588 	ptl = pmd_lock(mm, pmd);
589 	spin_lock(&gmap->guest_table_lock);
590 	if (*table == _SEGMENT_ENTRY_EMPTY) {
591 		rc = radix_tree_insert(&gmap->host_to_guest,
592 				       vmaddr >> PMD_SHIFT,
593 				       (void *)MAKE_VALID_GADDR(gaddr));
594 		if (!rc) {
595 			if (pmd_leaf(*pmd)) {
596 				*table = (pmd_val(*pmd) &
597 					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
598 					| _SEGMENT_ENTRY_GMAP_UC
599 					| _SEGMENT_ENTRY;
600 			} else
601 				*table = pmd_val(*pmd) &
602 					_SEGMENT_ENTRY_HARDWARE_BITS;
603 		}
604 	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
605 		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
606 		unprot = (u64)*table;
607 		unprot &= ~_SEGMENT_ENTRY_PROTECT;
608 		unprot |= _SEGMENT_ENTRY_GMAP_UC;
609 		gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
610 	}
611 	spin_unlock(&gmap->guest_table_lock);
612 	spin_unlock(ptl);
613 	radix_tree_preload_end();
614 	return rc;
615 }
616 EXPORT_SYMBOL(__gmap_link);
617 
618 /*
619  * this function is assumed to be called with mmap_lock held
620  */
621 void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
622 {
623 	unsigned long vmaddr;
624 
625 	mmap_assert_locked(gmap->mm);
626 
627 	/* Find the vm address for the guest address */
628 	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
629 						   gaddr >> PMD_SHIFT);
630 	if (vmaddr) {
631 		vmaddr |= gaddr & ~PMD_MASK;
632 		gmap_helper_zap_one_page(gmap->mm, vmaddr);
633 	}
634 }
635 EXPORT_SYMBOL_GPL(__gmap_zap);
636 
637 static LIST_HEAD(gmap_notifier_list);
638 static DEFINE_SPINLOCK(gmap_notifier_lock);
639 
640 /**
641  * gmap_register_pte_notifier - register a pte invalidation callback
642  * @nb: pointer to the gmap notifier block
643  */
644 void gmap_register_pte_notifier(struct gmap_notifier *nb)
645 {
646 	spin_lock(&gmap_notifier_lock);
647 	list_add_rcu(&nb->list, &gmap_notifier_list);
648 	spin_unlock(&gmap_notifier_lock);
649 }
650 EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
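
/*
 * Illustrative sketch (my_notifier and my_invalidation_handler are
 * placeholders): fill in gmap_notifier.notifier_call to be informed when
 * protected guest pages are invalidated:
 *
 *	static struct gmap_notifier my_notifier = {
 *		.notifier_call = my_invalidation_handler,
 *	};
 *
 *	gmap_register_pte_notifier(&my_notifier);
 */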
651 
652 /**
653  * gmap_unregister_pte_notifier - remove a pte invalidation callback
654  * @nb: pointer to the gmap notifier block
655  */
656 void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
657 {
658 	spin_lock(&gmap_notifier_lock);
659 	list_del_rcu(&nb->list);
660 	spin_unlock(&gmap_notifier_lock);
661 	synchronize_rcu();
662 }
663 EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
664 
665 /**
666  * gmap_call_notifier - call all registered invalidation callbacks
667  * @gmap: pointer to guest mapping meta data structure
668  * @start: start virtual address in the guest address space
669  * @end: end virtual address in the guest address space
670  */
671 static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
672 			       unsigned long end)
673 {
674 	struct gmap_notifier *nb;
675 
676 	list_for_each_entry(nb, &gmap_notifier_list, list)
677 		nb->notifier_call(gmap, start, end);
678 }
679 
680 /**
681  * gmap_table_walk - walk the gmap page tables
682  * @gmap: pointer to guest mapping meta data structure
683  * @gaddr: virtual address in the guest address space
684  * @level: page table level to stop at
685  *
686  * Returns a table entry pointer for the given guest address and @level
 * @level=0 : returns a pointer to a page table entry (or NULL)
688  * @level=1 : returns a pointer to a segment table entry (or NULL)
689  * @level=2 : returns a pointer to a region-3 table entry (or NULL)
690  * @level=3 : returns a pointer to a region-2 table entry (or NULL)
691  * @level=4 : returns a pointer to a region-1 table entry (or NULL)
692  *
693  * Returns NULL if the gmap page tables could not be walked to the
694  * requested level.
695  *
696  * Note: Can also be called for shadow gmaps.
697  */
698 unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
699 {
700 	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
701 	unsigned long *table = gmap->table;
702 
703 	if (gmap_is_shadow(gmap) && gmap->removed)
704 		return NULL;
705 
706 	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
707 		return NULL;
708 
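	/*
	 * asce_type >> 2 is 0/1/2/3 for segment/region-3/region-2/region-1
	 * table types; each table level adds 11 address bits, so reject
	 * addresses beyond 2^(31 + 11 * (asce_type >> 2)) that the
	 * top-level table cannot map.
	 */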
709 	if (asce_type != _ASCE_TYPE_REGION1 &&
710 	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
711 		return NULL;
712 
713 	switch (asce_type) {
714 	case _ASCE_TYPE_REGION1:
715 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
716 		if (level == 4)
717 			break;
718 		if (*table & _REGION_ENTRY_INVALID)
719 			return NULL;
720 		table = __va(*table & _REGION_ENTRY_ORIGIN);
721 		fallthrough;
722 	case _ASCE_TYPE_REGION2:
723 		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
724 		if (level == 3)
725 			break;
726 		if (*table & _REGION_ENTRY_INVALID)
727 			return NULL;
728 		table = __va(*table & _REGION_ENTRY_ORIGIN);
729 		fallthrough;
730 	case _ASCE_TYPE_REGION3:
731 		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
732 		if (level == 2)
733 			break;
734 		if (*table & _REGION_ENTRY_INVALID)
735 			return NULL;
736 		table = __va(*table & _REGION_ENTRY_ORIGIN);
737 		fallthrough;
738 	case _ASCE_TYPE_SEGMENT:
739 		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
740 		if (level == 1)
741 			break;
742 		if (*table & _REGION_ENTRY_INVALID)
743 			return NULL;
744 		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
745 		table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
746 	}
747 	return table;
748 }
749 EXPORT_SYMBOL(gmap_table_walk);
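
/*
 * Example (illustrative): for a 4-level (region-1 type) gmap,
 * gmap_table_walk(gmap, gaddr, 1) descends region-1 -> region-2 -> region-3
 * and returns a pointer to the segment table entry for gaddr, or NULL if an
 * intermediate table entry is still invalid.
 */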
750 
751 /**
752  * gmap_pte_op_walk - walk the gmap page table, get the page table lock
753  *		      and return the pte pointer
754  * @gmap: pointer to guest mapping meta data structure
755  * @gaddr: virtual address in the guest address space
756  * @ptl: pointer to the spinlock pointer
757  *
758  * Returns a pointer to the locked pte for a guest address, or NULL
759  */
760 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
761 			       spinlock_t **ptl)
762 {
763 	unsigned long *table;
764 
765 	BUG_ON(gmap_is_shadow(gmap));
766 	/* Walk the gmap page table, lock and get pte pointer */
767 	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
768 	if (!table || *table & _SEGMENT_ENTRY_INVALID)
769 		return NULL;
770 	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
771 }
772 
773 /**
774  * gmap_pte_op_fixup - force a page in and connect the gmap page table
775  * @gmap: pointer to guest mapping meta data structure
776  * @gaddr: virtual address in the guest address space
777  * @vmaddr: address in the host process address space
778  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
779  *
780  * Returns 0 if the caller can retry __gmap_translate (might fail again),
781  * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
782  * up or connecting the gmap page table.
783  */
784 static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
785 			     unsigned long vmaddr, int prot)
786 {
787 	struct mm_struct *mm = gmap->mm;
788 	unsigned int fault_flags;
789 	bool unlocked = false;
790 
791 	BUG_ON(gmap_is_shadow(gmap));
792 	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
793 	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
794 		return -EFAULT;
795 	if (unlocked)
796 		/* lost mmap_lock, caller has to retry __gmap_translate */
797 		return 0;
798 	/* Connect the page tables */
799 	return __gmap_link(gmap, gaddr, vmaddr);
800 }
801 
802 /**
803  * gmap_pte_op_end - release the page table lock
804  * @ptep: pointer to the locked pte
805  * @ptl: pointer to the page table spinlock
806  */
807 static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
808 {
809 	pte_unmap_unlock(ptep, ptl);
810 }
811 
812 /**
813  * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
814  *		      and return the pmd pointer
815  * @gmap: pointer to guest mapping meta data structure
816  * @gaddr: virtual address in the guest address space
817  *
818  * Returns a pointer to the pmd for a guest address, or NULL
819  */
820 static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
821 {
822 	pmd_t *pmdp;
823 
824 	BUG_ON(gmap_is_shadow(gmap));
825 	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
826 	if (!pmdp)
827 		return NULL;
828 
829 	/* without huge pages, there is no need to take the table lock */
830 	if (!gmap->mm->context.allow_gmap_hpage_1m)
831 		return pmd_none(*pmdp) ? NULL : pmdp;
832 
833 	spin_lock(&gmap->guest_table_lock);
834 	if (pmd_none(*pmdp)) {
835 		spin_unlock(&gmap->guest_table_lock);
836 		return NULL;
837 	}
838 
839 	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
840 	if (!pmd_leaf(*pmdp))
841 		spin_unlock(&gmap->guest_table_lock);
842 	return pmdp;
843 }
844 
845 /**
846  * gmap_pmd_op_end - release the guest_table_lock if needed
847  * @gmap: pointer to the guest mapping meta data structure
848  * @pmdp: pointer to the pmd
849  */
850 static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
851 {
852 	if (pmd_leaf(*pmdp))
853 		spin_unlock(&gmap->guest_table_lock);
854 }
855 
856 /*
 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @pmdp: pointer to the pmd to be protected
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set
861  *
862  * Returns:
863  * 0 if successfully protected
864  * -EAGAIN if a fixup is needed
865  * -EINVAL if unsupported notifier bits have been specified
866  *
867  * Expected to be called with sg->mm->mmap_lock in read and
868  * guest_table_lock held.
869  */
870 static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
871 			    pmd_t *pmdp, int prot, unsigned long bits)
872 {
873 	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
874 	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
875 	pmd_t new = *pmdp;
876 
877 	/* Fixup needed */
878 	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
879 		return -EAGAIN;
880 
881 	if (prot == PROT_NONE && !pmd_i) {
882 		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
883 		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
884 	}
885 
886 	if (prot == PROT_READ && !pmd_p) {
887 		new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
888 		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
889 		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
890 	}
891 
892 	if (bits & GMAP_NOTIFY_MPROT)
893 		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
894 
895 	/* Shadow GMAP protection needs split PMDs */
896 	if (bits & GMAP_NOTIFY_SHADOW)
897 		return -EINVAL;
898 
899 	return 0;
900 }
901 
902 /*
903  * gmap_protect_pte - remove access rights to memory and set pgste bits
904  * @gmap: pointer to guest mapping meta data structure
905  * @gaddr: virtual address in the guest address space
906  * @pmdp: pointer to the pmd associated with the pte
907  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
908  * @bits: notification bits to set
909  *
910  * Returns 0 if successfully protected, -ENOMEM if out of memory and
911  * -EAGAIN if a fixup is needed.
912  *
913  * Expected to be called with sg->mm->mmap_lock in read
914  */
915 static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
916 			    pmd_t *pmdp, int prot, unsigned long bits)
917 {
918 	int rc;
919 	pte_t *ptep;
920 	spinlock_t *ptl;
921 	unsigned long pbits = 0;
922 
923 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
924 		return -EAGAIN;
925 
926 	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
927 	if (!ptep)
928 		return -ENOMEM;
929 
930 	pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
931 	pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
932 	/* Protect and unlock. */
933 	rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
934 	gmap_pte_op_end(ptep, ptl);
935 	return rc;
936 }
937 
938 /*
 * gmap_protect_one - remove access rights to a single guest page and set
 *                    notification bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
943  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
944  * @bits: pgste notification bits to set
945  *
946  * Returns:
947  *   PAGE_SIZE if a small page was successfully protected;
948  *   HPAGE_SIZE if a large page was successfully protected;
949  *   -ENOMEM if out of memory;
950  *   -EFAULT if gaddr is invalid (or mapping for shadows is missing);
951  *   -EAGAIN if the guest mapping is missing and should be fixed by the caller.
952  *
953  * Context: Called with sg->mm->mmap_lock in read.
954  */
955 int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
956 {
957 	pmd_t *pmdp;
958 	int rc = 0;
959 
960 	BUG_ON(gmap_is_shadow(gmap));
961 
962 	pmdp = gmap_pmd_op_walk(gmap, gaddr);
963 	if (!pmdp)
964 		return -EAGAIN;
965 
966 	if (!pmd_leaf(*pmdp)) {
967 		rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
968 		if (!rc)
969 			rc = PAGE_SIZE;
970 	} else {
971 		rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
972 		if (!rc)
973 			rc = HPAGE_SIZE;
974 	}
975 	gmap_pmd_op_end(gmap, pmdp);
976 
977 	return rc;
978 }
979 EXPORT_SYMBOL_GPL(gmap_protect_one);
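
/*
 * Callers are expected to handle -EAGAIN by faulting in and linking the
 * host mapping, then retrying; gmap_read_table() below shows the same
 * fixup-and-retry pattern for pte walks.
 */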
980 
981 /**
982  * gmap_read_table - get an unsigned long value from a guest page table using
983  *                   absolute addressing, without marking the page referenced.
984  * @gmap: pointer to guest mapping meta data structure
985  * @gaddr: virtual address in the guest address space
986  * @val: pointer to the unsigned long value to return
987  *
988  * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
989  * if reading using the virtual address failed. -EINVAL if called on a gmap
990  * shadow.
991  *
992  * Called with gmap->mm->mmap_lock in read.
993  */
994 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
995 {
996 	unsigned long address, vmaddr;
997 	spinlock_t *ptl;
998 	pte_t *ptep, pte;
999 	int rc;
1000 
1001 	if (gmap_is_shadow(gmap))
1002 		return -EINVAL;
1003 
1004 	while (1) {
1005 		rc = -EAGAIN;
1006 		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
1007 		if (ptep) {
1008 			pte = *ptep;
1009 			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1010 				address = pte_val(pte) & PAGE_MASK;
1011 				address += gaddr & ~PAGE_MASK;
1012 				*val = *(unsigned long *)__va(address);
1013 				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
1014 				/* Do *NOT* clear the _PAGE_INVALID bit! */
1015 				rc = 0;
1016 			}
1017 			gmap_pte_op_end(ptep, ptl);
1018 		}
1019 		if (!rc)
1020 			break;
1021 		vmaddr = __gmap_translate(gmap, gaddr);
1022 		if (IS_ERR_VALUE(vmaddr)) {
1023 			rc = vmaddr;
1024 			break;
1025 		}
1026 		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1027 		if (rc)
1028 			break;
1029 	}
1030 	return rc;
1031 }
1032 EXPORT_SYMBOL_GPL(gmap_read_table);
1033 
1034 /**
1035  * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1036  * @sg: pointer to the shadow guest address space structure
1037  * @vmaddr: vm address associated with the rmap
1038  * @rmap: pointer to the rmap structure
1039  *
1040  * Called with the sg->guest_table_lock
1041  */
1042 static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1043 				    struct gmap_rmap *rmap)
1044 {
1045 	struct gmap_rmap *temp;
1046 	void __rcu **slot;
1047 
1048 	BUG_ON(!gmap_is_shadow(sg));
1049 	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1050 	if (slot) {
1051 		rmap->next = radix_tree_deref_slot_protected(slot,
1052 							&sg->guest_table_lock);
1053 		for (temp = rmap->next; temp; temp = temp->next) {
1054 			if (temp->raddr == rmap->raddr) {
1055 				kfree(rmap);
1056 				return;
1057 			}
1058 		}
1059 		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1060 	} else {
1061 		rmap->next = NULL;
1062 		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1063 				  rmap);
1064 	}
1065 }
1066 
1067 /**
1068  * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1069  * @sg: pointer to the shadow guest address space structure
1070  * @raddr: rmap address in the shadow gmap
1071  * @paddr: address in the parent guest address space
1072  * @len: length of the memory area to protect
1073  *
1074  * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1075  * if out of memory and -EFAULT if paddr is invalid.
1076  */
1077 static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1078 			     unsigned long paddr, unsigned long len)
1079 {
1080 	struct gmap *parent;
1081 	struct gmap_rmap *rmap;
1082 	unsigned long vmaddr;
1083 	spinlock_t *ptl;
1084 	pte_t *ptep;
1085 	int rc;
1086 
1087 	BUG_ON(!gmap_is_shadow(sg));
1088 	parent = sg->parent;
1089 	while (len) {
1090 		vmaddr = __gmap_translate(parent, paddr);
1091 		if (IS_ERR_VALUE(vmaddr))
1092 			return vmaddr;
1093 		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1094 		if (!rmap)
1095 			return -ENOMEM;
1096 		rmap->raddr = raddr;
1097 		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1098 		if (rc) {
1099 			kfree(rmap);
1100 			return rc;
1101 		}
1102 		rc = -EAGAIN;
1103 		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1104 		if (ptep) {
1105 			spin_lock(&sg->guest_table_lock);
1106 			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1107 					     PGSTE_VSIE_BIT);
1108 			if (!rc)
1109 				gmap_insert_rmap(sg, vmaddr, rmap);
1110 			spin_unlock(&sg->guest_table_lock);
1111 			gmap_pte_op_end(ptep, ptl);
1112 		}
1113 		radix_tree_preload_end();
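		/*
		 * On failure the rmap is discarded and the parent page is
		 * faulted in read-only before this page is retried; paddr
		 * only advances once the protection has succeeded.
		 */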
1114 		if (rc) {
1115 			kfree(rmap);
1116 			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
1117 			if (rc)
1118 				return rc;
1119 			continue;
1120 		}
1121 		paddr += PAGE_SIZE;
1122 		len -= PAGE_SIZE;
1123 	}
1124 	return 0;
1125 }
1126 
1127 #define _SHADOW_RMAP_MASK	0x7
1128 #define _SHADOW_RMAP_REGION1	0x5
1129 #define _SHADOW_RMAP_REGION2	0x4
1130 #define _SHADOW_RMAP_REGION3	0x3
1131 #define _SHADOW_RMAP_SEGMENT	0x2
1132 #define _SHADOW_RMAP_PGTABLE	0x1
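
/*
 * The low bits of an rmap address record which level of the shadow table
 * hierarchy was protected for it (region-1 ... page table), so that the
 * matching gmap_unshadow_*() variant can be picked once the parent page is
 * written to.
 */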
1133 
1134 /**
1135  * gmap_idte_one - invalidate a single region or segment table entry
1136  * @asce: region or segment table *origin* + table-type bits
1137  * @vaddr: virtual address to identify the table entry to flush
1138  *
1139  * The invalid bit of a single region or segment table entry is set
1140  * and the associated TLB entries depending on the entry are flushed.
1141  * The table-type of the @asce identifies the portion of the @vaddr
1142  * that is used as the invalidation index.
1143  */
1144 static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1145 {
1146 	asm volatile(
1147 		"	idte	%0,0,%1"
1148 		: : "a" (asce), "a" (vaddr) : "cc", "memory");
1149 }
1150 
1151 /**
1152  * gmap_unshadow_page - remove a page from a shadow page table
1153  * @sg: pointer to the shadow guest address space structure
1154  * @raddr: rmap address in the shadow guest address space
1155  *
1156  * Called with the sg->guest_table_lock
1157  */
1158 static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1159 {
1160 	unsigned long *table;
1161 
1162 	BUG_ON(!gmap_is_shadow(sg));
1163 	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1164 	if (!table || *table & _PAGE_INVALID)
1165 		return;
1166 	gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
1167 	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1168 }
1169 
1170 /**
1171  * __gmap_unshadow_pgt - remove all entries from a shadow page table
1172  * @sg: pointer to the shadow guest address space structure
1173  * @raddr: rmap address in the shadow guest address space
1174  * @pgt: pointer to the start of a shadow page table
1175  *
1176  * Called with the sg->guest_table_lock
1177  */
1178 static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1179 				unsigned long *pgt)
1180 {
1181 	int i;
1182 
1183 	BUG_ON(!gmap_is_shadow(sg));
1184 	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
1185 		pgt[i] = _PAGE_INVALID;
1186 }
1187 
1188 /**
1189  * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1190  * @sg: pointer to the shadow guest address space structure
1191  * @raddr: address in the shadow guest address space
1192  *
1193  * Called with the sg->guest_table_lock
1194  */
1195 static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1196 {
1197 	unsigned long *ste;
1198 	phys_addr_t sto, pgt;
1199 	struct ptdesc *ptdesc;
1200 
1201 	BUG_ON(!gmap_is_shadow(sg));
1202 	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1203 	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1204 		return;
1205 	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1206 	sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1207 	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1208 	pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
1209 	*ste = _SEGMENT_ENTRY_EMPTY;
1210 	__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1211 	/* Free page table */
1212 	ptdesc = page_ptdesc(phys_to_page(pgt));
1213 	page_table_free_pgste(ptdesc);
1214 }
1215 
1216 /**
1217  * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1218  * @sg: pointer to the shadow guest address space structure
1219  * @raddr: rmap address in the shadow guest address space
1220  * @sgt: pointer to the start of a shadow segment table
1221  *
1222  * Called with the sg->guest_table_lock
1223  */
1224 static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1225 				unsigned long *sgt)
1226 {
1227 	struct ptdesc *ptdesc;
1228 	phys_addr_t pgt;
1229 	int i;
1230 
1231 	BUG_ON(!gmap_is_shadow(sg));
1232 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1233 		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1234 			continue;
1235 		pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
1236 		sgt[i] = _SEGMENT_ENTRY_EMPTY;
1237 		__gmap_unshadow_pgt(sg, raddr, __va(pgt));
1238 		/* Free page table */
1239 		ptdesc = page_ptdesc(phys_to_page(pgt));
1240 		page_table_free_pgste(ptdesc);
1241 	}
1242 }
1243 
1244 /**
1245  * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1246  * @sg: pointer to the shadow guest address space structure
1247  * @raddr: rmap address in the shadow guest address space
1248  *
1249  * Called with the shadow->guest_table_lock
1250  */
1251 static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1252 {
1253 	unsigned long r3o, *r3e;
1254 	phys_addr_t sgt;
1255 	struct page *page;
1256 
1257 	BUG_ON(!gmap_is_shadow(sg));
1258 	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1259 	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1260 		return;
1261 	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1262 	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1263 	gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
1264 	sgt = *r3e & _REGION_ENTRY_ORIGIN;
1265 	*r3e = _REGION3_ENTRY_EMPTY;
1266 	__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1267 	/* Free segment table */
1268 	page = phys_to_page(sgt);
1269 	__free_pages(page, CRST_ALLOC_ORDER);
1270 }
1271 
1272 /**
1273  * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1274  * @sg: pointer to the shadow guest address space structure
1275  * @raddr: address in the shadow guest address space
1276  * @r3t: pointer to the start of a shadow region-3 table
1277  *
1278  * Called with the sg->guest_table_lock
1279  */
1280 static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1281 				unsigned long *r3t)
1282 {
1283 	struct page *page;
1284 	phys_addr_t sgt;
1285 	int i;
1286 
1287 	BUG_ON(!gmap_is_shadow(sg));
1288 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1289 		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1290 			continue;
1291 		sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
1292 		r3t[i] = _REGION3_ENTRY_EMPTY;
1293 		__gmap_unshadow_sgt(sg, raddr, __va(sgt));
1294 		/* Free segment table */
1295 		page = phys_to_page(sgt);
1296 		__free_pages(page, CRST_ALLOC_ORDER);
1297 	}
1298 }
1299 
1300 /**
1301  * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1302  * @sg: pointer to the shadow guest address space structure
1303  * @raddr: rmap address in the shadow guest address space
1304  *
1305  * Called with the sg->guest_table_lock
1306  */
1307 static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1308 {
1309 	unsigned long r2o, *r2e;
1310 	phys_addr_t r3t;
1311 	struct page *page;
1312 
1313 	BUG_ON(!gmap_is_shadow(sg));
1314 	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1315 	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1316 		return;
1317 	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1318 	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1319 	gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
1320 	r3t = *r2e & _REGION_ENTRY_ORIGIN;
1321 	*r2e = _REGION2_ENTRY_EMPTY;
1322 	__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1323 	/* Free region 3 table */
1324 	page = phys_to_page(r3t);
1325 	__free_pages(page, CRST_ALLOC_ORDER);
1326 }
1327 
1328 /**
1329  * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1330  * @sg: pointer to the shadow guest address space structure
1331  * @raddr: rmap address in the shadow guest address space
1332  * @r2t: pointer to the start of a shadow region-2 table
1333  *
1334  * Called with the sg->guest_table_lock
1335  */
1336 static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1337 				unsigned long *r2t)
1338 {
1339 	phys_addr_t r3t;
1340 	struct page *page;
1341 	int i;
1342 
1343 	BUG_ON(!gmap_is_shadow(sg));
1344 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1345 		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1346 			continue;
1347 		r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
1348 		r2t[i] = _REGION2_ENTRY_EMPTY;
1349 		__gmap_unshadow_r3t(sg, raddr, __va(r3t));
1350 		/* Free region 3 table */
1351 		page = phys_to_page(r3t);
1352 		__free_pages(page, CRST_ALLOC_ORDER);
1353 	}
1354 }
1355 
1356 /**
1357  * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1358  * @sg: pointer to the shadow guest address space structure
1359  * @raddr: rmap address in the shadow guest address space
1360  *
1361  * Called with the sg->guest_table_lock
1362  */
1363 static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1364 {
1365 	unsigned long r1o, *r1e;
1366 	struct page *page;
1367 	phys_addr_t r2t;
1368 
1369 	BUG_ON(!gmap_is_shadow(sg));
1370 	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1371 	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1372 		return;
1373 	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1374 	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1375 	gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
1376 	r2t = *r1e & _REGION_ENTRY_ORIGIN;
1377 	*r1e = _REGION1_ENTRY_EMPTY;
1378 	__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1379 	/* Free region 2 table */
1380 	page = phys_to_page(r2t);
1381 	__free_pages(page, CRST_ALLOC_ORDER);
1382 }
1383 
1384 /**
1385  * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1386  * @sg: pointer to the shadow guest address space structure
1387  * @raddr: rmap address in the shadow guest address space
1388  * @r1t: pointer to the start of a shadow region-1 table
1389  *
1390  * Called with the shadow->guest_table_lock
1391  */
1392 static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1393 				unsigned long *r1t)
1394 {
1395 	unsigned long asce;
1396 	struct page *page;
1397 	phys_addr_t r2t;
1398 	int i;
1399 
1400 	BUG_ON(!gmap_is_shadow(sg));
1401 	asce = __pa(r1t) | _ASCE_TYPE_REGION1;
1402 	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1403 		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1404 			continue;
1405 		r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
1406 		__gmap_unshadow_r2t(sg, raddr, __va(r2t));
1407 		/* Clear entry and flush translation r1t -> r2t */
1408 		gmap_idte_one(asce, raddr);
1409 		r1t[i] = _REGION1_ENTRY_EMPTY;
1410 		/* Free region 2 table */
1411 		page = phys_to_page(r2t);
1412 		__free_pages(page, CRST_ALLOC_ORDER);
1413 	}
1414 }
1415 
1416 /**
1417  * gmap_unshadow - remove a shadow page table completely
1418  * @sg: pointer to the shadow guest address space structure
1419  *
1420  * Called with sg->guest_table_lock
1421  */
1422 void gmap_unshadow(struct gmap *sg)
1423 {
1424 	unsigned long *table;
1425 
1426 	BUG_ON(!gmap_is_shadow(sg));
1427 	if (sg->removed)
1428 		return;
1429 	sg->removed = 1;
1430 	gmap_call_notifier(sg, 0, -1UL);
1431 	gmap_flush_tlb(sg);
1432 	table = __va(sg->asce & _ASCE_ORIGIN);
1433 	switch (sg->asce & _ASCE_TYPE_MASK) {
1434 	case _ASCE_TYPE_REGION1:
1435 		__gmap_unshadow_r1t(sg, 0, table);
1436 		break;
1437 	case _ASCE_TYPE_REGION2:
1438 		__gmap_unshadow_r2t(sg, 0, table);
1439 		break;
1440 	case _ASCE_TYPE_REGION3:
1441 		__gmap_unshadow_r3t(sg, 0, table);
1442 		break;
1443 	case _ASCE_TYPE_SEGMENT:
1444 		__gmap_unshadow_sgt(sg, 0, table);
1445 		break;
1446 	}
1447 }
1448 EXPORT_SYMBOL(gmap_unshadow);
1449 
1450 /**
1451  * gmap_shadow_r2t - create an empty shadow region 2 table
1452  * @sg: pointer to the shadow guest address space structure
1453  * @saddr: faulting address in the shadow gmap
1454  * @r2t: parent gmap address of the region 2 table to get shadowed
1455  * @fake: r2t references contiguous guest memory block, not a r2t
1456  *
1457  * The r2t parameter specifies the address of the source table. The
1458  * four pages of the source table are made read-only in the parent gmap
1459  * address space. A write to the source table area @r2t will automatically
1460  * remove the shadow r2 table and all of its descendants.
1461  *
1462  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1463  * shadow table structure is incomplete, -ENOMEM if out of memory and
1464  * -EFAULT if an address in the parent gmap could not be resolved.
1465  *
1466  * Called with sg->mm->mmap_lock in read.
1467  */
1468 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1469 		    int fake)
1470 {
1471 	unsigned long raddr, origin, offset, len;
1472 	unsigned long *table;
1473 	phys_addr_t s_r2t;
1474 	struct page *page;
1475 	int rc;
1476 
1477 	BUG_ON(!gmap_is_shadow(sg));
1478 	/* Allocate a shadow region second table */
1479 	page = gmap_alloc_crst();
1480 	if (!page)
1481 		return -ENOMEM;
1482 	s_r2t = page_to_phys(page);
1483 	/* Install shadow region second table */
1484 	spin_lock(&sg->guest_table_lock);
1485 	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1486 	if (!table) {
1487 		rc = -EAGAIN;		/* Race with unshadow */
1488 		goto out_free;
1489 	}
1490 	if (!(*table & _REGION_ENTRY_INVALID)) {
1491 		rc = 0;			/* Already established */
1492 		goto out_free;
1493 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1494 		rc = -EAGAIN;		/* Race with shadow */
1495 		goto out_free;
1496 	}
1497 	crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
1498 	/* mark as invalid as long as the parent table is not protected */
1499 	*table = s_r2t | _REGION_ENTRY_LENGTH |
1500 		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1501 	if (sg->edat_level >= 1)
1502 		*table |= (r2t & _REGION_ENTRY_PROTECT);
1503 	if (fake) {
1504 		/* nothing to protect for fake tables */
1505 		*table &= ~_REGION_ENTRY_INVALID;
1506 		spin_unlock(&sg->guest_table_lock);
1507 		return 0;
1508 	}
1509 	spin_unlock(&sg->guest_table_lock);
1510 	/* Make r2t read-only in parent gmap page table */
1511 	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1512 	origin = r2t & _REGION_ENTRY_ORIGIN;
1513 	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1514 	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1515 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1516 	spin_lock(&sg->guest_table_lock);
1517 	if (!rc) {
1518 		table = gmap_table_walk(sg, saddr, 4);
1519 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
1520 			rc = -EAGAIN;		/* Race with unshadow */
1521 		else
1522 			*table &= ~_REGION_ENTRY_INVALID;
1523 	} else {
1524 		gmap_unshadow_r2t(sg, raddr);
1525 	}
1526 	spin_unlock(&sg->guest_table_lock);
1527 	return rc;
1528 out_free:
1529 	spin_unlock(&sg->guest_table_lock);
1530 	__free_pages(page, CRST_ALLOC_ORDER);
1531 	return rc;
1532 }
1533 EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
1534 
1535 /**
1536  * gmap_shadow_r3t - create a shadow region 3 table
1537  * @sg: pointer to the shadow guest address space structure
1538  * @saddr: faulting address in the shadow gmap
1539  * @r3t: parent gmap address of the region 3 table to get shadowed
1540  * @fake: r3t references contiguous guest memory block, not a r3t
1541  *
1542  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1543  * shadow table structure is incomplete, -ENOMEM if out of memory and
1544  * -EFAULT if an address in the parent gmap could not be resolved.
1545  *
1546  * Called with sg->mm->mmap_lock in read.
1547  */
1548 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1549 		    int fake)
1550 {
1551 	unsigned long raddr, origin, offset, len;
1552 	unsigned long *table;
1553 	phys_addr_t s_r3t;
1554 	struct page *page;
1555 	int rc;
1556 
1557 	BUG_ON(!gmap_is_shadow(sg));
	/* Allocate a shadow region third table */
1559 	page = gmap_alloc_crst();
1560 	if (!page)
1561 		return -ENOMEM;
1562 	s_r3t = page_to_phys(page);
	/* Install shadow region third table */
1564 	spin_lock(&sg->guest_table_lock);
1565 	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1566 	if (!table) {
1567 		rc = -EAGAIN;		/* Race with unshadow */
1568 		goto out_free;
1569 	}
1570 	if (!(*table & _REGION_ENTRY_INVALID)) {
1571 		rc = 0;			/* Already established */
1572 		goto out_free;
1573 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1574 		rc = -EAGAIN;		/* Race with shadow */
1575 		goto out_free;
1576 	}
1577 	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
1578 	/* mark as invalid as long as the parent table is not protected */
1579 	*table = s_r3t | _REGION_ENTRY_LENGTH |
1580 		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1581 	if (sg->edat_level >= 1)
1582 		*table |= (r3t & _REGION_ENTRY_PROTECT);
1583 	if (fake) {
1584 		/* nothing to protect for fake tables */
1585 		*table &= ~_REGION_ENTRY_INVALID;
1586 		spin_unlock(&sg->guest_table_lock);
1587 		return 0;
1588 	}
1589 	spin_unlock(&sg->guest_table_lock);
1590 	/* Make r3t read-only in parent gmap page table */
1591 	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1592 	origin = r3t & _REGION_ENTRY_ORIGIN;
1593 	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1594 	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1595 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1596 	spin_lock(&sg->guest_table_lock);
1597 	if (!rc) {
1598 		table = gmap_table_walk(sg, saddr, 3);
1599 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
1600 			rc = -EAGAIN;		/* Race with unshadow */
1601 		else
1602 			*table &= ~_REGION_ENTRY_INVALID;
1603 	} else {
1604 		gmap_unshadow_r3t(sg, raddr);
1605 	}
1606 	spin_unlock(&sg->guest_table_lock);
1607 	return rc;
1608 out_free:
1609 	spin_unlock(&sg->guest_table_lock);
1610 	__free_pages(page, CRST_ALLOC_ORDER);
1611 	return rc;
1612 }
1613 EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1614 
1615 /**
1616  * gmap_shadow_sgt - create a shadow segment table
1617  * @sg: pointer to the shadow guest address space structure
1618  * @saddr: faulting address in the shadow gmap
1619  * @sgt: parent gmap address of the segment table to get shadowed
1620  * @fake: sgt references contiguous guest memory block, not a sgt
1621  *
1622  * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1623  * shadow table structure is incomplete, -ENOMEM if out of memory and
1624  * -EFAULT if an address in the parent gmap could not be resolved.
1625  *
1626  * Called with sg->mm->mmap_lock in read.
1627  */
1628 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1629 		    int fake)
1630 {
1631 	unsigned long raddr, origin, offset, len;
1632 	unsigned long *table;
1633 	phys_addr_t s_sgt;
1634 	struct page *page;
1635 	int rc;
1636 
1637 	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1638 	/* Allocate a shadow segment table */
1639 	page = gmap_alloc_crst();
1640 	if (!page)
1641 		return -ENOMEM;
1642 	s_sgt = page_to_phys(page);
	/* Install shadow segment table */
1644 	spin_lock(&sg->guest_table_lock);
1645 	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1646 	if (!table) {
1647 		rc = -EAGAIN;		/* Race with unshadow */
1648 		goto out_free;
1649 	}
1650 	if (!(*table & _REGION_ENTRY_INVALID)) {
1651 		rc = 0;			/* Already established */
1652 		goto out_free;
1653 	} else if (*table & _REGION_ENTRY_ORIGIN) {
1654 		rc = -EAGAIN;		/* Race with shadow */
1655 		goto out_free;
1656 	}
1657 	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
1658 	/* mark as invalid as long as the parent table is not protected */
1659 	*table = s_sgt | _REGION_ENTRY_LENGTH |
1660 		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1661 	if (sg->edat_level >= 1)
1662 		*table |= sgt & _REGION_ENTRY_PROTECT;
1663 	if (fake) {
1664 		/* nothing to protect for fake tables */
1665 		*table &= ~_REGION_ENTRY_INVALID;
1666 		spin_unlock(&sg->guest_table_lock);
1667 		return 0;
1668 	}
1669 	spin_unlock(&sg->guest_table_lock);
1670 	/* Make sgt read-only in parent gmap page table */
1671 	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1672 	origin = sgt & _REGION_ENTRY_ORIGIN;
1673 	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1674 	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1675 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1676 	spin_lock(&sg->guest_table_lock);
1677 	if (!rc) {
1678 		table = gmap_table_walk(sg, saddr, 2);
1679 		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
1680 			rc = -EAGAIN;		/* Race with unshadow */
1681 		else
1682 			*table &= ~_REGION_ENTRY_INVALID;
1683 	} else {
1684 		gmap_unshadow_sgt(sg, raddr);
1685 	}
1686 	spin_unlock(&sg->guest_table_lock);
1687 	return rc;
1688 out_free:
1689 	spin_unlock(&sg->guest_table_lock);
1690 	__free_pages(page, CRST_ALLOC_ORDER);
1691 	return rc;
1692 }
1693 EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1694 
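/*
 * Store the parent page table address (and the fake flag) in the software
 * (ST2) bits of the first four PGSTEs of the shadow page table. The PGSTEs
 * start right after the 256 page table entries in the same 4K page, and
 * each ST2 field takes 16 bits of the 64-bit value, so the address can be
 * retrieved again when the shadow page table is looked up later.
 */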
1695 static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
1696 {
1697 	unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
1698 
1699 	pgstes += _PAGE_ENTRIES;
1700 
1701 	pgstes[0] &= ~PGSTE_ST2_MASK;
1702 	pgstes[1] &= ~PGSTE_ST2_MASK;
1703 	pgstes[2] &= ~PGSTE_ST2_MASK;
1704 	pgstes[3] &= ~PGSTE_ST2_MASK;
1705 
1706 	pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
1707 	pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
1708 	pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
1709 	pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
1710 }
1711 
1712 /**
1713  * gmap_shadow_pgt - instantiate a shadow page table
1714  * @sg: pointer to the shadow guest address space structure
1715  * @saddr: faulting address in the shadow gmap
1716  * @pgt: parent gmap address of the page table to get shadowed
1717  * @fake: pgt references contiguous guest memory block, not a pgtable
1718  *
1719  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
 * shadow table structure is incomplete, -ENOMEM if out of memory and
 * -EFAULT if an address in the parent gmap could not be resolved.
 *
 * Called with sg->mm->mmap_lock in read.
1724  */
1725 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
1726 		    int fake)
1727 {
1728 	unsigned long raddr, origin;
1729 	unsigned long *table;
1730 	struct ptdesc *ptdesc;
1731 	phys_addr_t s_pgt;
1732 	int rc;
1733 
1734 	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
1735 	/* Allocate a shadow page table */
1736 	ptdesc = page_table_alloc_pgste(sg->mm);
1737 	if (!ptdesc)
1738 		return -ENOMEM;
1739 	origin = pgt & _SEGMENT_ENTRY_ORIGIN;
1740 	if (fake)
1741 		origin |= GMAP_SHADOW_FAKE_TABLE;
1742 	gmap_pgste_set_pgt_addr(ptdesc, origin);
1743 	s_pgt = page_to_phys(ptdesc_page(ptdesc));
1744 	/* Install shadow page table */
1745 	spin_lock(&sg->guest_table_lock);
1746 	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1747 	if (!table) {
1748 		rc = -EAGAIN;		/* Race with unshadow */
1749 		goto out_free;
1750 	}
1751 	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
1752 		rc = 0;			/* Already established */
1753 		goto out_free;
1754 	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
1755 		rc = -EAGAIN;		/* Race with shadow */
1756 		goto out_free;
1757 	}
1758 	/* mark as invalid as long as the parent table is not protected */
1759 	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
1760 		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
1761 	if (fake) {
1762 		/* nothing to protect for fake tables */
1763 		*table &= ~_SEGMENT_ENTRY_INVALID;
1764 		spin_unlock(&sg->guest_table_lock);
1765 		return 0;
1766 	}
1767 	spin_unlock(&sg->guest_table_lock);
1768 	/* Make pgt read-only in parent gmap page table (not the pgste) */
1769 	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
1770 	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
1771 	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
1772 	spin_lock(&sg->guest_table_lock);
1773 	if (!rc) {
1774 		table = gmap_table_walk(sg, saddr, 1);
1775 		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
1776 			rc = -EAGAIN;		/* Race with unshadow */
1777 		else
1778 			*table &= ~_SEGMENT_ENTRY_INVALID;
1779 	} else {
1780 		gmap_unshadow_pgt(sg, raddr);
1781 	}
1782 	spin_unlock(&sg->guest_table_lock);
1783 	return rc;
1784 out_free:
1785 	spin_unlock(&sg->guest_table_lock);
1786 	page_table_free_pgste(ptdesc);
1787 	return rc;
1789 }
1790 EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
1791 
1792 /**
1793  * gmap_shadow_page - create a shadow page mapping
1794  * @sg: pointer to the shadow guest address space structure
1795  * @saddr: faulting address in the shadow gmap
1796  * @pte: pte in parent gmap address space to get shadowed
1797  *
1798  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1799  * shadow table structure is incomplete, -ENOMEM if out of memory and
1800  * -EFAULT if an address in the parent gmap could not be resolved.
1801  *
1802  * Called with sg->mm->mmap_lock in read.
1803  */
1804 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
1805 {
1806 	struct gmap *parent;
1807 	struct gmap_rmap *rmap;
1808 	unsigned long vmaddr, paddr;
1809 	spinlock_t *ptl;
1810 	pte_t *sptep, *tptep;
1811 	int prot;
1812 	int rc;
1813 
1814 	BUG_ON(!gmap_is_shadow(sg));
1815 	parent = sg->parent;
1816 	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
1817 
1818 	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1819 	if (!rmap)
1820 		return -ENOMEM;
1821 	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
1822 
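	/*
	 * Try to shadow the parent pte: translate the parent guest address
	 * into a host address, map the parent and the shadow pte under their
	 * respective locks and copy the mapping. If the parent pte is not
	 * present or not protected as required, resolve that with
	 * gmap_pte_op_fixup() and retry.
	 */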
1823 	while (1) {
1824 		paddr = pte_val(pte) & PAGE_MASK;
1825 		vmaddr = __gmap_translate(parent, paddr);
1826 		if (IS_ERR_VALUE(vmaddr)) {
1827 			rc = vmaddr;
1828 			break;
1829 		}
1830 		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1831 		if (rc)
1832 			break;
1833 		rc = -EAGAIN;
1834 		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
1835 		if (sptep) {
1836 			spin_lock(&sg->guest_table_lock);
1837 			/* Get page table pointer */
1838 			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
1839 			if (!tptep) {
1840 				spin_unlock(&sg->guest_table_lock);
1841 				gmap_pte_op_end(sptep, ptl);
1842 				radix_tree_preload_end();
1843 				break;
1844 			}
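			/*
			 * ptep_shadow_pte() returns > 0 if a new shadow
			 * mapping was created, 0 if the pte was already
			 * shadowed, and a negative value if the parent pte
			 * cannot be used yet.
			 */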
1845 			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
1846 			if (rc > 0) {
1847 				/* Success and a new mapping */
1848 				gmap_insert_rmap(sg, vmaddr, rmap);
1849 				rmap = NULL;
1850 				rc = 0;
1851 			}
1852 			gmap_pte_op_end(sptep, ptl);
1853 			spin_unlock(&sg->guest_table_lock);
1854 		}
1855 		radix_tree_preload_end();
1856 		if (!rc)
1857 			break;
1858 		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
1859 		if (rc)
1860 			break;
1861 	}
1862 	kfree(rmap);
1863 	return rc;
1864 }
1865 EXPORT_SYMBOL_GPL(gmap_shadow_page);
1866 
1867 /*
1868  * gmap_shadow_notify - handle notifications for shadow gmap
1869  *
 * Called with sg->parent->shadow_lock held.
1871  */
1872 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
1873 			       unsigned long gaddr)
1874 {
1875 	struct gmap_rmap *rmap, *rnext, *head;
1876 	unsigned long start, end, bits, raddr;
1877 
1878 	BUG_ON(!gmap_is_shadow(sg));
1879 
1880 	spin_lock(&sg->guest_table_lock);
1881 	if (sg->removed) {
1882 		spin_unlock(&sg->guest_table_lock);
1883 		return;
1884 	}
1885 	/* Check for top level table */
1886 	start = sg->orig_asce & _ASCE_ORIGIN;
1887 	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
1888 	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
1889 	    gaddr < end) {
1890 		/* The complete shadow table has to go */
1891 		gmap_unshadow(sg);
1892 		spin_unlock(&sg->guest_table_lock);
1893 		list_del(&sg->list);
1894 		gmap_put(sg);
1895 		return;
1896 	}
	/* Remove all shadow tables that were created from this parent page */
1898 	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1899 	gmap_for_each_rmap_safe(rmap, rnext, head) {
1900 		bits = rmap->raddr & _SHADOW_RMAP_MASK;
1901 		raddr = rmap->raddr ^ bits;
1902 		switch (bits) {
1903 		case _SHADOW_RMAP_REGION1:
1904 			gmap_unshadow_r2t(sg, raddr);
1905 			break;
1906 		case _SHADOW_RMAP_REGION2:
1907 			gmap_unshadow_r3t(sg, raddr);
1908 			break;
1909 		case _SHADOW_RMAP_REGION3:
1910 			gmap_unshadow_sgt(sg, raddr);
1911 			break;
1912 		case _SHADOW_RMAP_SEGMENT:
1913 			gmap_unshadow_pgt(sg, raddr);
1914 			break;
1915 		case _SHADOW_RMAP_PGTABLE:
1916 			gmap_unshadow_page(sg, raddr);
1917 			break;
1918 		}
1919 		kfree(rmap);
1920 	}
1921 	spin_unlock(&sg->guest_table_lock);
1922 }
1923 
1924 /**
1925  * ptep_notify - call all invalidation callbacks for a specific pte.
1926  * @mm: pointer to the process mm_struct
1927  * @vmaddr: virtual address in the process address space
1928  * @pte: pointer to the page table entry
1929  * @bits: bits from the pgste that caused the notify call
1930  *
1931  * This function is assumed to be called with the page table lock held
1932  * for the pte to notify.
1933  */
1934 void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
1935 		 pte_t *pte, unsigned long bits)
1936 {
1937 	unsigned long offset, gaddr = 0;
1938 	struct gmap *gmap, *sg, *next;
1939 
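	/*
	 * Convert the byte offset of the pte within its 256-entry page table
	 * into the offset of the corresponding page within the 1M segment
	 * that the page table maps.
	 */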
1940 	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
1941 	offset = offset * (PAGE_SIZE / sizeof(pte_t));
1942 	rcu_read_lock();
1943 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
1944 		spin_lock(&gmap->guest_table_lock);
1945 		gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
1946 		spin_unlock(&gmap->guest_table_lock);
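		/*
		 * host_to_guest_lookup() returns a guest address with the
		 * valid flag in bit 0 set, or 0 if nothing was found; adding
		 * the page-aligned offset keeps that flag intact.
		 */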
1947 		if (!IS_GADDR_VALID(gaddr))
1948 			continue;
1949 
1950 		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
1951 			spin_lock(&gmap->shadow_lock);
1952 			list_for_each_entry_safe(sg, next,
1953 						 &gmap->children, list)
1954 				gmap_shadow_notify(sg, vmaddr, gaddr);
1955 			spin_unlock(&gmap->shadow_lock);
1956 		}
1957 		if (bits & PGSTE_IN_BIT)
1958 			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
1959 	}
1960 	rcu_read_unlock();
1961 }
1962 EXPORT_SYMBOL_GPL(ptep_notify);
1963 
1964 static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
1965 			     unsigned long gaddr)
1966 {
1967 	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
1968 	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
1969 }
1970 
1971 /**
1972  * gmap_pmdp_xchg - exchange a gmap pmd with another
1973  * @gmap: pointer to the guest address space structure
1974  * @pmdp: pointer to the pmd entry
1975  * @new: replacement entry
1976  * @gaddr: the affected guest address
1977  *
1978  * This function is assumed to be called with the guest_table_lock
1979  * held.
1980  */
1981 static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
1982 			   unsigned long gaddr)
1983 {
1984 	gaddr &= HPAGE_MASK;
1985 	pmdp_notify_gmap(gmap, pmdp, gaddr);
1986 	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
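	/*
	 * Flush the old entry with the best instruction available: a
	 * guest-ASCE IDTE if the machine has TLB-guest support, a plain
	 * IDTE if the CPU has it, or CSP as a fallback.
	 */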
1987 	if (machine_has_tlb_guest())
1988 		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
1989 			    IDTE_GLOBAL);
1990 	else if (cpu_has_idte())
1991 		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
1992 	else
1993 		__pmdp_csp(pmdp);
1994 	set_pmd(pmdp, new);
1995 }
1996 
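/*
 * Remove the host-to-guest mapping for @vmaddr from every gmap of @mm,
 * notify about the affected guest segment and clear the pmd entry; with
 * @purge set, the entry is additionally flushed with CSP first.
 */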
1997 static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
1998 			    int purge)
1999 {
2000 	pmd_t *pmdp;
2001 	struct gmap *gmap;
2002 	unsigned long gaddr;
2003 
2004 	rcu_read_lock();
2005 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2006 		spin_lock(&gmap->guest_table_lock);
2007 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2008 		if (pmdp) {
2009 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2010 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2011 						   _SEGMENT_ENTRY_GMAP_UC |
2012 						   _SEGMENT_ENTRY));
2013 			if (purge)
2014 				__pmdp_csp(pmdp);
2015 			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
2016 		}
2017 		spin_unlock(&gmap->guest_table_lock);
2018 	}
2019 	rcu_read_unlock();
2020 }
2021 
2022 /**
2023  * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
2024  *                        flushing
2025  * @mm: pointer to the process mm_struct
2026  * @vmaddr: virtual address in the process address space
2027  */
2028 void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
2029 {
2030 	gmap_pmdp_clear(mm, vmaddr, 0);
2031 }
2032 EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2033 
2034 /**
2035  * gmap_pmdp_csp - csp all affected guest pmd entries
2036  * @mm: pointer to the process mm_struct
2037  * @vmaddr: virtual address in the process address space
2038  */
2039 void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
2040 {
2041 	gmap_pmdp_clear(mm, vmaddr, 1);
2042 }
2043 EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
2044 
2045 /**
 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry with a local TLB flush
2047  * @mm: pointer to the process mm_struct
2048  * @vmaddr: virtual address in the process address space
2049  */
2050 void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2051 {
2052 	unsigned long gaddr;
2053 	struct gmap *gmap;
2054 	pmd_t *pmdp;
2055 
2056 	rcu_read_lock();
2057 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2058 		spin_lock(&gmap->guest_table_lock);
2059 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2060 		if (pmdp) {
2061 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2062 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2063 						   _SEGMENT_ENTRY_GMAP_UC |
2064 						   _SEGMENT_ENTRY));
2065 			if (machine_has_tlb_guest())
2066 				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2067 					    gmap->asce, IDTE_LOCAL);
2068 			else if (cpu_has_idte())
2069 				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2070 			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2071 		}
2072 		spin_unlock(&gmap->guest_table_lock);
2073 	}
2074 	rcu_read_unlock();
2075 }
2076 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2077 
2078 /**
 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry with a global TLB flush
2080  * @mm: pointer to the process mm_struct
2081  * @vmaddr: virtual address in the process address space
2082  */
2083 void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2084 {
2085 	unsigned long gaddr;
2086 	struct gmap *gmap;
2087 	pmd_t *pmdp;
2088 
2089 	rcu_read_lock();
2090 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2091 		spin_lock(&gmap->guest_table_lock);
2092 		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
2093 		if (pmdp) {
2094 			pmdp_notify_gmap(gmap, pmdp, gaddr);
2095 			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2096 						   _SEGMENT_ENTRY_GMAP_UC |
2097 						   _SEGMENT_ENTRY));
2098 			if (machine_has_tlb_guest())
2099 				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2100 					    gmap->asce, IDTE_GLOBAL);
2101 			else if (cpu_has_idte())
2102 				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2103 			else
2104 				__pmdp_csp(pmdp);
2105 			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
2106 		}
2107 		spin_unlock(&gmap->guest_table_lock);
2108 	}
2109 	rcu_read_unlock();
2110 }
2111 EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2112 
2113 /**
2114  * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2115  * @gmap: pointer to guest address space
2116  * @pmdp: pointer to the pmd to be tested
2117  * @gaddr: virtual address in the guest address space
2118  *
2119  * This function is assumed to be called with the guest_table_lock
2120  * held.
2121  */
2122 static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2123 					  unsigned long gaddr)
2124 {
2125 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2126 		return false;
2127 
	/* Already write-protected memory that has not changed is clean */
2129 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2130 	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2131 		return false;
2132 
2133 	/* Clear UC indication and reset protection */
2134 	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
2135 	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
2136 	return true;
2137 }
2138 
2139 /**
2140  * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2141  * @gmap: pointer to guest address space
2142  * @bitmap: dirty bitmap for this pmd
2143  * @gaddr: virtual address in the guest address space
2144  * @vmaddr: virtual address in the host address space
2145  *
2146  * This function is assumed to be called with the guest_table_lock
2147  * held.
2148  */
2149 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2150 			     unsigned long gaddr, unsigned long vmaddr)
2151 {
2152 	int i;
2153 	pmd_t *pmdp;
2154 	pte_t *ptep;
2155 	spinlock_t *ptl;
2156 
2157 	pmdp = gmap_pmd_op_walk(gmap, gaddr);
2158 	if (!pmdp)
2159 		return;
2160 
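	/*
	 * A huge pmd tracks dirtiness per segment, so either all 256 bits of
	 * the bitmap are set or none; otherwise the change-tracking (UC) bit
	 * of each pte is tested and cleared individually.
	 */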
2161 	if (pmd_leaf(*pmdp)) {
2162 		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2163 			bitmap_fill(bitmap, _PAGE_ENTRIES);
2164 	} else {
2165 		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2166 			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2167 			if (!ptep)
2168 				continue;
2169 			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2170 				set_bit(i, bitmap);
2171 			pte_unmap_unlock(ptep, ptl);
2172 		}
2173 	}
2174 	gmap_pmd_op_end(gmap, pmdp);
2175 }
2176 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
2177 
2178 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2179 static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2180 				    unsigned long end, struct mm_walk *walk)
2181 {
2182 	struct vm_area_struct *vma = walk->vma;
2183 
2184 	split_huge_pmd(vma, pmd, addr);
2185 	return 0;
2186 }
2187 
2188 static const struct mm_walk_ops thp_split_walk_ops = {
2189 	.pmd_entry	= thp_split_walk_pmd_entry,
2190 	.walk_lock	= PGWALK_WRLOCK_VERIFY,
2191 };
2192 
2193 static inline void thp_split_mm(struct mm_struct *mm)
2194 {
2195 	struct vm_area_struct *vma;
2196 	VMA_ITERATOR(vmi, mm, 0);
2197 
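	/*
	 * Mark every VMA as not eligible for THP, split the huge pages that
	 * already exist, and set VM_NOHUGEPAGE in def_flags so that future
	 * mappings do not use THP either.
	 */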
2198 	for_each_vma(vmi, vma) {
2199 		vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
2200 		walk_page_vma(vma, &thp_split_walk_ops, NULL);
2201 	}
2202 	mm->def_flags |= VM_NOHUGEPAGE;
2203 }
2204 #else
2205 static inline void thp_split_mm(struct mm_struct *mm)
2206 {
2207 }
2208 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2209 
2210 /*
 * Switch on pgstes for the current userspace process (for KVM).
2212  */
2213 int s390_enable_sie(void)
2214 {
2215 	struct mm_struct *mm = current->mm;
2216 
	/* Do we have pgstes? If yes, we are done */
2218 	if (mm_has_pgste(mm))
2219 		return 0;
2220 	mmap_write_lock(mm);
2221 	mm->context.has_pgste = 1;
2222 	/* split thp mappings and disable thp for future mappings */
2223 	thp_split_mm(mm);
2224 	mmap_write_unlock(mm);
2225 	return 0;
2226 }
2227 EXPORT_SYMBOL_GPL(s390_enable_sie);
2228 
2229 /*
2230  * Enable storage key handling from now on and initialize the storage
2231  * keys with the default key.
2232  */
2233 static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2234 				  unsigned long next, struct mm_walk *walk)
2235 {
2236 	/* Clear storage key */
2237 	ptep_zap_key(walk->mm, addr, pte);
2238 	return 0;
2239 }
2240 
2241 /*
 * Give a chance to schedule after setting the storage keys of 256 pages.
 * We only hold the mm lock, which is an rwsem, and the kvm srcu.
2244  * Both can sleep.
2245  */
2246 static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
2247 				  unsigned long next, struct mm_walk *walk)
2248 {
2249 	cond_resched();
2250 	return 0;
2251 }
2252 
2253 static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2254 				      unsigned long hmask, unsigned long next,
2255 				      struct mm_walk *walk)
2256 {
2257 	pmd_t *pmd = (pmd_t *)pte;
2258 	unsigned long start, end;
2259 	struct folio *folio = page_folio(pmd_page(*pmd));
2260 
2261 	/*
2262 	 * The write check makes sure we do not set a key on shared
2263 	 * memory. This is needed as the walker does not differentiate
2264 	 * between actual guest memory and the process executable or
2265 	 * shared libraries.
2266 	 */
2267 	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2268 	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2269 		return 0;
2270 
2271 	start = pmd_val(*pmd) & HPAGE_MASK;
2272 	end = start + HPAGE_SIZE;
2273 	__storage_key_init_range(start, end);
2274 	set_bit(PG_arch_1, &folio->flags);
2275 	cond_resched();
2276 	return 0;
2277 }
2278 
2279 static const struct mm_walk_ops enable_skey_walk_ops = {
2280 	.hugetlb_entry		= __s390_enable_skey_hugetlb,
2281 	.pte_entry		= __s390_enable_skey_pte,
2282 	.pmd_entry		= __s390_enable_skey_pmd,
2283 	.walk_lock		= PGWALK_WRLOCK,
2284 };
2285 
2286 int s390_enable_skey(void)
2287 {
2288 	struct mm_struct *mm = current->mm;
2289 	int rc = 0;
2290 
2291 	mmap_write_lock(mm);
2292 	if (mm_uses_skeys(mm))
2293 		goto out_up;
2294 
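	/*
	 * Storage keys are a property of the physical page, so page sharing
	 * (KSM, COW sharing) has to be disabled first; otherwise a key set
	 * for one mapping would affect every other user of the shared page.
	 */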
2295 	mm->context.uses_skeys = 1;
2296 	rc = gmap_helper_disable_cow_sharing();
2297 	if (rc) {
2298 		mm->context.uses_skeys = 0;
2299 		goto out_up;
2300 	}
2301 	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2302 
2303 out_up:
2304 	mmap_write_unlock(mm);
2305 	return rc;
2306 }
2307 EXPORT_SYMBOL_GPL(s390_enable_skey);
2308 
2309 /*
2310  * Reset CMMA state, make all pages stable again.
2311  */
2312 static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2313 			     unsigned long next, struct mm_walk *walk)
2314 {
2315 	ptep_zap_unused(walk->mm, addr, pte, 1);
2316 	return 0;
2317 }
2318 
2319 static const struct mm_walk_ops reset_cmma_walk_ops = {
2320 	.pte_entry		= __s390_reset_cmma,
2321 	.walk_lock		= PGWALK_WRLOCK,
2322 };
2323 
2324 void s390_reset_cmma(struct mm_struct *mm)
2325 {
2326 	mmap_write_lock(mm);
2327 	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2328 	mmap_write_unlock(mm);
2329 }
2330 EXPORT_SYMBOL_GPL(s390_reset_cmma);
2331 
2332 #define GATHER_GET_PAGES 32
2333 
2334 struct reset_walk_state {
2335 	unsigned long next;
2336 	unsigned long count;
2337 	unsigned long pfns[GATHER_GET_PAGES];
2338 };
2339 
2340 static int s390_gather_pages(pte_t *ptep, unsigned long addr,
2341 			     unsigned long next, struct mm_walk *walk)
2342 {
2343 	struct reset_walk_state *p = walk->private;
2344 	pte_t pte = READ_ONCE(*ptep);
2345 
2346 	if (pte_present(pte)) {
2347 		/* we have a reference from the mapping, take an extra one */
2348 		get_page(phys_to_page(pte_val(pte)));
2349 		p->pfns[p->count] = phys_to_pfn(pte_val(pte));
2350 		p->next = next;
2351 		p->count++;
2352 	}
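	/* A non-zero return value stops the walk once the array is full. */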
2353 	return p->count >= GATHER_GET_PAGES;
2354 }
2355 
2356 static const struct mm_walk_ops gather_pages_ops = {
2357 	.pte_entry = s390_gather_pages,
2358 	.walk_lock = PGWALK_RDLOCK,
2359 };
2360 
2361 /*
2362  * Call the Destroy secure page UVC on each page in the given array of PFNs.
2363  * Each page needs to have an extra reference, which will be released here.
2364  */
2365 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
2366 {
2367 	struct folio *folio;
2368 	unsigned long i;
2369 
2370 	for (i = 0; i < count; i++) {
2371 		folio = pfn_folio(pfns[i]);
2372 		/* we always have an extra reference */
2373 		uv_destroy_folio(folio);
2374 		/* get rid of the extra reference */
2375 		folio_put(folio);
2376 		cond_resched();
2377 	}
2378 }
2379 EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
2380 
2381 /**
2382  * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
2383  * in the given range of the given address space.
2384  * @mm: the mm to operate on
2385  * @start: the start of the range
2386  * @end: the end of the range
2387  * @interruptible: if not 0, stop when a fatal signal is received
2388  *
2389  * Walk the given range of the given address space and call the destroy
2390  * secure page UVC on each page. Optionally exit early if a fatal signal is
2391  * pending.
2392  *
2393  * Return: 0 on success, -EINTR if the function stopped before completing
2394  */
2395 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
2396 			    unsigned long end, bool interruptible)
2397 {
2398 	struct reset_walk_state state = { .next = start };
2399 	int r = 1;
2400 
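	/*
	 * Gather up to GATHER_GET_PAGES page references while holding the
	 * mmap lock, then destroy them with the lock dropped.
	 * walk_page_range() returns a positive value when the walk was
	 * stopped early because the gather array is full, i.e. while there
	 * is more work to do.
	 */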
2401 	while (r > 0) {
2402 		state.count = 0;
2403 		mmap_read_lock(mm);
2404 		r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
2405 		mmap_read_unlock(mm);
2406 		cond_resched();
2407 		s390_uv_destroy_pfns(state.count, state.pfns);
2408 		if (interruptible && fatal_signal_pending(current))
2409 			return -EINTR;
2410 	}
2411 	return 0;
2412 }
2413 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
2414 
2415 /**
2416  * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
2417  * @gmap: the gmap whose ASCE needs to be replaced
2418  *
 * If the ASCE is a SEGMENT type then this function will return -EINVAL;
 * replacing such an ASCE would leave the pointers in the host_to_guest
 * radix tree pointing to the wrong pages, causing use-after-free and
 * memory corruption.
 * If the allocation of the new top level page table fails, the ASCE is
 * not replaced.
 * On success, the old top level table is no longer referenced by the
 * gmap and is not freed here; the caller has to make sure to save a
 * pointer to the old ASCE beforehand, unless a leak is actually intended.
2427  */
2428 int s390_replace_asce(struct gmap *gmap)
2429 {
2430 	unsigned long asce;
2431 	struct page *page;
2432 	void *table;
2433 
2434 	/* Replacing segment type ASCEs would cause serious issues */
2435 	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
2436 		return -EINVAL;
2437 
2438 	page = gmap_alloc_crst();
2439 	if (!page)
2440 		return -ENOMEM;
2441 	table = page_to_virt(page);
2442 	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
2443 
2444 	/* Set new table origin while preserving existing ASCE control bits */
2445 	asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
2446 	WRITE_ONCE(gmap->asce, asce);
2447 	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
2448 	WRITE_ONCE(gmap->table, table);
2449 
2450 	return 0;
2451 }
2452 EXPORT_SYMBOL_GPL(s390_replace_asce);
2453