// SPDX-License-Identifier: GPL-2.0
/*
 *  Page table allocation functions
 *
 *    Copyright IBM Corp. 2016
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

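/*
 * Allocate a crst table (region or segment table): 2048 entries of eight
 * bytes each, i.e. 1UL << CRST_ALLOC_ORDER pages. The backing pages are
 * flagged for DAT use via __arch_set_page_dat().
 */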
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
	unsigned long *table;

	if (!ptdesc)
		return NULL;
	table = ptdesc_to_virt(ptdesc);
	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
	return table;
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}

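/*
 * Called via on_each_cpu() from crst_table_upgrade(): CPUs whose active_mm
 * is @mm reload the new ASCE into the lowcore and into control register 7
 * (and control register 1 unless the primary ASCE is handled via
 * TIF_ASCE_PRIMARY), then flush their local TLB.
 */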
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;
	struct ctlreg asce;

	/* change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		asce.val = mm->context.asce;
		get_lowcore()->user_asce = asce;
		local_ctl_load(7, &asce);
		if (!test_thread_flag(TIF_ASCE_PRIMARY))
			local_ctl_load(1, &asce);
	}
	__tlb_flush_local();
}

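/*
 * Extend the page table hierarchy of @mm so that it covers addresses up to
 * @end: add a region-second table (4 levels) and/or a region-first table
 * (5 levels) on top of the current topmost table, switch the ASCE, and let
 * every CPU that uses @mm pick up the new ASCE via __crst_table_upgrade().
 */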
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	mmap_assert_write_locked(mm);

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
		pagetable_p4d_ctor(virt_to_ptdesc(p4d));
	}
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
		pagetable_pgd_ctor(virt_to_ptdesc(pgd));
	}

	spin_lock_bh(&mm->page_table_lock);

	if (p4d) {
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	/* p4d is NULL if only the 4 to 5 level upgrade was required */
	if (p4d) {
		pagetable_dtor(virt_to_ptdesc(p4d));
		crst_table_free(mm, p4d);
	}
err_p4d:
	return -ENOMEM;
}

#ifdef CONFIG_PGSTE

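/*
 * Allocate a page table with PGSTEs for a KVM guest mapping: a full page
 * holding PTRS_PER_PTE ptes (set to _PAGE_INVALID) followed by the
 * corresponding page status table entries (cleared).
 */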
struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct ptdesc *ptdesc;
	u64 *table;

	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
	if (ptdesc) {
		table = (u64 *)ptdesc_to_virt(ptdesc);
		__arch_set_page_dat(table, 1);
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	}
	return ptdesc;
}

void page_table_free_pgste(struct ptdesc *ptdesc)
{
	pagetable_free(ptdesc);
}

#endif /* CONFIG_PGSTE */

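/*
 * Allocate a page table for user space. The table occupies a full 4K page:
 * the lower half holds the ptes (initialized to _PAGE_INVALID), the upper
 * half is cleared so it can later serve as the pgste area if required.
 */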
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct ptdesc *ptdesc;
	unsigned long *table;

	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(mm, ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}
	table = ptdesc_to_virt(ptdesc);
	__arch_set_page_dat(table, 1);
	memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
	memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(table);

	pagetable_dtor_free(ptdesc);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
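/*
 * For THP collapse the page table must not be freed immediately: the
 * release is deferred to an RCU callback so that concurrent lockless page
 * table walkers are done with the table before it gets reused.
 */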
static void pte_free_now(struct rcu_head *head)
{
	struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);

	pagetable_dtor_free(ptdesc);
}

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);

	call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
	/*
	 * THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
	 * Turn to the generic pte_free_defer() version once gmap is removed.
	 */
	WARN_ON_ONCE(mm_has_pgste(mm));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */

static struct kmem_cache *base_pgt_cache;

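/*
 * Base page tables are only _PAGE_TABLE_SIZE (2K) bytes and therefore come
 * from a dedicated kmem cache (see base_pgt_cache_init()) instead of the
 * page allocator.
 */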
static unsigned long *base_pgt_alloc(void)
{
	unsigned long *table;

	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
	if (table)
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
	return table;
}

static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}

static unsigned long *base_crst_alloc(unsigned long val)
{
	unsigned long *table;
	struct ptdesc *ptdesc;

	ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
	if (!ptdesc)
		return NULL;
	table = ptdesc_address(ptdesc);
	crst_table_init(table, val);
	return table;
}

static void base_crst_free(unsigned long *table)
{
	if (!table)
		return;
	pagetable_free(virt_to_ptdesc(table));
}

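/*
 * Generate base_<level>_addr_end() helpers for the table walkers below:
 * each returns the end of the current <level>-sized block, clamped to the
 * overall end address.
 */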
#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
						   unsigned long end)	\
{									\
	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
									\
	return (next - 1) < (end - 1) ? next : end;			\
}

BASE_ADDR_END_FUNC(page,    PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)

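/*
 * Translate a virtual address into its real address with the LOAD REAL
 * ADDRESS instruction; base_page_walk() stores the result directly into
 * the base page table entries.
 */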
static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}

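/*
 * The base_*_walk() functions below either populate (@alloc != 0) or free
 * (@alloc == 0) the table hierarchy for the range @addr..@end. At the page
 * table level there is nothing to free (the table itself is freed by the
 * caller), so base_page_walk() only does work when populating.
 */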
static int base_page_walk(unsigned long *origin, unsigned long addr,
			  unsigned long end, int alloc)
{
	unsigned long *pte, next;

	if (!alloc)
		return 0;
	pte = origin;
	pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT;
	do {
		next = base_page_addr_end(addr, end);
		*pte = base_lra(addr);
	} while (pte++, addr = next, addr < end);
	return 0;
}

static int base_segment_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *ste, next, *table;
	int rc;

	ste = origin;
	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	do {
		next = base_segment_addr_end(addr, end);
		if (*ste & _SEGMENT_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_pgt_alloc();
			if (!table)
				return -ENOMEM;
			*ste = __pa(table) | _SEGMENT_ENTRY;
		}
		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
		rc = base_page_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_pgt_free(table);
		cond_resched();
	} while (ste++, addr = next, addr < end);
	return 0;
}

static int base_region3_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rtte, next, *table;
	int rc;

	rtte = origin;
	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
	do {
		next = base_region3_addr_end(addr, end);
		if (*rtte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rtte = __pa(table) | _REGION3_ENTRY;
		}
		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
		rc = base_segment_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rtte++, addr = next, addr < end);
	return 0;
}

static int base_region2_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rste, next, *table;
	int rc;

	rste = origin;
	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
	do {
		next = base_region2_addr_end(addr, end);
		if (*rste & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rste = __pa(table) | _REGION2_ENTRY;
		}
		table = __va(*rste & _REGION_ENTRY_ORIGIN);
		rc = base_region3_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rste++, addr = next, addr < end);
	return 0;
}

static int base_region1_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rfte, next, *table;
	int rc;

	rfte = origin;
	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
	do {
		next = base_region1_addr_end(addr, end);
		if (*rfte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rfte = __pa(table) | _REGION1_ENTRY;
		}
		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
		rc = base_region2_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rfte++, addr = next, addr < end);
	return 0;
}

/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */
void base_asce_free(unsigned long asce)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	if (!asce)
		return;
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_SEGMENT:
		base_segment_walk(table, 0, _REGION3_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION3:
		base_region3_walk(table, 0, _REGION2_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION2:
		base_region2_walk(table, 0, _REGION1_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION1:
		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
		break;
	}
	base_crst_free(table);
}

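/*
 * Create the kmem cache for base page tables on first use; the mutex
 * serializes concurrent callers so the cache is created only once.
 */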
static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}

/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page tables,
 * that can be used to access the virtual kernel mapping. The difference is
 * that the returned asce does not make use of any enhanced DAT features like
 * e.g. large pages. This is required for some I/O functions that pass an
 * asce, like e.g. some service call requests.
 *
 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 *	 used for I/O requests. tlb entries that might result because the
 *	 asce was attached to a cpu won't be cleared.
 */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
	unsigned long asce, *table, end;
	int rc;

	if (base_pgt_cache_init())
		return 0;
	end = addr + num_pages * PAGE_SIZE;
	if (end <= _REGION3_SIZE) {
		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_segment_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION2_SIZE) {
		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region3_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION1_SIZE) {
		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region2_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
	} else {
		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region1_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
	}
	if (rc) {
		base_asce_free(asce);
		asce = 0;
	}
	return asce;
}
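
/*
 * Minimal usage sketch (illustration only, not taken from an in-kernel
 * caller): a driver that needs to pass a non-EDAT asce covering nr_pages
 * pages at the kernel address buf as part of an I/O request could do
 * roughly the following; buf and nr_pages are made-up names:
 *
 *	unsigned long asce;
 *
 *	asce = base_asce_alloc((unsigned long)buf, nr_pages);
 *	if (!asce)
 *		return -ENOMEM;
 *	<submit the request that carries the asce, wait for completion>
 *	base_asce_free(asce);
 *
 * As noted above base_asce_alloc(), the asce must never be attached to
 * a CPU.
 */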