1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Page table allocation functions
4 *
5 * Copyright IBM Corp. 2016
6 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7 */
8
9 #include <linux/sysctl.h>
10 #include <linux/slab.h>
11 #include <linux/mm.h>
12 #include <asm/mmu_context.h>
13 #include <asm/page-states.h>
14 #include <asm/pgalloc.h>
15 #include <asm/tlbflush.h>
16
crst_table_alloc_noprof(struct mm_struct * mm)17 unsigned long *crst_table_alloc_noprof(struct mm_struct *mm)
18 {
19 gfp_t gfp = GFP_KERNEL_ACCOUNT;
20 struct ptdesc *ptdesc;
21 unsigned long *table;
22
23 if (mm == &init_mm)
24 gfp &= ~__GFP_ACCOUNT;
25 ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER);
26 if (!ptdesc)
27 return NULL;
28 table = ptdesc_address(ptdesc);
29 __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
30 return table;
31 }
32
/*
 * Give back a table that was obtained from crst_table_alloc().
 *
 * @mm is not used here. A NULL @table is silently ignored, which lets error
 * paths call this unconditionally.
 */
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	if (table)
		pagetable_free(virt_to_ptdesc(table));
}
39
__crst_table_upgrade(void * arg)40 static void __crst_table_upgrade(void *arg)
41 {
42 struct mm_struct *mm = arg;
43 struct ctlreg asce;
44
45 /* change all active ASCEs to avoid the creation of new TLBs */
46 if (current->active_mm == mm) {
47 asce.val = mm->context.asce;
48 get_lowcore()->user_asce = asce;
49 local_ctl_load(7, &asce);
50 if (!test_thread_flag(TIF_ASCE_PRIMARY))
51 local_ctl_load(1, &asce);
52 }
53 __tlb_flush_local();
54 }
55
crst_table_upgrade(struct mm_struct * mm,unsigned long end)56 int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
57 {
58 unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
59 unsigned long asce_limit = mm->context.asce_limit;
60
61 mmap_assert_write_locked(mm);
62
63 /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
64 VM_BUG_ON(asce_limit < _REGION2_SIZE);
65
66 if (end <= asce_limit)
67 return 0;
68
69 if (asce_limit == _REGION2_SIZE) {
70 p4d = crst_table_alloc(mm);
71 if (unlikely(!p4d))
72 goto err_p4d;
73 crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
74 pagetable_p4d_ctor(virt_to_ptdesc(p4d));
75 }
76 if (end > _REGION1_SIZE) {
77 pgd = crst_table_alloc(mm);
78 if (unlikely(!pgd))
79 goto err_pgd;
80 crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
81 pagetable_pgd_ctor(virt_to_ptdesc(pgd));
82 }
83
84 spin_lock_bh(&mm->page_table_lock);
85
86 if (p4d) {
87 __pgd = (unsigned long *) mm->pgd;
88 p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
89 mm->pgd = (pgd_t *) p4d;
90 mm->context.asce_limit = _REGION1_SIZE;
91 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
92 _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
93 mm_inc_nr_puds(mm);
94 }
95 if (pgd) {
96 __pgd = (unsigned long *) mm->pgd;
97 pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
98 mm->pgd = (pgd_t *) pgd;
99 mm->context.asce_limit = TASK_SIZE_MAX;
100 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
101 _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
102 }
103
104 spin_unlock_bh(&mm->page_table_lock);
105
106 on_each_cpu(__crst_table_upgrade, mm, 0);
107
108 return 0;
109
110 err_pgd:
111 pagetable_dtor(virt_to_ptdesc(p4d));
112 crst_table_free(mm, p4d);
113 err_p4d:
114 return -ENOMEM;
115 }
116
page_table_alloc_noprof(struct mm_struct * mm)117 unsigned long *page_table_alloc_noprof(struct mm_struct *mm)
118 {
119 gfp_t gfp = GFP_KERNEL_ACCOUNT;
120 struct ptdesc *ptdesc;
121 unsigned long *table;
122
123 if (mm == &init_mm)
124 gfp &= ~__GFP_ACCOUNT;
125 ptdesc = pagetable_alloc_noprof(gfp, 0);
126 if (!ptdesc)
127 return NULL;
128 if (!pagetable_pte_ctor(mm, ptdesc)) {
129 pagetable_free(ptdesc);
130 return NULL;
131 }
132 table = ptdesc_address(ptdesc);
133 __arch_set_page_dat(table, 1);
134 memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
135 memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
136 return table;
137 }
138
/*
 * Free the page table at @table.
 *
 * Tables whose descriptor is marked reserved are released through
 * free_reserved_ptdesc(); all others go through pagetable_dtor_free().
 * @mm is not used here.
 */
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct ptdesc *pt = virt_to_ptdesc(table);

	if (pagetable_is_reserved(pt))
		free_reserved_ptdesc(pt);
	else
		pagetable_dtor_free(pt);
}
147
148 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
pte_free_now(struct rcu_head * head)149 static void pte_free_now(struct rcu_head *head)
150 {
151 struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
152
153 pagetable_dtor_free(ptdesc);
154 }
155
/*
 * Free the page table @pgtable after an RCU grace period instead of
 * immediately: the rcu_head embedded in its descriptor is queued with
 * pte_free_now() as the callback. @mm is not used here.
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	struct ptdesc *pt = virt_to_ptdesc(pgtable);

	call_rcu(&pt->pt_rcu_head, pte_free_now);
}
162 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
163
164 /*
165 * Base infrastructure required to generate basic asces, region, segment,
166 * and page tables that do not make use of enhanced features like EDAT1.
167 */
168
169 static struct kmem_cache *base_pgt_cache;
170
base_pgt_alloc(void)171 static unsigned long *base_pgt_alloc(void)
172 {
173 unsigned long *table;
174
175 table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
176 if (table)
177 memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
178 return table;
179 }
180
base_pgt_free(unsigned long * table)181 static void base_pgt_free(unsigned long *table)
182 {
183 kmem_cache_free(base_pgt_cache, table);
184 }
185
base_crst_alloc(unsigned long val)186 static unsigned long *base_crst_alloc(unsigned long val)
187 {
188 unsigned long *table;
189 struct ptdesc *ptdesc;
190
191 ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
192 if (!ptdesc)
193 return NULL;
194 table = ptdesc_address(ptdesc);
195 crst_table_init(table, val);
196 return table;
197 }
198
/* Free a table obtained from base_crst_alloc(); a NULL @table is ignored. */
static void base_crst_free(unsigned long *table)
{
	if (table)
		pagetable_free(virt_to_ptdesc(table));
}
205
206 #define BASE_ADDR_END_FUNC(NAME, SIZE) \
207 static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \
208 unsigned long end) \
209 { \
210 unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \
211 \
212 return (next - 1) < (end - 1) ? next : end; \
213 }
214
BASE_ADDR_END_FUNC(page,PAGE_SIZE)215 BASE_ADDR_END_FUNC(page, PAGE_SIZE)
216 BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
217 BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
218 BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
219 BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
220
221 static inline unsigned long base_lra(unsigned long address)
222 {
223 unsigned long real;
224
225 asm volatile(
226 " lra %0,0(%1)"
227 : "=d" (real) : "a" (address) : "cc");
228 return real;
229 }
230
base_page_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)231 static int base_page_walk(unsigned long *origin, unsigned long addr,
232 unsigned long end, int alloc)
233 {
234 unsigned long *pte, next;
235
236 if (!alloc)
237 return 0;
238 pte = origin;
239 pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT;
240 do {
241 next = base_page_addr_end(addr, end);
242 *pte = base_lra(addr);
243 } while (pte++, addr = next, addr < end);
244 return 0;
245 }
246
base_segment_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)247 static int base_segment_walk(unsigned long *origin, unsigned long addr,
248 unsigned long end, int alloc)
249 {
250 unsigned long *ste, next, *table;
251 int rc;
252
253 ste = origin;
254 ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
255 do {
256 next = base_segment_addr_end(addr, end);
257 if (*ste & _SEGMENT_ENTRY_INVALID) {
258 if (!alloc)
259 continue;
260 table = base_pgt_alloc();
261 if (!table)
262 return -ENOMEM;
263 *ste = __pa(table) | _SEGMENT_ENTRY;
264 }
265 table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
266 rc = base_page_walk(table, addr, next, alloc);
267 if (rc)
268 return rc;
269 if (!alloc)
270 base_pgt_free(table);
271 cond_resched();
272 } while (ste++, addr = next, addr < end);
273 return 0;
274 }
275
base_region3_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)276 static int base_region3_walk(unsigned long *origin, unsigned long addr,
277 unsigned long end, int alloc)
278 {
279 unsigned long *rtte, next, *table;
280 int rc;
281
282 rtte = origin;
283 rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
284 do {
285 next = base_region3_addr_end(addr, end);
286 if (*rtte & _REGION_ENTRY_INVALID) {
287 if (!alloc)
288 continue;
289 table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
290 if (!table)
291 return -ENOMEM;
292 *rtte = __pa(table) | _REGION3_ENTRY;
293 }
294 table = __va(*rtte & _REGION_ENTRY_ORIGIN);
295 rc = base_segment_walk(table, addr, next, alloc);
296 if (rc)
297 return rc;
298 if (!alloc)
299 base_crst_free(table);
300 } while (rtte++, addr = next, addr < end);
301 return 0;
302 }
303
base_region2_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)304 static int base_region2_walk(unsigned long *origin, unsigned long addr,
305 unsigned long end, int alloc)
306 {
307 unsigned long *rste, next, *table;
308 int rc;
309
310 rste = origin;
311 rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
312 do {
313 next = base_region2_addr_end(addr, end);
314 if (*rste & _REGION_ENTRY_INVALID) {
315 if (!alloc)
316 continue;
317 table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
318 if (!table)
319 return -ENOMEM;
320 *rste = __pa(table) | _REGION2_ENTRY;
321 }
322 table = __va(*rste & _REGION_ENTRY_ORIGIN);
323 rc = base_region3_walk(table, addr, next, alloc);
324 if (rc)
325 return rc;
326 if (!alloc)
327 base_crst_free(table);
328 } while (rste++, addr = next, addr < end);
329 return 0;
330 }
331
base_region1_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)332 static int base_region1_walk(unsigned long *origin, unsigned long addr,
333 unsigned long end, int alloc)
334 {
335 unsigned long *rfte, next, *table;
336 int rc;
337
338 rfte = origin;
339 rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
340 do {
341 next = base_region1_addr_end(addr, end);
342 if (*rfte & _REGION_ENTRY_INVALID) {
343 if (!alloc)
344 continue;
345 table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
346 if (!table)
347 return -ENOMEM;
348 *rfte = __pa(table) | _REGION1_ENTRY;
349 }
350 table = __va(*rfte & _REGION_ENTRY_ORIGIN);
351 rc = base_region2_walk(table, addr, next, alloc);
352 if (rc)
353 return rc;
354 if (!alloc)
355 base_crst_free(table);
356 } while (rfte++, addr = next, addr < end);
357 return 0;
358 }
359
360 /**
361 * base_asce_free - free asce and tables returned from base_asce_alloc()
362 * @asce: asce to be freed
363 *
364 * Frees all region, segment, and page tables that were allocated with a
365 * corresponding base_asce_alloc() call.
366 */
base_asce_free(unsigned long asce)367 void base_asce_free(unsigned long asce)
368 {
369 unsigned long *table = __va(asce & _ASCE_ORIGIN);
370
371 if (!asce)
372 return;
373 switch (asce & _ASCE_TYPE_MASK) {
374 case _ASCE_TYPE_SEGMENT:
375 base_segment_walk(table, 0, _REGION3_SIZE, 0);
376 break;
377 case _ASCE_TYPE_REGION3:
378 base_region3_walk(table, 0, _REGION2_SIZE, 0);
379 break;
380 case _ASCE_TYPE_REGION2:
381 base_region2_walk(table, 0, _REGION1_SIZE, 0);
382 break;
383 case _ASCE_TYPE_REGION1:
384 base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
385 break;
386 }
387 base_crst_free(table);
388 }
389
base_pgt_cache_init(void)390 static int base_pgt_cache_init(void)
391 {
392 static DEFINE_MUTEX(base_pgt_cache_mutex);
393 unsigned long sz = _PAGE_TABLE_SIZE;
394
395 if (base_pgt_cache)
396 return 0;
397 mutex_lock(&base_pgt_cache_mutex);
398 if (!base_pgt_cache)
399 base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
400 mutex_unlock(&base_pgt_cache_mutex);
401 return base_pgt_cache ? 0 : -ENOMEM;
402 }
403
404 /**
405 * base_asce_alloc - create kernel mapping without enhanced DAT features
406 * @addr: virtual start address of kernel mapping
407 * @num_pages: number of consecutive pages
408 *
409 * Generate an asce, including all required region, segment and page tables,
410 * that can be used to access the virtual kernel mapping. The difference is
411 * that the returned asce does not make use of any enhanced DAT features like
412 * e.g. large pages. This is required for some I/O functions that pass an
413 * asce, like e.g. some service call requests.
414 *
415 * Note: the returned asce may NEVER be attached to any cpu. It may only be
416 * used for I/O requests. tlb entries that might result because the
417 * asce was attached to a cpu won't be cleared.
418 */
base_asce_alloc(unsigned long addr,unsigned long num_pages)419 unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
420 {
421 unsigned long asce, *table, end;
422 int rc;
423
424 if (base_pgt_cache_init())
425 return 0;
426 end = addr + num_pages * PAGE_SIZE;
427 if (end <= _REGION3_SIZE) {
428 table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
429 if (!table)
430 return 0;
431 rc = base_segment_walk(table, addr, end, 1);
432 asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
433 } else if (end <= _REGION2_SIZE) {
434 table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
435 if (!table)
436 return 0;
437 rc = base_region3_walk(table, addr, end, 1);
438 asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
439 } else if (end <= _REGION1_SIZE) {
440 table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
441 if (!table)
442 return 0;
443 rc = base_region2_walk(table, addr, end, 1);
444 asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
445 } else {
446 table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
447 if (!table)
448 return 0;
449 rc = base_region1_walk(table, addr, end, 1);
450 asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
451 }
452 if (rc) {
453 base_asce_free(asce);
454 asce = 0;
455 }
456 return asce;
457 }
458