xref: /linux/mm/pagewalk.c (revision 2c9e7a5f2e3f398213c0c122c18ffa2f4e192457)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/pagewalk.h>
3 #include <linux/highmem.h>
4 #include <linux/sched.h>
5 #include <linux/hugetlb.h>
6 #include <linux/mmu_context.h>
7 #include <linux/swap.h>
8 
9 #include <asm/tlbflush.h>
10 
11 #include "internal.h"
12 
13 /*
14  * We want to know the real level where a entry is located ignoring any
15  * folding of levels which may be happening. For example if p4d is folded then
16  * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
17  */
real_depth(int depth)18 static int real_depth(int depth)
19 {
20 	if (depth == 3 && PTRS_PER_PMD == 1)
21 		depth = 2;
22 	if (depth == 2 && PTRS_PER_PUD == 1)
23 		depth = 1;
24 	if (depth == 1 && PTRS_PER_P4D == 1)
25 		depth = 0;
26 	return depth;
27 }
28 
/*
 * Visit every PTE covering [@addr, @end), starting from the entry at @pte.
 *
 * When ->install_pte() is provided, each empty (pte_none()) entry is handed to
 * it and the PTE it produces is written back with set_pte_at(). Every other
 * entry is passed to ->pte_entry(), if the ops provide one.
 *
 * Returns 0 once the whole range has been visited, or the first non-zero value
 * returned by a callback, at which point the walk stops.
 */
static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		if (ops->install_pte && pte_none(ptep_get(pte))) {
			pte_t new_pte;

			err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte,
					       walk);
			if (err)
				break;

			set_pte_at(walk->mm, addr, pte, new_pte);
			/* Non-present before, so for arches that need it. */
			if (!WARN_ON_ONCE(walk->no_vma))
				update_mmu_cache(walk->vma, addr, pte);
		} else if (ops->pte_entry) {
			/*
			 * walk_pmd_range() also descends here for install-only
			 * ops, which have no ->pte_entry() to call for entries
			 * that are already populated. Skip those entries
			 * rather than calling through a NULL pointer.
			 */
			err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
			if (err)
				break;
		}
		/* Test against end - PAGE_SIZE so addr never steps past end. */
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}
60 
/*
 * Map the PTE table referenced by @pmd and walk the entries for [@addr, @end).
 *
 * Unless walk->no_vma is set, the table is mapped with its page table lock
 * held for the duration of the walk. If the table cannot be mapped,
 * walk->action is set to ACTION_AGAIN and 0 is returned.
 *
 * Otherwise returns the result of walk_pte_range_inner().
 */
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	spinlock_t *ptl;
	bool kernel_table;
	pte_t *ptep;
	int ret;

	if (!walk->no_vma) {
		ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		if (!ptep)
			goto retry;

		ret = walk_pte_range_inner(ptep, addr, end, walk);
		pte_unmap_unlock(ptep, ptl);
		return ret;
	}

	/*
	 * pte_offset_map() might apply user-specific validation.
	 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
	 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
	 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
	 * So use pte_offset_kernel() for init_mm and for addresses at or
	 * above TASK_SIZE; such a table is not unmapped afterwards.
	 */
	kernel_table = walk->mm == &init_mm || addr >= TASK_SIZE;
	if (kernel_table)
		ptep = pte_offset_kernel(pmd, addr);
	else
		ptep = pte_offset_map(pmd, addr);
	if (!ptep)
		goto retry;

	ret = walk_pte_range_inner(ptep, addr, end, walk);
	if (!kernel_table)
		pte_unmap(ptep);
	return ret;

retry:
	walk->action = ACTION_AGAIN;
	return 0;
}
95 
/*
 * Walk the PMD entries covering [@addr, @end) in the PMD table that @pud
 * points to.
 *
 * Empty entries are reported through ->pte_hole(), or get a PTE table
 * allocated when ->install_pte() is set. All other entries are passed to
 * ->pmd_entry(), which may steer the walk through walk->action, and are then
 * descended into via walk_pte_range() when PTE-level work is required.
 *
 * Returns 0 on success, or the first non-zero value returned by a callback or
 * an allocation. If @pud does not reference a PMD table, walk->action is set
 * to ACTION_AGAIN and 0 is returned.
 */
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t pudval = pudp_get(pud);
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	bool has_handler = ops->pte_entry;
	bool has_install = ops->install_pte;
	int err = 0;
	int depth = real_depth(3);

	/*
	 * For PTE handling, pte_offset_map_lock() takes care of checking
	 * whether there actually is a page table. But it also has to be
	 * very careful about concurrent page table reclaim.
	 *
	 * Similarly, we have to be careful here - a PUD entry that points
	 * to a PMD table cannot go away, so we can just walk it. But if
	 * it's something else, we need to ensure we didn't race something,
	 * so need to retry.
	 *
	 * A pertinent example of this is a PUD refault after PUD split -
	 * we will need to split again or risk accessing invalid memory.
	 */
	if (!pud_present(pudval) || pud_leaf(pudval)) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			/* Either populate the hole or report it. */
			if (has_install)
				err = __pte_alloc(walk->mm, pmd);
			else if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			/* A reported hole has nothing below it to walk. */
			if (!has_install)
				continue;
		}

		/* Default action; ->pmd_entry() may override it. */
		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;
		if (walk->action == ACTION_CONTINUE)
			continue;

		if (!has_handler) { /* No handlers for lower page tables. */
			if (!has_install)
				continue; /* Nothing to do. */
			/*
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
			if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
				continue;
		}

		/*
		 * With a VMA, split any huge PMD so the PTE level can be
		 * walked. Without one, leaf or non-present entries are skipped.
		 */
		if (walk->vma)
			split_huge_pmd(walk->vma, pmd, addr);
		else if (pmd_leaf(*pmd) || !pmd_present(*pmd))
			continue; /* Nothing to do. */

		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;

		/* walk_pte_range() could not map the PTE table: retry. */
		if (walk->action == ACTION_AGAIN)
			goto again;

	} while (pmd++, addr = next, addr != end);

	return err;
}
184 
/*
 * Walk the PUD entries covering [@addr, @end) in the PUD table that @p4d
 * points to.
 *
 * Empty entries are reported through ->pte_hole(), or get a PMD table
 * allocated when ->install_pte() is set. All other entries are passed to
 * ->pud_entry(), which may steer the walk through walk->action, and are then
 * descended into via walk_pmd_range() when lower-level work is required.
 *
 * Returns 0 on success, or the first non-zero value returned by a callback or
 * an allocation.
 */
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	bool has_handler = ops->pmd_entry || ops->pte_entry;
	bool has_install = ops->install_pte;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
 again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			/* Either populate the hole or report it. */
			if (has_install)
				err = __pmd_alloc(walk->mm, pud, addr);
			else if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			/* A reported hole has nothing below it to walk. */
			if (!has_install)
				continue;
		}

		/* Default action; ->pud_entry() may override it. */
		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;
		if (walk->action == ACTION_CONTINUE)
			continue;

		if (!has_handler) { /* No handlers for lower page tables. */
			if (!has_install)
				continue; /* Nothing to do. */
			/*
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
			if (pud_present(*pud) && pud_trans_huge(*pud))
				continue;
		}

		/*
		 * With a VMA, split any huge PUD so the PMD level can be
		 * walked. Without one, leaf or non-present entries are skipped.
		 */
		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		else if (pud_leaf(*pud) || !pud_present(*pud))
			continue; /* Nothing to do. */

		err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;

		/* walk_pmd_range() found no PMD table under this PUD: retry. */
		if (walk->action == ACTION_AGAIN)
			goto again;
	} while (pud++, addr = next, addr != end);

	return err;
}
249 
/*
 * Walk the P4D entries covering [@addr, @end) in the P4D table that @pgd
 * points to.
 *
 * Empty or bad entries are reported through ->pte_hole(), or get a PUD table
 * allocated when ->install_pte() is set. Other entries are passed to
 * ->p4d_entry() and then descended into via walk_pud_range() if any
 * lower-level callback or ->install_pte() is present.
 *
 * Returns 0 on success, or the first non-zero value returned by a callback or
 * an allocation.
 */
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	const bool installing = ops->install_pte;
	/* Only descend when something below this level wants to be called. */
	const bool descend = installing || ops->pud_entry || ops->pmd_entry ||
			     ops->pte_entry;
	const int depth = real_depth(1);
	unsigned long next;
	p4d_t *p4d;
	int ret = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		if (p4d_none_or_clear_bad(p4d)) {
			if (!installing) {
				/* Report the hole and move to the next entry. */
				if (ops->pte_hole)
					ret = ops->pte_hole(addr, next, depth,
							    walk);
				if (ret)
					break;
				continue;
			}

			ret = __pud_alloc(walk->mm, p4d, addr);
			if (ret)
				break;
		}

		if (ops->p4d_entry) {
			ret = ops->p4d_entry(p4d, addr, next, walk);
			if (ret)
				break;
		}

		if (!descend)
			continue;

		ret = walk_pud_range(p4d, addr, next, walk);
		if (ret)
			break;
	} while (p4d++, addr = next, addr != end);

	return ret;
}
287 
walk_pgd_range(unsigned long addr,unsigned long end,struct mm_walk * walk)288 static int walk_pgd_range(unsigned long addr, unsigned long end,
289 			  struct mm_walk *walk)
290 {
291 	pgd_t *pgd;
292 	unsigned long next;
293 	const struct mm_walk_ops *ops = walk->ops;
294 	bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
295 		ops->pte_entry;
296 	bool has_install = ops->install_pte;
297 	int err = 0;
298 
299 	if (walk->pgd)
300 		pgd = walk->pgd + pgd_index(addr);
301 	else
302 		pgd = pgd_offset(walk->mm, addr);
303 	do {
304 		next = pgd_addr_end(addr, end);
305 		if (pgd_none_or_clear_bad(pgd)) {
306 			if (has_install)
307 				err = __p4d_alloc(walk->mm, pgd, addr);
308 			else if (ops->pte_hole)
309 				err = ops->pte_hole(addr, next, 0, walk);
310 			if (err)
311 				break;
312 			if (!has_install)
313 				continue;
314 		}
315 		if (ops->pgd_entry) {
316 			err = ops->pgd_entry(pgd, addr, next, walk);
317 			if (err)
318 				break;
319 		}
320 		if (has_handler || has_install)
321 			err = walk_p4d_range(pgd, addr, next, walk);
322 		if (err)
323 			break;
324 	} while (pgd++, addr = next, addr != end);
325 
326 	return err;
327 }
328 
329 #ifdef CONFIG_HUGETLB_PAGE
/*
 * Return the first address past the huge page (of hstate @h) containing
 * @addr, clamped so that it never exceeds @end.
 */
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long entry_start = addr & huge_page_mask(h);
	unsigned long entry_end = entry_start + huge_page_size(h);

	return entry_end < end ? entry_end : end;
}
337 
walk_hugetlb_range(unsigned long addr,unsigned long end,struct mm_walk * walk)338 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
339 			      struct mm_walk *walk)
340 {
341 	struct vm_area_struct *vma = walk->vma;
342 	struct hstate *h = hstate_vma(vma);
343 	unsigned long next;
344 	unsigned long hmask = huge_page_mask(h);
345 	unsigned long sz = huge_page_size(h);
346 	pte_t *pte;
347 	const struct mm_walk_ops *ops = walk->ops;
348 	int err = 0;
349 
350 	hugetlb_vma_lock_read(vma);
351 	do {
352 		next = hugetlb_entry_end(h, addr, end);
353 		pte = hugetlb_walk(vma, addr & hmask, sz);
354 		if (pte)
355 			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
356 		else if (ops->pte_hole)
357 			err = ops->pte_hole(addr, next, -1, walk);
358 		if (err)
359 			break;
360 	} while (addr = next, addr != end);
361 	hugetlb_vma_unlock_read(vma);
362 
363 	return err;
364 }
365 
366 #else /* CONFIG_HUGETLB_PAGE */
/*
 * Stub used when CONFIG_HUGETLB_PAGE is not set: there is nothing to walk,
 * so always report success.
 */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}
372 
373 #endif /* CONFIG_HUGETLB_PAGE */
374 
375 /*
376  * Decide whether we really walk over the current vma on [@start, @end)
377  * or skip it via the returned value. Return 0 if we do walk over the
378  * current vma, and return 1 if we skip the vma. Negative values means
379  * error, where we abort the current walk.
380  */
walk_page_test(unsigned long start,unsigned long end,struct mm_walk * walk)381 static int walk_page_test(unsigned long start, unsigned long end,
382 			struct mm_walk *walk)
383 {
384 	struct vm_area_struct *vma = walk->vma;
385 	const struct mm_walk_ops *ops = walk->ops;
386 
387 	if (ops->test_walk)
388 		return ops->test_walk(start, end, walk);
389 
390 	/*
391 	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
392 	 * range, so we don't walk over it as we do for normal vmas. However,
393 	 * Some callers are interested in handling hole range and they don't
394 	 * want to just ignore any single address range. Such users certainly
395 	 * define their ->pte_hole() callbacks, so let's delegate them to handle
396 	 * vma(VM_PFNMAP).
397 	 */
398 	if (vma->vm_flags & VM_PFNMAP) {
399 		int err = 1;
400 		if (ops->pte_hole)
401 			err = ops->pte_hole(start, end, -1, walk);
402 		return err ? err : 1;
403 	}
404 	return 0;
405 }
406 
__walk_page_range(unsigned long start,unsigned long end,struct mm_walk * walk)407 static int __walk_page_range(unsigned long start, unsigned long end,
408 			struct mm_walk *walk)
409 {
410 	int err = 0;
411 	struct vm_area_struct *vma = walk->vma;
412 	const struct mm_walk_ops *ops = walk->ops;
413 	bool is_hugetlb = is_vm_hugetlb_page(vma);
414 
415 	/* We do not support hugetlb PTE installation. */
416 	if (ops->install_pte && is_hugetlb)
417 		return -EINVAL;
418 
419 	if (ops->pre_vma) {
420 		err = ops->pre_vma(start, end, walk);
421 		if (err)
422 			return err;
423 	}
424 
425 	if (is_hugetlb) {
426 		if (ops->hugetlb_entry)
427 			err = walk_hugetlb_range(start, end, walk);
428 	} else
429 		err = walk_pgd_range(start, end, walk);
430 
431 	if (ops->post_vma)
432 		ops->post_vma(walk);
433 
434 	return err;
435 }
436 
process_mm_walk_lock(struct mm_struct * mm,enum page_walk_lock walk_lock)437 static inline void process_mm_walk_lock(struct mm_struct *mm,
438 					enum page_walk_lock walk_lock)
439 {
440 	if (walk_lock == PGWALK_RDLOCK)
441 		mmap_assert_locked(mm);
442 	else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
443 		mmap_assert_write_locked(mm);
444 }
445 
/*
 * Apply the per-VMA part of @walk_lock to @vma: take the vma write lock for
 * PGWALK_WRLOCK, or assert the expected lock state for the *_VERIFY modes.
 * Does nothing unless CONFIG_PER_VMA_LOCK is enabled.
 */
static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
	if (walk_lock == PGWALK_WRLOCK)
		vma_start_write(vma);
	else if (walk_lock == PGWALK_WRLOCK_VERIFY)
		vma_assert_write_locked(vma);
	else if (walk_lock == PGWALK_VMA_RDLOCK_VERIFY)
		vma_assert_locked(vma);
	/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
#endif
}
466 
/*
 * See the comment for walk_page_range(). This performs the heavy lifting of
 * that operation, but does not check whether the walk operations are safe
 * (see check_ops_safe()), so it places no restrictions on how the walk
 * proceeds.
 *
 * We usually restrict the ability to install PTEs, but this functionality is
 * available to internal memory management code and provided in mm/internal.h.
 *
 * Returns -EINVAL if the range is empty or @mm is NULL, otherwise 0 or the
 * first non-zero value that terminated the walk.
 */
int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else { /* inside vma */
			process_vma_walk_lock(vma, ops->walk_lock);
			walk.vma = vma;
			next = min(end, vma->vm_end);
			/* Look up the following vma now, for the next pass. */
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
			err = __walk_page_range(start, next, &walk);
		}
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
532 
533 /*
534  * Determine if the walk operations specified are permitted to be used for a
535  * page table walk.
536  *
537  * This check is performed on all functions which are parameterised by walk
538  * operations and exposed in include/linux/pagewalk.h.
539  *
540  * Internal memory management code can use *_unsafe() functions to be able to
541  * use all page walking operations.
542  */
check_ops_safe(const struct mm_walk_ops * ops)543 static bool check_ops_safe(const struct mm_walk_ops *ops)
544 {
545 	/*
546 	 * The installation of PTEs is solely under the control of memory
547 	 * management logic and subject to many subtle locking, security and
548 	 * cache considerations so we cannot permit other users to do so, and
549 	 * certainly not for exported symbols.
550 	 */
551 	if (ops->install_pte)
552 		return false;
553 
554 	return true;
555 }
556 
557 /**
558  * walk_page_range - walk page table with caller specific callbacks
559  * @mm:		mm_struct representing the target process of page table walk
560  * @start:	start address of the virtual address range
561  * @end:	end address of the virtual address range
562  * @ops:	operation to call during the walk
563  * @private:	private data for callbacks' usage
564  *
565  * Recursively walk the page table tree of the process represented by @mm
566  * within the virtual address range [@start, @end). During walking, we can do
567  * some caller-specific works for each entry, by setting up pmd_entry(),
568  * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
569  * callbacks, the associated entries/pages are just ignored.
570  * The return values of these callbacks are commonly defined like below:
571  *
572  *  - 0  : succeeded to handle the current entry, and if you don't reach the
573  *         end address yet, continue to walk.
574  *  - >0 : succeeded to handle the current entry, and return to the caller
575  *         with caller specific value.
576  *  - <0 : failed to handle the current entry, and return to the caller
577  *         with error code.
578  *
579  * Before starting to walk page table, some callers want to check whether
580  * they really want to walk over the current vma, typically by checking
581  * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
582  * purpose.
583  *
584  * If operations need to be staged before and committed after a vma is walked,
585  * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
586  * since it is intended to handle commit-type operations, can't return any
587  * errors.
588  *
589  * struct mm_walk keeps current values of some common data like vma and pmd,
590  * which are useful for the access from callbacks. If you want to pass some
591  * caller-specific data to callbacks, @private should be helpful.
592  *
593  * Locking:
594  *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
595  *   because these function traverse vma list and/or access to vma's data.
596  */
walk_page_range(struct mm_struct * mm,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,void * private)597 int walk_page_range(struct mm_struct *mm, unsigned long start,
598 		unsigned long end, const struct mm_walk_ops *ops,
599 		void *private)
600 {
601 	if (!check_ops_safe(ops))
602 		return -EINVAL;
603 
604 	return walk_page_range_mm_unsafe(mm, start, end, ops, private);
605 }
606 
/**
 * walk_kernel_page_table_range - walk a range of kernel pagetables.
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @pgd:	pgd to walk if different from mm->pgd
 * @private:	private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking kernel page tables or page tables for firmware.
 *
 * Note: Be careful when walking the kernel page tables. The caller may need
 * to take other effective measures (the mmap lock may be insufficient) to
 * prevent the intermediate kernel page tables belonging to the specified
 * address range from being freed (e.g. memory hot-remove).
 *
 * Asserts that the init_mm mmap lock is held and then defers to
 * walk_kernel_page_table_range_lockless(), whose return value is passed on.
 */
int walk_kernel_page_table_range(unsigned long start, unsigned long end,
		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
{
	/*
	 * Kernel intermediate page tables are usually not freed, so the mmap
	 * read lock is sufficient. But there are some exceptions.
	 * E.g. memory hot-remove. In which case, the mmap lock is insufficient
	 * to prevent the intermediate kernel pages tables belonging to the
	 * specified address range from being freed. The caller should take
	 * other actions to prevent this race.
	 */
	mmap_assert_locked(&init_mm);

	return walk_kernel_page_table_range_lockless(start, end, ops, pgd,
						     private);
}
641 
642 /*
643  * Use this function to walk the kernel page tables locklessly. It should be
644  * guaranteed that the caller has exclusive access over the range they are
645  * operating on - that there should be no concurrent access, for example,
646  * changing permissions for vmalloc objects.
647  */
walk_kernel_page_table_range_lockless(unsigned long start,unsigned long end,const struct mm_walk_ops * ops,pgd_t * pgd,void * private)648 int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
649 		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
650 {
651 	struct mm_walk walk = {
652 		.ops		= ops,
653 		.mm		= &init_mm,
654 		.pgd		= pgd,
655 		.private	= private,
656 		.no_vma		= true
657 	};
658 
659 	if (start >= end)
660 		return -EINVAL;
661 	if (!check_ops_safe(ops))
662 		return -EINVAL;
663 
664 	return walk_pgd_range(start, end, &walk);
665 }
666 
667 /**
668  * walk_page_range_debug - walk a range of pagetables not backed by a vma
669  * @mm:		mm_struct representing the target process of page table walk
670  * @start:	start address of the virtual address range
671  * @end:	end address of the virtual address range
672  * @ops:	operation to call during the walk
673  * @pgd:	pgd to walk if different from mm->pgd
674  * @private:	private data for callbacks' usage
675  *
676  * Similar to walk_page_range() but can walk any page tables even if they are
677  * not backed by VMAs. Because 'unusual' entries may be walked this function
678  * will also not lock the PTEs for the pte_entry() callback.
679  *
680  * This is for debugging purposes ONLY.
681  */
walk_page_range_debug(struct mm_struct * mm,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,pgd_t * pgd,void * private)682 int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
683 			  unsigned long end, const struct mm_walk_ops *ops,
684 			  pgd_t *pgd, void *private)
685 {
686 	struct mm_walk walk = {
687 		.ops		= ops,
688 		.mm		= mm,
689 		.pgd		= pgd,
690 		.private	= private,
691 		.no_vma		= true
692 	};
693 
694 	/* For convenience, we allow traversal of kernel mappings. */
695 	if (mm == &init_mm)
696 		return walk_kernel_page_table_range(start, end, ops,
697 						    pgd, private);
698 	if (start >= end || !walk.mm)
699 		return -EINVAL;
700 	if (!check_ops_safe(ops))
701 		return -EINVAL;
702 
703 	/*
704 	 * The mmap lock protects the page walker from changes to the page
705 	 * tables during the walk.  However a read lock is insufficient to
706 	 * protect those areas which don't have a VMA as munmap() detaches
707 	 * the VMAs before downgrading to a read lock and actually tearing
708 	 * down PTEs/page tables. In which case, the mmap write lock should
709 	 * be held.
710 	 */
711 	mmap_assert_write_locked(mm);
712 
713 	return walk_pgd_range(start, end, &walk);
714 }
715 
walk_page_range_vma_unsafe(struct vm_area_struct * vma,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,void * private)716 int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
717 		unsigned long end, const struct mm_walk_ops *ops, void *private)
718 {
719 	struct mm_walk walk = {
720 		.ops		= ops,
721 		.mm		= vma->vm_mm,
722 		.vma		= vma,
723 		.private	= private,
724 	};
725 
726 	if (start >= end || !walk.mm)
727 		return -EINVAL;
728 	if (start < vma->vm_start || end > vma->vm_end)
729 		return -EINVAL;
730 
731 	process_mm_walk_lock(walk.mm, ops->walk_lock);
732 	process_vma_walk_lock(vma, ops->walk_lock);
733 	return __walk_page_range(start, end, &walk);
734 }
735 
walk_page_range_vma(struct vm_area_struct * vma,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,void * private)736 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
737 			unsigned long end, const struct mm_walk_ops *ops,
738 			void *private)
739 {
740 	if (!check_ops_safe(ops))
741 		return -EINVAL;
742 
743 	return walk_page_range_vma_unsafe(vma, start, end, ops, private);
744 }
745 
walk_page_vma(struct vm_area_struct * vma,const struct mm_walk_ops * ops,void * private)746 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
747 		void *private)
748 {
749 	struct mm_walk walk = {
750 		.ops		= ops,
751 		.mm		= vma->vm_mm,
752 		.vma		= vma,
753 		.private	= private,
754 	};
755 
756 	if (!walk.mm)
757 		return -EINVAL;
758 	if (!check_ops_safe(ops))
759 		return -EINVAL;
760 
761 	process_mm_walk_lock(walk.mm, ops->walk_lock);
762 	process_vma_walk_lock(vma, ops->walk_lock);
763 	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
764 }
765 
766 /**
767  * walk_page_mapping - walk all memory areas mapped into a struct address_space.
768  * @mapping: Pointer to the struct address_space
769  * @first_index: First page offset in the address_space
770  * @nr: Number of incremental page offsets to cover
771  * @ops:	operation to call during the walk
772  * @private:	private data for callbacks' usage
773  *
774  * This function walks all memory areas mapped into a struct address_space.
775  * The walk is limited to only the given page-size index range, but if
776  * the index boundaries cross a huge page-table entry, that entry will be
777  * included.
778  *
779  * Also see walk_page_range() for additional information.
780  *
781  * Locking:
782  *   This function can't require that the struct mm_struct::mmap_lock is held,
783  *   since @mapping may be mapped by multiple processes. Instead
784  *   @mapping->i_mmap_rwsem must be held. This might have implications in the
785  *   callbacks, and it's up tho the caller to ensure that the
786  *   struct mm_struct::mmap_lock is not needed.
787  *
788  *   Also this means that a caller can't rely on the struct
789  *   vm_area_struct::vm_flags to be constant across a call,
790  *   except for immutable flags. Callers requiring this shouldn't use
791  *   this function.
792  *
793  * Return: 0 on success, negative error code on failure, positive number on
794  * caller defined premature termination.
795  */
walk_page_mapping(struct address_space * mapping,pgoff_t first_index,pgoff_t nr,const struct mm_walk_ops * ops,void * private)796 int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
797 		      pgoff_t nr, const struct mm_walk_ops *ops,
798 		      void *private)
799 {
800 	struct mm_walk walk = {
801 		.ops		= ops,
802 		.private	= private,
803 	};
804 	struct vm_area_struct *vma;
805 	pgoff_t vba, vea, cba, cea;
806 	unsigned long start_addr, end_addr;
807 	int err = 0;
808 
809 	if (!check_ops_safe(ops))
810 		return -EINVAL;
811 
812 	lockdep_assert_held(&mapping->i_mmap_rwsem);
813 	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
814 				  first_index + nr - 1) {
815 		/* Clip to the vma */
816 		vba = vma->vm_pgoff;
817 		vea = vba + vma_pages(vma);
818 		cba = first_index;
819 		cba = max(cba, vba);
820 		cea = first_index + nr;
821 		cea = min(cea, vea);
822 
823 		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
824 		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
825 		if (start_addr >= end_addr)
826 			continue;
827 
828 		walk.vma = vma;
829 		walk.mm = vma->vm_mm;
830 
831 		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
832 		if (err > 0) {
833 			err = 0;
834 			break;
835 		} else if (err < 0)
836 			break;
837 
838 		err = __walk_page_range(start_addr, end_addr, &walk);
839 		if (err)
840 			break;
841 	}
842 
843 	return err;
844 }
845 
/**
 * folio_walk_start - walk the page tables to a folio
 * @fw: filled with information on success.
 * @vma: the VMA.
 * @addr: the virtual address to use for the page table walk.
 * @flags: flags modifying which folios to walk to.
 *
 * Walk the page tables using @addr in a given @vma to a mapped folio and
 * return the folio, making sure that the page table entry referenced by
 * @addr cannot change until folio_walk_end() is called.
 *
 * By default, this function returns only folios that are not special (e.g., not
 * the zeropage) and never returns folios that are supposed to be ignored by the
 * VM as documented by vm_normal_page(). If requested, zeropages will be
 * returned as well.
 *
 * If this function returns NULL it might either indicate "there is nothing" or
 * "there is nothing suitable".
 *
 * On success, @fw is filled and the function returns the folio while the PTL
 * is still held and folio_walk_end() must be called to clean up,
 * releasing any held locks. The returned folio must *not* be used after the
 * call to folio_walk_end(), unless a short-term folio reference is taken before
 * that call.
 *
 * @fw->page will correspond to the page that is effectively referenced by
 * @addr. However, for shared zeropages @fw->page is set to NULL. Note that
 * large folios might be mapped by multiple page table entries, and this
 * function will always only lookup a single entry as specified by @addr, which
 * might or might not cover more than a single page of the returned folio.
 *
 * This function must *not* be used as a naive replacement for
 * get_user_pages() / pin_user_pages(), especially not to perform DMA or
 * to carelessly modify page content. This function may *only* be used to grab
 * short-term folio references, never to grab long-term folio references.
 *
 * Using the page table entry pointers in @fw for reading or modifying the
 * entry should be avoided where possible: however, there might be valid
 * use cases.
 *
 * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
 * For example, PMD page table sharing might require prior unsharing. Also,
 * logical hugetlb entries might span multiple physical page table entries,
 * which *must* be modified in a single operation (set_huge_pte_at(),
 * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
 * not correspond to the first physical entry of a logical hugetlb entry.
 *
 * The mmap lock must be held in read mode.
 *
 * Return: folio pointer on success, otherwise NULL.
 */
struct folio *folio_walk_start(struct folio_walk *fw,
		struct vm_area_struct *vma, unsigned long addr,
		folio_walk_flags_t flags)
{
	unsigned long entry_size;
	bool zeropage = false;
	struct page *page;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgd_t *pgdp;
	p4d_t *p4dp;

	mmap_assert_locked(vma->vm_mm);
	vma_pgtable_walk_begin(vma);

	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
		goto not_found;

	pgdp = pgd_offset(vma->vm_mm, addr);
	if (pgd_none_or_clear_bad(pgdp))
		goto not_found;

	p4dp = p4d_offset(pgdp, addr);
	if (p4d_none_or_clear_bad(p4dp))
		goto not_found;

	pudp = pud_offset(p4dp, addr);
	pud = pudp_get(pudp);
	if (pud_none(pud))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pud_present(pud) || pud_leaf(pud))) {
		/* Re-read the entry now that the PUD lock is held. */
		ptl = pud_lock(vma->vm_mm, pudp);
		pud = pudp_get(pudp);

		entry_size = PUD_SIZE;
		fw->level = FW_LEVEL_PUD;
		fw->pudp = pudp;
		fw->pud = pud;

		if (pud_none(pud)) {
			spin_unlock(ptl);
			goto not_found;
		} else if (pud_present(pud) && !pud_leaf(pud)) {
			/* Now points to a PMD table: continue one level down. */
			spin_unlock(ptl);
			goto pmd_table;
		} else if (pud_present(pud)) {
			page = vm_normal_page_pud(vma, addr, pud);
			if (page)
				goto found;
		}
		spin_unlock(ptl);
		goto not_found;
	}

pmd_table:
	VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
	pmdp = pmd_offset(pudp, addr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pmd_present(pmd) || pmd_leaf(pmd))) {
		/* Re-read the entry now that the PMD lock is held. */
		ptl = pmd_lock(vma->vm_mm, pmdp);
		pmd = pmdp_get(pmdp);

		entry_size = PMD_SIZE;
		fw->level = FW_LEVEL_PMD;
		fw->pmdp = pmdp;
		fw->pmd = pmd;

		if (pmd_none(pmd)) {
			spin_unlock(ptl);
			goto not_found;
		} else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
			/* Now points to a PTE table: continue one level down. */
			spin_unlock(ptl);
			goto pte_table;
		} else if (pmd_present(pmd)) {
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (page) {
				goto found;
			} else if ((flags & FW_ZEROPAGE) &&
				    is_huge_zero_pmd(pmd)) {
				/* Huge zeropage, returned only on request. */
				page = pfn_to_page(pmd_pfn(pmd));
				zeropage = true;
				goto found;
			}
		}
		spin_unlock(ptl);
		goto not_found;
	}

pte_table:
	VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
	if (!ptep)
		goto not_found;
	pte = ptep_get(ptep);

	entry_size = PAGE_SIZE;
	fw->level = FW_LEVEL_PTE;
	fw->ptep = ptep;
	fw->pte = pte;

	if (pte_present(pte)) {
		page = vm_normal_page(vma, addr, pte);
		if (page)
			goto found;
		if ((flags & FW_ZEROPAGE) &&
		    is_zero_pfn(pte_pfn(pte))) {
			/* Zeropage, returned only on request. */
			page = pfn_to_page(pte_pfn(pte));
			zeropage = true;
			goto found;
		}
	}
	pte_unmap_unlock(ptep, ptl);
not_found:
	/* Every lock taken above has been dropped by the time we get here. */
	vma_pgtable_walk_end(vma);
	return NULL;
found:
	if (!zeropage)
		/* Note: Offset from the mapped page, not the folio start. */
		fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT);
	else
		fw->page = NULL;
	/* The lock stays held; it is recorded in @fw for folio_walk_end(). */
	fw->ptl = ptl;
	return page_folio(page);
}
1027