1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/pagewalk.h>
3 #include <linux/highmem.h>
4 #include <linux/sched.h>
5 #include <linux/hugetlb.h>
6 #include <linux/mmu_context.h>
7 #include <linux/swap.h>
8
9 #include <asm/tlbflush.h>
10
11 #include "internal.h"
12
/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
real_depth(int depth)18 static int real_depth(int depth)
19 {
20 if (depth == 3 && PTRS_PER_PMD == 1)
21 depth = 2;
22 if (depth == 2 && PTRS_PER_PUD == 1)
23 depth = 1;
24 if (depth == 1 && PTRS_PER_P4D == 1)
25 depth = 0;
26 return depth;
27 }
28
/*
 * Walk the PTEs covering [@addr, @end), starting at @pte and advancing one
 * PAGE_SIZE step at a time.
 *
 * For each entry: if ->install_pte() is provided and the entry is none, the
 * callback is asked for a new PTE value, which is then installed; otherwise
 * the entry is handed to ->pte_entry(), if one is provided.
 *
 * Returns 0 once the whole range has been processed, or the first non-zero
 * value returned by a callback, at which point the walk stops.
 */
static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		if (ops->install_pte && pte_none(ptep_get(pte))) {
			pte_t new_pte;

			err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte,
					       walk);
			if (err)
				break;

			set_pte_at(walk->mm, addr, pte, new_pte);
			/* Non-present before, so for arches that need it. */
			if (!WARN_ON_ONCE(walk->no_vma))
				update_mmu_cache(walk->vma, addr, pte);
		} else if (ops->pte_entry) {
			/*
			 * walk_pmd_range() descends here for install-only
			 * walks too (->install_pte() without ->pte_entry()),
			 * and such a walk can still meet populated entries,
			 * so the handler must be checked before it is called.
			 */
			err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
			if (err)
				break;
		}
		/* Stop once the last page of the range has been handled. */
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}
60
/*
 * Obtain the PTE table referenced by @pmd and walk [@addr, @end) within it.
 *
 * VMA-backed walks map the table and hold its lock for the duration of the
 * inner walk; no_vma walks do not take the lock. If no PTE table could be
 * obtained, walk->action is set to ACTION_AGAIN and 0 is returned.
 */
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	spinlock_t *ptl;
	bool kernel_pte;
	pte_t *ptep;
	int ret;

	if (!walk->no_vma) {
		ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		if (!ptep)
			goto retry;

		ret = walk_pte_range_inner(ptep, addr, end, walk);
		pte_unmap_unlock(ptep, ptl);
		return ret;
	}

	/*
	 * pte_offset_map() might apply user-specific validation.
	 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
	 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
	 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
	 */
	kernel_pte = walk->mm == &init_mm || addr >= TASK_SIZE;
	if (kernel_pte)
		ptep = pte_offset_kernel(pmd, addr);
	else
		ptep = pte_offset_map(pmd, addr);
	if (!ptep)
		goto retry;

	ret = walk_pte_range_inner(ptep, addr, end, walk);
	/* Only the pte_offset_map() path needs a matching unmap. */
	if (!kernel_pte)
		pte_unmap(ptep);
	return ret;

retry:
	walk->action = ACTION_AGAIN;
	return 0;
}
95
/*
 * Walk the PMD entries covering [@addr, @end) below @pud, calling
 * ->pte_hole(), ->pmd_entry() and the PTE-level walk as appropriate.
 *
 * Returns 0 on completion or the first non-zero callback/allocation result.
 * May set walk->action to ACTION_AGAIN to ask the caller to retry.
 */
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t pudval = pudp_get(pud);
	const struct mm_walk_ops *ops = walk->ops;
	const bool walks_ptes = ops->pte_entry;
	const bool installing = ops->install_pte;
	const int depth = real_depth(3);
	unsigned long next;
	pmd_t *pmd;
	int ret = 0;

	/*
	 * For PTE handling, pte_offset_map_lock() takes care of checking
	 * whether there actually is a page table. But it also has to be
	 * very careful about concurrent page table reclaim.
	 *
	 * Similarly, we have to be careful here - a PUD entry that points
	 * to a PMD table cannot go away, so we can just walk it. But if
	 * it's something else, we need to ensure we didn't race something,
	 * so need to retry.
	 *
	 * A pertinent example of this is a PUD refault after PUD split -
	 * we will need to split again or risk accessing invalid memory.
	 */
	if (!pud_present(pudval) || pud_leaf(pudval)) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			if (!installing) {
				/* Report the hole if asked to, then move on. */
				if (ops->pte_hole)
					ret = ops->pte_hole(addr, next, depth,
							    walk);
				if (ret)
					break;
				continue;
			}
			/* Installing: call __pte_alloc() before descending. */
			ret = __pte_alloc(walk->mm, pmd);
			if (ret)
				break;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry) {
			ret = ops->pmd_entry(pmd, addr, next, walk);
			if (ret)
				break;
		}

		if (walk->action == ACTION_AGAIN)
			goto again;
		if (walk->action == ACTION_CONTINUE)
			continue;

		if (!walks_ptes) {	/* No handlers for lower page tables. */
			if (!installing)
				continue;	/* Nothing to do. */
			/*
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
			if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
				continue;
		}

		if (walk->vma)
			split_huge_pmd(walk->vma, pmd, addr);
		else if (pmd_leaf(*pmd) || !pmd_present(*pmd))
			continue;	/* Nothing to do. */

		ret = walk_pte_range(pmd, addr, next, walk);
		if (ret)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

	} while (pmd++, addr = next, addr != end);

	return ret;
}
184
/*
 * Walk the PUD entries covering [@addr, @end) below @p4d, calling
 * ->pte_hole(), ->pud_entry() and the PMD-level walk as appropriate.
 *
 * Returns 0 on completion or the first non-zero callback/allocation result.
 */
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	const bool walks_lower = ops->pmd_entry || ops->pte_entry;
	const bool installing = ops->install_pte;
	const int depth = real_depth(2);
	unsigned long next;
	pud_t *pud;
	int ret = 0;

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			if (!installing) {
				/* Report the hole if asked to, then move on. */
				if (ops->pte_hole)
					ret = ops->pte_hole(addr, next, depth,
							    walk);
				if (ret)
					break;
				continue;
			}
			/* Installing: call __pmd_alloc() before descending. */
			ret = __pmd_alloc(walk->mm, pud, addr);
			if (ret)
				break;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry) {
			ret = ops->pud_entry(pud, addr, next, walk);
			if (ret)
				break;
		}

		if (walk->action == ACTION_AGAIN)
			goto again;
		if (walk->action == ACTION_CONTINUE)
			continue;

		if (!walks_lower) {	/* No handlers for lower page tables. */
			if (!installing)
				continue;	/* Nothing to do. */
			/*
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
			if (pud_present(*pud) && pud_trans_huge(*pud))
				continue;
		}

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		else if (pud_leaf(*pud) || !pud_present(*pud))
			continue;	/* Nothing to do. */

		ret = walk_pmd_range(pud, addr, next, walk);
		if (ret)
			break;

		/* The PMD level may ask for this entry to be looked at again. */
		if (walk->action == ACTION_AGAIN)
			goto again;
	} while (pud++, addr = next, addr != end);

	return ret;
}
249
/*
 * Walk the P4D entries covering [@addr, @end) below @pgd, calling
 * ->pte_hole(), ->p4d_entry() and the PUD-level walk as appropriate.
 *
 * Returns 0 on completion or the first non-zero callback/allocation result.
 */
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	const bool installing = ops->install_pte;
	/* Descend only if something below this level wants to be called. */
	const bool descend = ops->pud_entry || ops->pmd_entry ||
			     ops->pte_entry || installing;
	const int depth = real_depth(1);
	unsigned long next;
	p4d_t *p4d;
	int ret = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (!installing) {
				if (ops->pte_hole)
					ret = ops->pte_hole(addr, next, depth,
							    walk);
				if (ret)
					break;
				continue;
			}
			ret = __pud_alloc(walk->mm, p4d, addr);
			if (ret)
				break;
		}

		if (ops->p4d_entry) {
			ret = ops->p4d_entry(p4d, addr, next, walk);
			if (ret)
				break;
		}

		if (descend) {
			ret = walk_pud_range(p4d, addr, next, walk);
			if (ret)
				break;
		}
	} while (p4d++, addr = next, addr != end);

	return ret;
}
287
walk_pgd_range(unsigned long addr,unsigned long end,struct mm_walk * walk)288 static int walk_pgd_range(unsigned long addr, unsigned long end,
289 struct mm_walk *walk)
290 {
291 pgd_t *pgd;
292 unsigned long next;
293 const struct mm_walk_ops *ops = walk->ops;
294 bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
295 ops->pte_entry;
296 bool has_install = ops->install_pte;
297 int err = 0;
298
299 if (walk->pgd)
300 pgd = walk->pgd + pgd_index(addr);
301 else
302 pgd = pgd_offset(walk->mm, addr);
303 do {
304 next = pgd_addr_end(addr, end);
305 if (pgd_none_or_clear_bad(pgd)) {
306 if (has_install)
307 err = __p4d_alloc(walk->mm, pgd, addr);
308 else if (ops->pte_hole)
309 err = ops->pte_hole(addr, next, 0, walk);
310 if (err)
311 break;
312 if (!has_install)
313 continue;
314 }
315 if (ops->pgd_entry) {
316 err = ops->pgd_entry(pgd, addr, next, walk);
317 if (err)
318 break;
319 }
320 if (has_handler || has_install)
321 err = walk_p4d_range(pgd, addr, next, walk);
322 if (err)
323 break;
324 } while (pgd++, addr = next, addr != end);
325
326 return err;
327 }
328
329 #ifdef CONFIG_HUGETLB_PAGE
/*
 * Return the end of the huge page (of hstate @h) containing @addr, clamped
 * so that it never exceeds @end.
 */
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long hpage_start = addr & huge_page_mask(h);
	unsigned long hpage_end = hpage_start + huge_page_size(h);

	return min(hpage_end, end);
}
337
walk_hugetlb_range(unsigned long addr,unsigned long end,struct mm_walk * walk)338 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
339 struct mm_walk *walk)
340 {
341 struct vm_area_struct *vma = walk->vma;
342 struct hstate *h = hstate_vma(vma);
343 unsigned long next;
344 unsigned long hmask = huge_page_mask(h);
345 unsigned long sz = huge_page_size(h);
346 pte_t *pte;
347 const struct mm_walk_ops *ops = walk->ops;
348 int err = 0;
349
350 hugetlb_vma_lock_read(vma);
351 do {
352 next = hugetlb_entry_end(h, addr, end);
353 pte = hugetlb_walk(vma, addr & hmask, sz);
354 if (pte)
355 err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
356 else if (ops->pte_hole)
357 err = ops->pte_hole(addr, next, -1, walk);
358 if (err)
359 break;
360 } while (addr = next, addr != end);
361 hugetlb_vma_unlock_read(vma);
362
363 return err;
364 }
365
366 #else /* CONFIG_HUGETLB_PAGE */
/*
 * Stub used when CONFIG_HUGETLB_PAGE is disabled: does nothing and reports
 * success.
 */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}
372
373 #endif /* CONFIG_HUGETLB_PAGE */
374
/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
walk_page_test(unsigned long start,unsigned long end,struct mm_walk * walk)381 static int walk_page_test(unsigned long start, unsigned long end,
382 struct mm_walk *walk)
383 {
384 struct vm_area_struct *vma = walk->vma;
385 const struct mm_walk_ops *ops = walk->ops;
386
387 if (ops->test_walk)
388 return ops->test_walk(start, end, walk);
389
390 /*
391 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
392 * range, so we don't walk over it as we do for normal vmas. However,
393 * Some callers are interested in handling hole range and they don't
394 * want to just ignore any single address range. Such users certainly
395 * define their ->pte_hole() callbacks, so let's delegate them to handle
396 * vma(VM_PFNMAP).
397 */
398 if (vma->vm_flags & VM_PFNMAP) {
399 int err = 1;
400 if (ops->pte_hole)
401 err = ops->pte_hole(start, end, -1, walk);
402 return err ? err : 1;
403 }
404 return 0;
405 }
406
__walk_page_range(unsigned long start,unsigned long end,struct mm_walk * walk)407 static int __walk_page_range(unsigned long start, unsigned long end,
408 struct mm_walk *walk)
409 {
410 int err = 0;
411 struct vm_area_struct *vma = walk->vma;
412 const struct mm_walk_ops *ops = walk->ops;
413 bool is_hugetlb = is_vm_hugetlb_page(vma);
414
415 /* We do not support hugetlb PTE installation. */
416 if (ops->install_pte && is_hugetlb)
417 return -EINVAL;
418
419 if (ops->pre_vma) {
420 err = ops->pre_vma(start, end, walk);
421 if (err)
422 return err;
423 }
424
425 if (is_hugetlb) {
426 if (ops->hugetlb_entry)
427 err = walk_hugetlb_range(start, end, walk);
428 } else
429 err = walk_pgd_range(start, end, walk);
430
431 if (ops->post_vma)
432 ops->post_vma(walk);
433
434 return err;
435 }
436
process_mm_walk_lock(struct mm_struct * mm,enum page_walk_lock walk_lock)437 static inline void process_mm_walk_lock(struct mm_struct *mm,
438 enum page_walk_lock walk_lock)
439 {
440 if (walk_lock == PGWALK_RDLOCK)
441 mmap_assert_locked(mm);
442 else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
443 mmap_assert_write_locked(mm);
444 }
445
/*
 * Apply the per-VMA part of @walk_lock to @vma: take the VMA write lock for
 * PGWALK_WRLOCK, or assert the expected lock state for the *_VERIFY modes.
 * Compiles to nothing without CONFIG_PER_VMA_LOCK.
 */
static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
	switch (walk_lock) {
	case PGWALK_RDLOCK:
		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
		break;
	case PGWALK_VMA_RDLOCK_VERIFY:
		vma_assert_locked(vma);
		break;
	case PGWALK_WRLOCK_VERIFY:
		vma_assert_write_locked(vma);
		break;
	case PGWALK_WRLOCK:
		vma_start_write(vma);
		break;
	}
#endif
}
466
/*
 * See the comment for walk_page_range(). This performs the heavy lifting of
 * the operation, but places no restrictions on how the walk proceeds.
 *
 * We usually restrict the ability to install PTEs, but this functionality is
 * available to internal memory management code and provided in mm/internal.h.
 */
walk_page_range_mm_unsafe(struct mm_struct * mm,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,void * private)474 int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
475 unsigned long end, const struct mm_walk_ops *ops,
476 void *private)
477 {
478 int err = 0;
479 unsigned long next;
480 struct vm_area_struct *vma;
481 struct mm_walk walk = {
482 .ops = ops,
483 .mm = mm,
484 .private = private,
485 };
486
487 if (start >= end)
488 return -EINVAL;
489
490 if (!walk.mm)
491 return -EINVAL;
492
493 process_mm_walk_lock(walk.mm, ops->walk_lock);
494
495 vma = find_vma(walk.mm, start);
496 do {
497 if (!vma) { /* after the last vma */
498 walk.vma = NULL;
499 next = end;
500 if (ops->pte_hole)
501 err = ops->pte_hole(start, next, -1, &walk);
502 } else if (start < vma->vm_start) { /* outside vma */
503 walk.vma = NULL;
504 next = min(end, vma->vm_start);
505 if (ops->pte_hole)
506 err = ops->pte_hole(start, next, -1, &walk);
507 } else { /* inside vma */
508 process_vma_walk_lock(vma, ops->walk_lock);
509 walk.vma = vma;
510 next = min(end, vma->vm_end);
511 vma = find_vma(mm, vma->vm_end);
512
513 err = walk_page_test(start, next, &walk);
514 if (err > 0) {
515 /*
516 * positive return values are purely for
517 * controlling the pagewalk, so should never
518 * be passed to the callers.
519 */
520 err = 0;
521 continue;
522 }
523 if (err < 0)
524 break;
525 err = __walk_page_range(start, next, &walk);
526 }
527 if (err)
528 break;
529 } while (start = next, start < end);
530 return err;
531 }
532
533 /*
534 * Determine if the walk operations specified are permitted to be used for a
535 * page table walk.
536 *
537 * This check is performed on all functions which are parameterised by walk
538 * operations and exposed in include/linux/pagewalk.h.
539 *
540 * Internal memory management code can use *_unsafe() functions to be able to
541 * use all page walking operations.
542 */
check_ops_safe(const struct mm_walk_ops * ops)543 static bool check_ops_safe(const struct mm_walk_ops *ops)
544 {
545 /*
546 * The installation of PTEs is solely under the control of memory
547 * management logic and subject to many subtle locking, security and
548 * cache considerations so we cannot permit other users to do so, and
549 * certainly not for exported symbols.
550 */
551 if (ops->install_pte)
552 return false;
553
554 return true;
555 }
556
557 /**
558 * walk_page_range - walk page table with caller specific callbacks
559 * @mm: mm_struct representing the target process of page table walk
560 * @start: start address of the virtual address range
561 * @end: end address of the virtual address range
562 * @ops: operation to call during the walk
563 * @private: private data for callbacks' usage
564 *
565 * Recursively walk the page table tree of the process represented by @mm
566 * within the virtual address range [@start, @end). During walking, we can do
567 * some caller-specific works for each entry, by setting up pmd_entry(),
568 * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
569 * callbacks, the associated entries/pages are just ignored.
570 * The return values of these callbacks are commonly defined like below:
571 *
572 * - 0 : succeeded to handle the current entry, and if you don't reach the
573 * end address yet, continue to walk.
574 * - >0 : succeeded to handle the current entry, and return to the caller
575 * with caller specific value.
576 * - <0 : failed to handle the current entry, and return to the caller
577 * with error code.
578 *
579 * Before starting to walk page table, some callers want to check whether
580 * they really want to walk over the current vma, typically by checking
581 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
582 * purpose.
583 *
584 * If operations need to be staged before and committed after a vma is walked,
585 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
586 * since it is intended to handle commit-type operations, can't return any
587 * errors.
588 *
589 * struct mm_walk keeps current values of some common data like vma and pmd,
590 * which are useful for the access from callbacks. If you want to pass some
591 * caller-specific data to callbacks, @private should be helpful.
592 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access vma data.
596 */
walk_page_range(struct mm_struct * mm,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,void * private)597 int walk_page_range(struct mm_struct *mm, unsigned long start,
598 unsigned long end, const struct mm_walk_ops *ops,
599 void *private)
600 {
601 if (!check_ops_safe(ops))
602 return -EINVAL;
603
604 return walk_page_range_mm_unsafe(mm, start, end, ops, private);
605 }
606
607 /**
608 * walk_kernel_page_table_range - walk a range of kernel pagetables.
609 * @start: start address of the virtual address range
610 * @end: end address of the virtual address range
611 * @ops: operation to call during the walk
612 * @pgd: pgd to walk if different from mm->pgd
613 * @private: private data for callbacks' usage
614 *
615 * Similar to walk_page_range() but can walk any page tables even if they are
616 * not backed by VMAs. Because 'unusual' entries may be walked this function
617 * will also not lock the PTEs for the pte_entry() callback. This is useful for
618 * walking kernel pages tables or page tables for firmware.
619 *
 * Note: Be careful when walking the kernel page tables: the caller may need
 * to take other effective approaches (mmap lock may be insufficient) to prevent
 * the intermediate kernel page tables belonging to the specified address range
 * from being freed (e.g. memory hot-remove).
624 */
int walk_kernel_page_table_range(unsigned long start, unsigned long end,
		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
{
	int ret;

	/*
	 * Kernel intermediate page tables are usually not freed, so the mmap
	 * read lock is sufficient. But there are some exceptions, e.g. memory
	 * hot-remove, in which case the mmap lock is insufficient to prevent
	 * the intermediate kernel page tables belonging to the specified
	 * address range from being freed. The caller should take other
	 * actions to prevent this race.
	 */
	mmap_assert_locked(&init_mm);

	/* With the lock asserted, the walk is the same as the lockless one. */
	ret = walk_kernel_page_table_range_lockless(start, end, ops, pgd,
						    private);
	return ret;
}
641
642 /*
643 * Use this function to walk the kernel page tables locklessly. It should be
644 * guaranteed that the caller has exclusive access over the range they are
645 * operating on - that there should be no concurrent access, for example,
646 * changing permissions for vmalloc objects.
647 */
int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
{
	/* A no_vma walk of init_mm, optionally rooted at a caller-given pgd. */
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= &init_mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	/* Reject empty ranges and ops not permitted for external users. */
	if (start >= end || !check_ops_safe(ops))
		return -EINVAL;

	return walk_pgd_range(start, end, &walk);
}
666
667 /**
668 * walk_page_range_debug - walk a range of pagetables not backed by a vma
669 * @mm: mm_struct representing the target process of page table walk
670 * @start: start address of the virtual address range
671 * @end: end address of the virtual address range
672 * @ops: operation to call during the walk
673 * @pgd: pgd to walk if different from mm->pgd
674 * @private: private data for callbacks' usage
675 *
676 * Similar to walk_page_range() but can walk any page tables even if they are
677 * not backed by VMAs. Because 'unusual' entries may be walked this function
678 * will also not lock the PTEs for the pte_entry() callback.
679 *
680 * This is for debugging purposes ONLY.
681 */
int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd, void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	/* For convenience, we allow traversal of kernel mappings. */
	if (mm == &init_mm)
		return walk_kernel_page_table_range(start, end, ops,
						    pgd, private);

	/* Reject empty ranges, a missing mm and ops that are not permitted. */
	if (start >= end || !walk.mm || !check_ops_safe(ops))
		return -EINVAL;

	/*
	 * The mmap lock protects the page walker from changes to the page
	 * tables during the walk. However a read lock is insufficient to
	 * protect those areas which don't have a VMA as munmap() detaches
	 * the VMAs before downgrading to a read lock and actually tearing
	 * down PTEs/page tables. In which case, the mmap write lock should
	 * be held.
	 */
	mmap_assert_write_locked(mm);

	return walk_pgd_range(start, end, &walk);
}
715
walk_page_range_vma_unsafe(struct vm_area_struct * vma,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,void * private)716 int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
717 unsigned long end, const struct mm_walk_ops *ops, void *private)
718 {
719 struct mm_walk walk = {
720 .ops = ops,
721 .mm = vma->vm_mm,
722 .vma = vma,
723 .private = private,
724 };
725
726 if (start >= end || !walk.mm)
727 return -EINVAL;
728 if (start < vma->vm_start || end > vma->vm_end)
729 return -EINVAL;
730
731 process_mm_walk_lock(walk.mm, ops->walk_lock);
732 process_vma_walk_lock(vma, ops->walk_lock);
733 return __walk_page_range(start, end, &walk);
734 }
735
walk_page_range_vma(struct vm_area_struct * vma,unsigned long start,unsigned long end,const struct mm_walk_ops * ops,void * private)736 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
737 unsigned long end, const struct mm_walk_ops *ops,
738 void *private)
739 {
740 if (!check_ops_safe(ops))
741 return -EINVAL;
742
743 return walk_page_range_vma_unsafe(vma, start, end, ops, private);
744 }
745
walk_page_vma(struct vm_area_struct * vma,const struct mm_walk_ops * ops,void * private)746 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
747 void *private)
748 {
749 struct mm_walk walk = {
750 .ops = ops,
751 .mm = vma->vm_mm,
752 .vma = vma,
753 .private = private,
754 };
755
756 if (!walk.mm)
757 return -EINVAL;
758 if (!check_ops_safe(ops))
759 return -EINVAL;
760
761 process_mm_walk_lock(walk.mm, ops->walk_lock);
762 process_vma_walk_lock(vma, ops->walk_lock);
763 return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
764 }
765
766 /**
767 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
768 * @mapping: Pointer to the struct address_space
769 * @first_index: First page offset in the address_space
770 * @nr: Number of incremental page offsets to cover
771 * @ops: operation to call during the walk
772 * @private: private data for callbacks' usage
773 *
774 * This function walks all memory areas mapped into a struct address_space.
775 * The walk is limited to only the given page-size index range, but if
776 * the index boundaries cross a huge page-table entry, that entry will be
777 * included.
778 *
779 * Also see walk_page_range() for additional information.
780 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
787 *
788 * Also this means that a caller can't rely on the struct
789 * vm_area_struct::vm_flags to be constant across a call,
790 * except for immutable flags. Callers requiring this shouldn't use
791 * this function.
792 *
793 * Return: 0 on success, negative error code on failure, positive number on
794 * caller defined premature termination.
795 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	if (!check_ops_safe(ops))
		return -EINVAL;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip the requested index range to the vma. */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		/* Translate the clipped page offsets into user addresses. */
		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			/*
			 * A positive value only asks for this vma to be
			 * skipped: it must neither end the walk over the
			 * remaining vmas nor be passed to the caller.
			 */
			err = 0;
			continue;
		}
		if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
845
846 /**
847 * folio_walk_start - walk the page tables to a folio
848 * @fw: filled with information on success.
849 * @vma: the VMA.
850 * @addr: the virtual address to use for the page table walk.
851 * @flags: flags modifying which folios to walk to.
852 *
853 * Walk the page tables using @addr in a given @vma to a mapped folio and
854 * return the folio, making sure that the page table entry referenced by
855 * @addr cannot change until folio_walk_end() was called.
856 *
857 * As default, this function returns only folios that are not special (e.g., not
858 * the zeropage) and never returns folios that are supposed to be ignored by the
859 * VM as documented by vm_normal_page(). If requested, zeropages will be
860 * returned as well.
861 *
862 * If this function returns NULL it might either indicate "there is nothing" or
863 * "there is nothing suitable".
864 *
865 * On success, @fw is filled and the function returns the folio while the PTL
866 * is still held and folio_walk_end() must be called to clean up,
867 * releasing any held locks. The returned folio must *not* be used after the
868 * call to folio_walk_end(), unless a short-term folio reference is taken before
869 * that call.
870 *
871 * @fw->page will correspond to the page that is effectively referenced by
872 * @addr. However, for shared zeropages @fw->page is set to NULL. Note that
873 * large folios might be mapped by multiple page table entries, and this
874 * function will always only lookup a single entry as specified by @addr, which
875 * might or might not cover more than a single page of the returned folio.
876 *
877 * This function must *not* be used as a naive replacement for
878 * get_user_pages() / pin_user_pages(), especially not to perform DMA or
879 * to carelessly modify page content. This function may *only* be used to grab
880 * short-term folio references, never to grab long-term folio references.
881 *
882 * Using the page table entry pointers in @fw for reading or modifying the
883 * entry should be avoided where possible: however, there might be valid
884 * use cases.
885 *
886 * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
887 * For example, PMD page table sharing might require prior unsharing. Also,
888 * logical hugetlb entries might span multiple physical page table entries,
889 * which *must* be modified in a single operation (set_huge_pte_at(),
890 * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
891 * not correspond to the first physical entry of a logical hugetlb entry.
892 *
893 * The mmap lock must be held in read mode.
894 *
895 * Return: folio pointer on success, otherwise NULL.
896 */
struct folio *folio_walk_start(struct folio_walk *fw,
		struct vm_area_struct *vma, unsigned long addr,
		folio_walk_flags_t flags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long entry_size;
	bool zeropage = false;
	struct page *page;
	spinlock_t *ptl;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;

	mmap_assert_locked(mm);
	vma_pgtable_walk_begin(vma);

	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
		goto not_found;

	pgdp = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgdp))
		goto not_found;

	p4dp = p4d_offset(pgdp, addr);
	if (p4d_none_or_clear_bad(p4dp))
		goto not_found;

	pudp = pud_offset(p4dp, addr);
	pud = pudp_get(pudp);
	if (pud_none(pud))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pud_present(pud) || pud_leaf(pud))) {
		/* Re-read the entry now that the PUD lock is held. */
		ptl = pud_lock(mm, pudp);
		pud = pudp_get(pudp);

		entry_size = PUD_SIZE;
		fw->level = FW_LEVEL_PUD;
		fw->pudp = pudp;
		fw->pud = pud;

		if (!pud_none(pud) && pud_present(pud)) {
			if (!pud_leaf(pud)) {
				/* It is a table after all: walk the PMDs. */
				spin_unlock(ptl);
				goto pmd_table;
			}
			page = vm_normal_page_pud(vma, addr, pud);
			if (page)
				goto found;
		}
		spin_unlock(ptl);
		goto not_found;
	}

pmd_table:
	VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
	pmdp = pmd_offset(pudp, addr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pmd_present(pmd) || pmd_leaf(pmd))) {
		/* Re-read the entry now that the PMD lock is held. */
		ptl = pmd_lock(mm, pmdp);
		pmd = pmdp_get(pmdp);

		entry_size = PMD_SIZE;
		fw->level = FW_LEVEL_PMD;
		fw->pmdp = pmdp;
		fw->pmd = pmd;

		if (!pmd_none(pmd) && pmd_present(pmd)) {
			if (!pmd_leaf(pmd)) {
				/* It is a table after all: walk the PTEs. */
				spin_unlock(ptl);
				goto pte_table;
			}
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (page)
				goto found;
			if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) {
				page = pfn_to_page(pmd_pfn(pmd));
				zeropage = true;
				goto found;
			}
		}
		spin_unlock(ptl);
		goto not_found;
	}

pte_table:
	VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto not_found;
	pte = ptep_get(ptep);

	entry_size = PAGE_SIZE;
	fw->level = FW_LEVEL_PTE;
	fw->ptep = ptep;
	fw->pte = pte;

	if (pte_present(pte)) {
		page = vm_normal_page(vma, addr, pte);
		if (page)
			goto found;
		if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) {
			page = pfn_to_page(pte_pfn(pte));
			zeropage = true;
			goto found;
		}
	}
	pte_unmap_unlock(ptep, ptl);
not_found:
	vma_pgtable_walk_end(vma);
	return NULL;
found:
	/*
	 * Return with the page table lock still held; it is recorded in
	 * fw->ptl. For non-zeropage results fw->page is offset from the
	 * mapped page, not from the folio start.
	 */
	fw->page = zeropage ? NULL :
		   page + ((addr & (entry_size - 1)) >> PAGE_SHIFT);
	fw->ptl = ptl;
	return page_folio(page);
}
1027