/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEWALK_H
#define _LINUX_PAGEWALK_H

#include <linux/mm.h>

struct mm_walk;

/* Locking requirement during a page walk. */
enum page_walk_lock {
	/* mmap_lock should be locked for read to stabilize the vma tree */
	PGWALK_RDLOCK = 0,
	/* vma will be write-locked during the walk */
	PGWALK_WRLOCK = 1,
	/* vma is expected to be already write-locked during the walk */
	PGWALK_WRLOCK_VERIFY = 2,
	/* vma is expected to be already read-locked during the walk */
	PGWALK_VMA_RDLOCK_VERIFY = 3,
};

/**
 * struct mm_walk_ops - callbacks for walk_page_range
 * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
 * @p4d_entry:		if set, called for each non-empty P4D entry
 * @pud_entry:		if set, called for each non-empty PUD entry
 * @pmd_entry:		if set, called for each non-empty PMD entry.
 *			This handler is required to be able to handle
 *			pmd_trans_huge() pmds. It may simply choose to
 *			split_huge_page() instead of handling them explicitly.
 * @pte_entry:		if set, called for each PTE (lowest-level) entry,
 *			including empty ones, unless @install_pte is set.
 *			If @install_pte is set, @pte_entry is called only for
 *			existing PTEs.
 * @pte_hole:		if set, called for each hole at all levels,
 *			depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
 *			Any folded depths (where PTRS_PER_P?D is equal to 1)
 *			are skipped. If @install_pte is specified, this will
 *			not trigger for any populated ranges.
 * @hugetlb_entry:	if set, called for each hugetlb entry. This hook
 *			function is called with the vma lock held, in order to
 *			protect against a concurrent freeing of the pte_t* or
 *			the ptl. In some cases, the hook function needs to drop
 *			and retake the vma lock in order to avoid deadlocks
 *			while calling other functions. In such cases the hook
 *			function must either refrain from accessing the pte or
 *			ptl after dropping the vma lock, or else revalidate
 *			those items after re-acquiring the vma lock and before
 *			accessing them.
 * @test_walk:		caller specific callback function to determine whether
 *			we walk over the current vma or not. Returning 0 means
 *			"do page table walk over the current vma", returning
 *			a negative value means "abort current page table walk
 *			right now" and returning 1 means "skip the current vma".
 *			Note that this callback is not called when the caller
 *			passes in a single VMA as for walk_page_vma().
 * @pre_vma:		if set, called before starting walk on a non-null vma.
 * @post_vma:		if set, called after a walk on a non-null vma, provided
 *			that @pre_vma and the vma walk succeeded.
 * @install_pte:	if set, missing page table entries are installed and
 *			thus all levels are always walked in the specified
 *			range. This callback is then invoked at the PTE level
 *			(having split any THP pages prior), providing the PTE to
 *			install. If allocations fail, the walk is aborted. This
 *			operation is only available for userland memory. Not
 *			usable for hugetlb ranges.
 *
 * p?d_entry callbacks are called even if those levels are folded on a
 * particular architecture/configuration.
 */
struct mm_walk_ops {
	int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pud_entry)(pud_t *pud, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pte_entry)(pte_t *pte, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pte_hole)(unsigned long addr, unsigned long next,
			int depth, struct mm_walk *walk);
	int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
			     unsigned long addr, unsigned long next,
			     struct mm_walk *walk);
	int (*test_walk)(unsigned long addr, unsigned long next,
			struct mm_walk *walk);
	int (*pre_vma)(unsigned long start, unsigned long end,
		       struct mm_walk *walk);
	void (*post_vma)(struct mm_walk *walk);
	int (*install_pte)(unsigned long addr, unsigned long next,
			   pte_t *ptep, struct mm_walk *walk);
	enum page_walk_lock walk_lock;
};
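
/*
 * Example (illustrative sketch only, not part of this header): a
 * minimal walker that counts present PTEs in a range, passing the
 * counter through @private. The names count_pte and count_ops are
 * made up for this example.
 *
 *	static int count_pte(pte_t *pte, unsigned long addr,
 *			     unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(ptep_get(pte)))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte,
 *		.walk_lock	= PGWALK_RDLOCK,
 *	};
 */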

/*
 * Action for pud_entry / pmd_entry callbacks.
 * ACTION_SUBTREE is the default
 */
enum page_walk_action {
	/* Descend to next level, splitting huge pages if needed and possible */
	ACTION_SUBTREE = 0,
	/* Continue to next entry at this level (ignoring any subtree) */
	ACTION_CONTINUE = 1,
	/* Call again for this entry */
	ACTION_AGAIN = 2
};
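
/*
 * Example (illustrative sketch only): a pmd_entry callback that counts
 * huge PMDs itself and sets ACTION_CONTINUE so the core walk neither
 * splits the PMD nor descends to the PTE level. The function name is
 * made up for this example; real callbacks typically take the PMD lock
 * (e.g. via pmd_trans_huge_lock()) before acting on a huge PMD.
 *
 *	static int sketch_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				    unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *huge_count = walk->private;
 *
 *		if (pmd_trans_huge(pmdp_get(pmd))) {
 *			(*huge_count)++;
 *			walk->action = ACTION_CONTINUE;
 *		}
 *		return 0;
 *	}
 */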

/**
 * struct mm_walk - walk_page_range data
 * @ops:	operation to call during the walk
 * @mm:		mm_struct representing the target process of page table walk
 * @pgd:	pointer to PGD; only valid with no_vma (otherwise set to NULL)
 * @vma:	vma currently walked (NULL if walking outside vmas)
 * @action:	next action to perform (see enum page_walk_action)
 * @no_vma:	walk ignoring vmas (vma will always be NULL)
 * @private:	private data for callbacks' usage
 *
 * (see the comment on walk_page_range() for more details)
 */
struct mm_walk {
	const struct mm_walk_ops *ops;
	struct mm_struct *mm;
	pgd_t *pgd;
	struct vm_area_struct *vma;
	enum page_walk_action action;
	bool no_vma;
	void *private;
};
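
/*
 * Example (illustrative sketch only): a test_walk callback that skips
 * every VMA that is not anonymous, using the current VMA exposed via
 * struct mm_walk. The function name and the policy are made up for
 * this example.
 *
 *	static int sketch_test_walk(unsigned long addr, unsigned long next,
 *				    struct mm_walk *walk)
 *	{
 *		return vma_is_anonymous(walk->vma) ? 0 : 1;
 *	}
 */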

int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private);
int walk_kernel_page_table_range(unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		pgd_t *pgd, void *private);
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private);
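
/*
 * Example (illustrative sketch only): invoking walk_page_range() with
 * the count_ops sketched above. With .walk_lock == PGWALK_RDLOCK, the
 * caller must hold mmap_lock for read across the walk.
 *
 *	unsigned long count = 0;
 *	int err;
 *
 *	mmap_read_lock(mm);
 *	err = walk_page_range(mm, start, end, &count_ops, &count);
 *	mmap_read_unlock(mm);
 */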

typedef int __bitwise folio_walk_flags_t;

/*
 * Walk migration entries as well. Careful: a large folio might get split
 * concurrently.
 */
#define FW_MIGRATION			((__force folio_walk_flags_t)BIT(0))

/* Walk shared zeropages (small + huge) as well. */
#define FW_ZEROPAGE			((__force folio_walk_flags_t)BIT(1))

enum folio_walk_level {
	FW_LEVEL_PTE,
	FW_LEVEL_PMD,
	FW_LEVEL_PUD,
};

/**
 * struct folio_walk - folio_walk_start() / folio_walk_end() data
 * @page:	exact folio page referenced (if applicable)
 * @level:	page table level identifying the entry type
 * @ptep:	pointer to the page table entry (FW_LEVEL_PTE).
 * @pmdp:	pointer to the page table entry (FW_LEVEL_PMD).
 * @pudp:	pointer to the page table entry (FW_LEVEL_PUD).
 * @pte:	value of the page table entry (FW_LEVEL_PTE).
 * @pmd:	value of the page table entry (FW_LEVEL_PMD).
 * @pud:	value of the page table entry (FW_LEVEL_PUD).
 * @ptl:	pointer to the page table lock.
 *
 * (see folio_walk_start() documentation for more details)
 */
struct folio_walk {
	/* public */
	struct page *page;
	enum folio_walk_level level;
	union {
		pte_t *ptep;
		pud_t *pudp;
		pmd_t *pmdp;
	};
	union {
		pte_t pte;
		pud_t pud;
		pmd_t pmd;
	};
	/* private */
	struct vm_area_struct *vma;
	spinlock_t *ptl;
};

struct folio *folio_walk_start(struct folio_walk *fw,
		struct vm_area_struct *vma, unsigned long addr,
		folio_walk_flags_t flags);

#define folio_walk_end(__fw, __vma) do { \
	spin_unlock((__fw)->ptl); \
	if (likely((__fw)->level == FW_LEVEL_PTE)) \
		pte_unmap((__fw)->ptep); \
	vma_pgtable_walk_end(__vma); \
} while (0)
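
/*
 * Example (illustrative sketch only): looking up the folio mapped at
 * @addr in @vma. folio_walk_start() returns with the page table lock
 * held on success; folio_walk_end() drops it again. The surrounding
 * code is made up for this example and assumes the caller already
 * holds mmap_lock (or the VMA lock).
 *
 *	struct folio_walk fw;
 *	struct folio *folio;
 *	bool anon = false;
 *
 *	folio = folio_walk_start(&fw, vma, addr, 0);
 *	if (folio) {
 *		anon = folio_test_anon(folio);
 *		folio_walk_end(&fw, vma);
 *	}
 */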

#endif /* _LINUX_PAGEWALK_H */