/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEWALK_H
#define _LINUX_PAGEWALK_H

#include <linux/mm.h>

struct mm_walk;

/* Locking requirement during a page walk. */
enum page_walk_lock {
        /* mmap_lock should be locked for read to stabilize the vma tree */
        PGWALK_RDLOCK = 0,
        /* vma will be write-locked during the walk */
        PGWALK_WRLOCK = 1,
        /* vma is expected to be already write-locked during the walk */
        PGWALK_WRLOCK_VERIFY = 2,
        /* vma is expected to be already read-locked during the walk */
        PGWALK_VMA_RDLOCK_VERIFY = 3,
};

/**
 * struct mm_walk_ops - callbacks for walk_page_range
 * @pgd_entry:      if set, called for each non-empty PGD (top-level) entry
 * @p4d_entry:      if set, called for each non-empty P4D entry
 * @pud_entry:      if set, called for each non-empty PUD entry
 * @pmd_entry:      if set, called for each non-empty PMD entry
 *                  this handler is required to be able to handle
 *                  pmd_trans_huge() pmds. They may simply choose to
 *                  split_huge_page() instead of handling it explicitly.
 * @pte_entry:      if set, called for each PTE (lowest-level) entry
 *                  including empty ones, except if @install_pte is set.
 *                  If @install_pte is set, @pte_entry is called only for
 *                  existing PTEs.
 * @pte_hole:       if set, called for each hole at all levels,
 *                  depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
 *                  Any folded depths (where PTRS_PER_P?D is equal to 1)
 *                  are skipped. If @install_pte is specified, this will
 *                  not trigger for any populated ranges.
 * @hugetlb_entry:  if set, called for each hugetlb entry. This hook
 *                  function is called with the vma lock held, in order to
 *                  protect against a concurrent freeing of the pte_t* or
 *                  the ptl. In some cases, the hook function needs to drop
 *                  and retake the vma lock in order to avoid deadlocks
 *                  while calling other functions. In such cases the hook
 *                  function must either refrain from accessing the pte or
 *                  ptl after dropping the vma lock, or else revalidate
 *                  those items after re-acquiring the vma lock and before
 *                  accessing them.
 * @test_walk:      caller specific callback function to determine whether
 *                  we walk over the current vma or not. Returning 0 means
 *                  "do page table walk over the current vma", returning
 *                  a negative value means "abort current page table walk
 *                  right now" and returning 1 means "skip the current vma".
 *                  Note that this callback is not called when the caller
 *                  passes in a single VMA as for walk_page_vma().
 * @pre_vma:        if set, called before starting walk on a non-null vma.
 * @post_vma:       if set, called after a walk on a non-null vma, provided
 *                  that @pre_vma and the vma walk succeeded.
 * @install_pte:    if set, missing page table entries are installed and
 *                  thus all levels are always walked in the specified
 *                  range. This callback is then invoked at the PTE level
 *                  (having split any THP pages prior), providing the PTE to
 *                  install. If allocations fail, the walk is aborted. This
 *                  operation is only available for userland memory. Not
 *                  usable for hugetlb ranges.
 *
 * p?d_entry callbacks are called even if those levels are folded on a
 * particular architecture/configuration.
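 *
 * Example (an illustrative sketch only, not part of this API; the
 * count_pte_entry and count_ops names, and the counter passed via the walk's
 * private pointer, are hypothetical). It counts present PTEs in a range and
 * continues the walk by returning 0 from the callback; a non-zero return
 * value would abort the walk:
 *
 *      static int count_pte_entry(pte_t *pte, unsigned long addr,
 *                                 unsigned long next, struct mm_walk *walk)
 *      {
 *              unsigned long *nr_present = walk->private;
 *
 *              if (pte_present(ptep_get(pte)))
 *                      (*nr_present)++;
 *              return 0;
 *      }
 *
 *      static const struct mm_walk_ops count_ops = {
 *              .pte_entry = count_pte_entry,
 *              .walk_lock = PGWALK_RDLOCK,
 *      };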
 */
struct mm_walk_ops {
        int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
        int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
        int (*pud_entry)(pud_t *pud, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
        int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
        int (*pte_entry)(pte_t *pte, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
        int (*pte_hole)(unsigned long addr, unsigned long next,
                        int depth, struct mm_walk *walk);
        int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
                             unsigned long addr, unsigned long next,
                             struct mm_walk *walk);
        int (*test_walk)(unsigned long addr, unsigned long next,
                         struct mm_walk *walk);
        int (*pre_vma)(unsigned long start, unsigned long end,
                       struct mm_walk *walk);
        void (*post_vma)(struct mm_walk *walk);
        int (*install_pte)(unsigned long addr, unsigned long next,
                           pte_t *ptep, struct mm_walk *walk);
        enum page_walk_lock walk_lock;
};

/*
 * Action for pud_entry / pmd_entry callbacks.
 * ACTION_SUBTREE is the default
 */
enum page_walk_action {
        /* Descend to next level, splitting huge pages if needed and possible */
        ACTION_SUBTREE = 0,
        /* Continue to next entry at this level (ignoring any subtree) */
        ACTION_CONTINUE = 1,
        /* Call again for this entry */
        ACTION_AGAIN = 2
};

/**
 * struct mm_walk - walk_page_range data
 * @ops:      operation to call during the walk
 * @mm:       mm_struct representing the target process of page table walk
 * @pgd:      pointer to PGD; only valid with no_vma (otherwise set to NULL)
 * @vma:      vma currently walked (NULL if walking outside vmas)
 * @action:   next action to perform (see enum page_walk_action)
 * @no_vma:   walk ignoring vmas (vma will always be NULL)
 * @private:  private data for callbacks' usage
 *
 * (see the comment on walk_page_range() for more details)
 */
struct mm_walk {
        const struct mm_walk_ops *ops;
        struct mm_struct *mm;
        pgd_t *pgd;
        struct vm_area_struct *vma;
        enum page_walk_action action;
        bool no_vma;
        void *private;
};

int walk_page_range(struct mm_struct *mm, unsigned long start,
                    unsigned long end, const struct mm_walk_ops *ops,
                    void *private);
int walk_kernel_page_table_range(unsigned long start,
                                 unsigned long end, const struct mm_walk_ops *ops,
                                 pgd_t *pgd, void *private);
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, const struct mm_walk_ops *ops,
                        void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                  void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
                      pgoff_t nr, const struct mm_walk_ops *ops,
                      void *private);
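
/*
 * Illustrative usage sketch (not part of this header): a caller is expected
 * to hold the mmap lock in the mode selected by ops->walk_lock. The function
 * below is hypothetical and builds on the count_ops example given in the
 * struct mm_walk_ops documentation above.
 *
 *      static unsigned long count_present_ptes(struct mm_struct *mm,
 *                                              unsigned long start,
 *                                              unsigned long end)
 *      {
 *              unsigned long nr_present = 0;
 *
 *              mmap_read_lock(mm);
 *              walk_page_range(mm, start, end, &count_ops, &nr_present);
 *              mmap_read_unlock(mm);
 *              return nr_present;
 *      }
 */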

typedef int __bitwise folio_walk_flags_t;

/*
 * Walk migration entries as well. Careful: a large folio might get split
 * concurrently.
 */
#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0))

/* Walk shared zeropages (small + huge) as well. */
#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1))

enum folio_walk_level {
        FW_LEVEL_PTE,
        FW_LEVEL_PMD,
        FW_LEVEL_PUD,
};

/**
 * struct folio_walk - folio_walk_start() / folio_walk_end() data
 * @page:   exact folio page referenced (if applicable)
 * @level:  page table level identifying the entry type
 * @ptep:   pointer to the page table entry (FW_LEVEL_PTE).
 * @pmdp:   pointer to the page table entry (FW_LEVEL_PMD).
 * @pudp:   pointer to the page table entry (FW_LEVEL_PUD).
 * @pte:    value of the page table entry (FW_LEVEL_PTE).
 * @pmd:    value of the page table entry (FW_LEVEL_PMD).
 * @pud:    value of the page table entry (FW_LEVEL_PUD).
 * @ptl:    pointer to the page table lock.
 *
 * (see folio_walk_start() documentation for more details)
 */
struct folio_walk {
        /* public */
        struct page *page;
        enum folio_walk_level level;
        union {
                pte_t *ptep;
                pud_t *pudp;
                pmd_t *pmdp;
        };
        union {
                pte_t pte;
                pud_t pud;
                pmd_t pmd;
        };
        /* private */
        struct vm_area_struct *vma;
        spinlock_t *ptl;
};

struct folio *folio_walk_start(struct folio_walk *fw,
                struct vm_area_struct *vma, unsigned long addr,
                folio_walk_flags_t flags);

#define folio_walk_end(__fw, __vma) do { \
        spin_unlock((__fw)->ptl); \
        if (likely((__fw)->level == FW_LEVEL_PTE)) \
                pte_unmap((__fw)->ptep); \
        vma_pgtable_walk_end(__vma); \
} while (0)

#endif /* _LINUX_PAGEWALK_H */