/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for clearing up the
	 * anon_vma if they are the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equal to the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used for making decisions about reusing an anon_vma
	 * instead of forking a new one. See comments in function anon_vma_clone.
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};
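
/*
 * Illustrative sketch (not a definitive recipe): a reverse-map walker
 * typically takes the anon_vma's rwsem for reading and then iterates the
 * interval tree of related VMAs. anon_vma_lock_read()/anon_vma_unlock_read()
 * and anon_vma_interval_tree_foreach() are existing rmap helpers; the
 * pgoff_start/pgoff_end variables and the loop body are assumed here only
 * for illustration.
 *
 *	struct anon_vma_chain *avc;
 *
 *	anon_vma_lock_read(anon_vma);
 *	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
 *				       pgoff_start, pgoff_end) {
 *		struct vm_area_struct *vma = avc->vma;
 *
 *		// operate on the folio's mapping within this vma
 *	}
 *	anon_vma_unlock_read(anon_vma);
 */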

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};

enum ttu_flags {
	TTU_USE_SHARED_ZEROPAGE	= 0x2,	/* for unused pages of large folios */
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};
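
/*
 * Illustrative sketch: callers combine these flags with bitwise OR. A
 * reclaim-style unmap of a locked folio that ignores mlock and batches its
 * TLB flushes could look roughly like this (the surrounding reclaim logic
 * is assumed and not shown):
 *
 *	enum ttu_flags flags = TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH;
 *
 *	if (folio_test_pmd_mappable(folio))
 *		flags |= TTU_SPLIT_HUGE_PMD;
 *	try_to_unmap(folio, flags);
 *	if (folio_mapped(folio))
 *		// could not be fully unmapped; keep the folio
 */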

#ifdef CONFIG_MMU

void anon_vma_init(void);	/* create anon_vma_cachep */

#ifdef CONFIG_MM_ID
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
{
	bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
{
	__bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	return folio->_mm_id[idx] & MM_ID_MASK;
}

static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	folio->_mm_id[idx] &= ~MM_ID_MASK;
	folio->_mm_id[idx] |= id;
}

static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
		int diff, mm_id_t mm_id)
{
	VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
	VM_WARN_ON_ONCE(diff <= 0);
	VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);

	/*
	 * Make sure we can detect at least one complete PTE mapping of the
	 * folio in a single MM as "exclusively mapped". This is primarily
	 * a check on 32bit, where we currently reduce the size of the per-MM
	 * mapcount to a short.
	 */
	VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
	VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] < 0);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] < 0);
	VM_WARN_ON_ONCE(!folio_mapped(folio) &&
			test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
}

static __always_inline void folio_set_large_mapcount(struct folio *folio,
		int mapcount, struct vm_area_struct *vma)
{
	__folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);

	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
	folio->_mm_id_mapcount[0] = mapcount - 1;
	folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
}

static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * If a folio is mapped more than once into an MM on 32bit, we
	 * can in theory overflow the per-MM mapcount (although only for
	 * fairly large folios), turning it negative. In that case, just
	 * free up the slot and mark the folio "mapped shared", otherwise
	 * we might be in trouble when unmapping pages later.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
			folio->_mm_id_mapcount[0] = -1;
			folio_set_mm_id(folio, 0, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
			folio->_mm_id_mapcount[1] = -1;
			folio_set_mm_id(folio, 1, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 0, mm_id);
		folio->_mm_id_mapcount[0] = diff - 1;
		/* We might have other mappings already. */
		if (new_mapcount_val != diff - 1)
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	} else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 1, mm_id);
		folio->_mm_id_mapcount[1] = diff - 1;
		/* Slot 0 certainly has mappings as well. */
		folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	}
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_add_large_mapcount folio_add_return_large_mapcount
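
/*
 * Worked example (illustrative numbers only): consider a 4-page folio freshly
 * faulted in and PTE-mapped once per page in process A, then additionally
 * PTE-mapped in process B after fork():
 *
 *	folio_set_large_mapcount(folio, 4, vma_A);
 *		_large_mapcount == 3 (mapcounts start at -1),
 *		slot 0 = A's mm_id, _mm_id_mapcount[0] == 3, not shared.
 *	folio_add_return_large_mapcount(folio, 4, vma_B) returns 8;
 *		_large_mapcount == 7, slot 1 = B's mm_id,
 *		_mm_id_mapcount[1] == 3, FOLIO_MM_IDS_SHARED_BIT is set.
 *
 * When one side later unmaps all four pages, the corresponding slot is
 * released again and, because the remaining slot then owns all mappings,
 * the folio goes back to being "exclusively mapped".
 */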

static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * There are valid corner cases where we might underflow a per-MM
	 * mapcount (some mappings added when no slot was free, some mappings
	 * added once a slot was free), so we always set it to -1 once we go
	 * negative.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] -= diff;
		if (folio->_mm_id_mapcount[0] >= 0)
			goto out;
		folio->_mm_id_mapcount[0] = -1;
		folio_set_mm_id(folio, 0, MM_ID_DUMMY);
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] -= diff;
		if (folio->_mm_id_mapcount[1] >= 0)
			goto out;
		folio->_mm_id_mapcount[1] = -1;
		folio_set_mm_id(folio, 1, MM_ID_DUMMY);
	}

	/*
	 * If one MM slot owns all mappings, the folio is mapped exclusively.
	 * Note that if the folio is now unmapped (new_mapcount_val == -1), both
	 * slots must be free (mapcount == -1), and we'll also mark it as
	 * exclusive.
	 */
	if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
	    folio->_mm_id_mapcount[1] == new_mapcount_val)
		folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
out:
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_sub_large_mapcount folio_sub_return_large_mapcount
#else /* !CONFIG_MM_ID */
/*
 * See __folio_rmap_sanity_checks(), we might map large folios even without
 * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
 */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
		struct vm_area_struct *vma)
{
	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
}

static inline void folio_add_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_add(diff, &folio->_large_mapcount);
}

static inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}

static inline void folio_sub_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_sub(diff, &folio->_large_mapcount);
}

static inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}
#endif /* CONFIG_MM_ID */

#define folio_inc_large_mapcount(folio, vma) \
	folio_add_large_mapcount(folio, 1, vma)
#define folio_inc_return_large_mapcount(folio, vma) \
	folio_add_return_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
	folio_sub_large_mapcount(folio, 1, vma)
#define folio_dec_return_large_mapcount(folio, vma) \
	folio_sub_return_large_mapcount(folio, 1, vma)

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
		const struct page *page, int nr_pages, enum pgtable_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/* When (un)mapping zeropages, we should never touch ref+mapcount. */
	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		break;
	case PGTABLE_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	case PGTABLE_LEVEL_PUD:
		/*
		 * Assume that we are creating a single "entire" mapping of the
		 * folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
		break;
	default:
		BUILD_BUG();
	}

	/*
	 * Anon folios must have an associated live anon_vma as long as they're
	 * mapped into userspace.
	 * Note that the atomic_read() mainly does two things:
	 *
	 * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
	 *    check that the associated anon_vma has not yet been freed (subject
	 *    to KASAN's usual limitations). This check will pass if the
	 *    anon_vma's refcount has already dropped to 0 but an RCU grace
	 *    period hasn't passed since then.
	 * 2. If the anon_vma has not yet been freed, it checks that the
	 *    anon_vma still has a nonzero refcount (as opposed to being in the
	 *    middle of an RCU delay for getting freed).
	 */
	if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
		unsigned long mapping = (unsigned long)folio->mapping;
		struct anon_vma *anon_vma;

		anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
		VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);
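
/*
 * Illustrative sketch: a fault instantiating a fresh anonymous page typically
 * pairs one of the "add" calls above with setting the PTE, roughly like the
 * simplified anonymous fault path below. Error handling and the surrounding
 * fault plumbing are omitted; the helpers used here live in other headers and
 * only the rmap call is the point of the example.
 *
 *	folio = vma_alloc_zeroed_movable_folio(vma, addr);
 *	__folio_mark_uptodate(folio);
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *	folio_add_lru_vma(folio, vma);
 *	set_pte_at(vma->vm_mm, addr, pte,
 *		   mk_pte(&folio->page, vma->vm_page_prot));
 */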

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);

/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
	atomic_dec(&folio->_large_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		enum pgtable_level level)
{
	const int orig_nr_pages = nr_pages;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			atomic_inc(&folio->_mapcount);
			break;
		}

		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
			do {
				atomic_inc(&page->_mapcount);
			} while (page++, --nr_pages > 0);
		}
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	default:
		BUILD_BUG();
	}
}
/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio: The folio to duplicate the mappings of
 * @page: The first page to duplicate the mappings of
 * @nr_pages: The number of pages of which the mapping will be duplicated
 * @dst_vma: The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
}

static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio: The folio to duplicate the mapping of
 * @page: The first page to duplicate the mapping of
 * @dst_vma: The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}

static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, enum pgtable_level level)
{
	const int orig_nr_pages = nr_pages;
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process,
	 * don't allow duplicating the mappings but instead require, e.g.,
	 * copying the subpage immediately for the child so that we'll always
	 * guarantee the pinned folio won't be randomly replaced in the
	 * future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fall back to
	 * copying if the folio may be pinned.
	 */
	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}

		if (!folio_test_large(folio)) {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
				atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	default:
		BUILD_BUG();
	}
	return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio: The folio to duplicate the mappings of
 * @page: The first page to duplicate the mappings of
 * @nr_pages: The number of pages of which the mapping will be duplicated
 * @dst_vma: The destination vm area
 * @src_vma: The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
					 src_vma, PGTABLE_LEVEL_PTE);
}
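
/*
 * Illustrative sketch: fork()'s PTE copy path is the typical caller. With the
 * source PTEs already (or about to be) write-protected, it roughly does the
 * following per batch of PTEs; on failure the caller must fall back to copying
 * the pages for the child instead of sharing them. The surrounding
 * copy_page_range() machinery is assumed and not shown.
 *
 *	if (folio_test_anon(folio)) {
 *		if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr,
 *							  dst_vma, src_vma)))
 *			return -EAGAIN;	// fall back to copying the pages
 *	} else {
 *		folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
 *	}
 *	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
 */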

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
					 PGTABLE_LEVEL_PTE);
}

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio: The folio to duplicate the mapping of
 * @page: The first page to duplicate the mapping of
 * @dst_vma: The destination vm area
 * @src_vma: The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
					 src_vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum pgtable_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, the
	 * page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio: The folio to share a mapping of
 * @page: The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
}
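
/*
 * Illustrative sketch: the (A1)-(A4) sequence described above, as it might
 * appear in a simplified unmap path preparing a swap entry. Locking, TLB
 * flushing and the construction of the swap entry are assumed and not shown.
 *
 *	pteval = ptep_get_and_clear(mm, address, pvmw.pte);	// (A1)
 *	if (folio_try_share_anon_rmap_pte(folio, subpage)) {	// (A2)+(A3)
 *		set_pte_at(mm, address, pvmw.pte, pteval);	// back off
 *		return false;
 *	}
 *	// success: install the (non-writable) swap entry		// (A4)
 *	set_pte_at(mm, address, pvmw.pte, swp_entry_to_pte(entry));
 */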

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio: The folio to share the mapping of
 * @page: The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, vm_flags_t *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

/* Result flags */

/* The page is mapped across page table boundary */
#define PVMW_PGTABLE_CROSSED	(1 << 16)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

	if (likely(pvmw->ptl))
		spin_unlock(pvmw->ptl);
	else
		WARN_ON_ONCE(1);

	pvmw->ptl = NULL;
	pvmw->pmd = NULL;
	pvmw->pte = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
		const struct page *, const struct vm_area_struct *);
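
/*
 * Illustrative sketch: a typical rmap_one()-style callback drives the walk
 * like this, visiting every PTE/PMD mapping the folio inside the given vma.
 * The body shown here only counts present PTEs; real callers update or clear
 * the entries while they hold the PTL that the walk takes for them.
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *	int nr_present = 0;
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		if (pvmw.pte && pte_present(ptep_get(pvmw.pte)))
 *			nr_present++;
 *	}
 *	// once the walk returns false it has dropped the PTL itself
 */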

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
			    unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst,
		enum ttu_flags flags);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for getting anon_lock by optimized way rather than default
 * invalid_vma: for skipping uninterested vma
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
			 unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(const struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
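
/*
 * Illustrative sketch: a minimal user of the walk control fills in rmap_one()
 * and hands a (locked) folio to rmap_walk(), declared just below. The callback
 * name and its private argument are made up for this example; it merely counts
 * the VMAs in which the folio is mapped.
 *
 *	static bool my_rmap_one(struct folio *folio, struct vm_area_struct *vma,
 *				unsigned long addr, void *arg)
 *	{
 *		int *nr_vmas = arg;
 *
 *		(*nr_vmas)++;
 *		return true;	// keep walking
 *	}
 *
 *	int nr_vmas = 0;
 *	struct rmap_walk_control rwc = {
 *		.arg = &nr_vmas,
 *		.rmap_one = my_rmap_one,
 *	};
 *
 *	rmap_walk(folio, &rwc);
 */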

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
					  struct rmap_walk_control *rwc);

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  vm_flags_t *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

#endif	/* _LINUX_RMAP_H */