1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Memory Migration functionality - linux/mm/migrate.c
4 *
5 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
6 *
7 * Page migration was first developed in the context of the memory hotplug
8 * project. The main authors of the migration code are:
9 *
10 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
11 * Hirokazu Takahashi <taka@valinux.co.jp>
12 * Dave Hansen <haveblue@us.ibm.com>
13 * Christoph Lameter
14 */
15
16 #include <linux/migrate.h>
17 #include <linux/export.h>
18 #include <linux/swap.h>
19 #include <linux/swapops.h>
20 #include <linux/pagemap.h>
21 #include <linux/buffer_head.h>
22 #include <linux/mm_inline.h>
23 #include <linux/ksm.h>
24 #include <linux/rmap.h>
25 #include <linux/topology.h>
26 #include <linux/cpu.h>
27 #include <linux/cpuset.h>
28 #include <linux/writeback.h>
29 #include <linux/mempolicy.h>
30 #include <linux/vmalloc.h>
31 #include <linux/security.h>
32 #include <linux/backing-dev.h>
33 #include <linux/compaction.h>
34 #include <linux/syscalls.h>
35 #include <linux/compat.h>
36 #include <linux/hugetlb.h>
37 #include <linux/gfp.h>
38 #include <linux/page_idle.h>
39 #include <linux/page_owner.h>
40 #include <linux/sched/mm.h>
41 #include <linux/ptrace.h>
42 #include <linux/memory.h>
43 #include <linux/sched/sysctl.h>
44 #include <linux/memory-tiers.h>
45 #include <linux/pagewalk.h>
46 #include <linux/balloon_compaction.h>
47 #include <linux/zsmalloc.h>
48
49 #include <asm/tlbflush.h>
50
51 #include <trace/events/migrate.h>
52
53 #include "internal.h"
54 #include "swap.h"
55
56 static const struct movable_operations *page_movable_ops(struct page *page)
57 {
58 VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
59
60 /*
61 * If we enable page migration for a page of a certain type by marking
62 * it as movable, the page type must be sticky until the page gets freed
63 * back to the buddy.
64 */
65 #ifdef CONFIG_BALLOON_COMPACTION
66 if (PageOffline(page))
67 /* Only balloon compaction sets PageOffline pages movable. */
68 return &balloon_mops;
69 #endif /* CONFIG_BALLOON_COMPACTION */
70 #if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION)
71 if (PageZsmalloc(page))
72 return &zsmalloc_mops;
73 #endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */
74 return NULL;
75 }
76
77 /**
78 * isolate_movable_ops_page - isolate a movable_ops page for migration
79 * @page: The page.
80 * @mode: The isolation mode.
81 *
82 * Try to isolate a movable_ops page for migration. Will fail if the page is
83 * not a movable_ops page, if the page is already isolated for migration
84 * or if the page was just released by its owner.
85 *
86 * Once isolated, the page cannot get freed until it is either putback
87 * or migrated.
88 *
89 * Returns true if isolation succeeded, otherwise false.
90 */
91 bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
92 {
93 /*
94 * TODO: these pages will not be folios in the future. All
95 * folio dependencies will have to be removed.
96 */
97 struct folio *folio = folio_get_nontail_page(page);
98 const struct movable_operations *mops;
99
100 /*
101 * Avoid burning cycles with pages that are still under __free_pages(),
102 * or that just got freed under us.
103 *
104 * In case we 'win' a race for a movable page being freed under us and
105 * raise its refcount, preventing __free_pages() from doing its job,
106 * the put_page() at the end of this block will take care of
107 * releasing this page, thus avoiding a nasty leak.
108 */
109 if (!folio)
110 goto out;
111
112 /*
113 * Check for movable_ops pages before taking the page lock, because
114 * we use non-atomic bitops on newly allocated page flags, so
115 * unconditionally grabbing the lock would ruin the page owner's side.
116 *
117 * Note that once a page has movable_ops, it will stay that way
118 * until the page is freed.
119 */
120 if (unlikely(!page_has_movable_ops(page)))
121 goto out_putfolio;
122
123 /*
124 * As movable pages are not isolated from LRU lists, concurrent
125 * compaction threads can race against page migration functions
126 * as well as against the release of a page.
127 *
128 * In order to avoid having an already isolated movable page
129 * being (wrongly) re-isolated while it is under migration,
130 * or to avoid attempting to isolate pages being released,
131 * let's be sure we hold the page lock
132 * before proceeding with the movable page isolation steps.
133 */
134 if (unlikely(!folio_trylock(folio)))
135 goto out_putfolio;
136
137 VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
138 if (PageMovableOpsIsolated(page))
139 goto out_no_isolated;
140
141 mops = page_movable_ops(page);
142 if (WARN_ON_ONCE(!mops))
143 goto out_no_isolated;
144
145 if (!mops->isolate_page(page, mode))
146 goto out_no_isolated;
147
148 /* Driver shouldn't use the isolated flag */
149 VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page);
150 SetPageMovableOpsIsolated(page);
151 folio_unlock(folio);
152
153 return true;
154
155 out_no_isolated:
156 folio_unlock(folio);
157 out_putfolio:
158 folio_put(folio);
159 out:
160 return false;
161 }
162
163 /**
164 * putback_movable_ops_page - putback an isolated movable_ops page
165 * @page: The isolated page.
166 *
167 * Putback an isolated movable_ops page.
168 *
169 * After the page was putback, it might get freed instantly.
170 */
171 static void putback_movable_ops_page(struct page *page)
172 {
173 /*
174 * TODO: these pages will not be folios in the future. All
175 * folio dependencies will have to be removed.
176 */
177 struct folio *folio = page_folio(page);
178
179 VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
180 VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page);
181 folio_lock(folio);
182 page_movable_ops(page)->putback_page(page);
183 ClearPageMovableOpsIsolated(page);
184 folio_unlock(folio);
185 folio_put(folio);
186 }
187
188 /**
189 * migrate_movable_ops_page - migrate an isolated movable_ops page
190 * @dst: The destination page.
191 * @src: The source page.
192 * @mode: The migration mode.
193 *
194 * Migrate an isolated movable_ops page.
195 *
196 * If the src page was already released by its owner, the src page is
197 * un-isolated (putback) and migration succeeds; the migration core will be the
198 * owner of both pages.
199 *
200 * If the src page was not released by its owner and the migration was
201 * successful, the owner of the src page and the dst page are swapped and
202 * the src page is un-isolated.
203 *
204 * If migration fails, the ownership stays unmodified and the src page
205 * remains isolated: migration may be retried later or the page can be putback.
206 *
207 * TODO: migration core will treat both pages as folios and lock them before
208 * this call to unlock them after this call. Further, the folio refcounts on
209 * src and dst are also released by migration core. These pages will not be
210 * folios in the future, so that must be reworked.
211 *
212 * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error
213 * code.
214 */
215 static int migrate_movable_ops_page(struct page *dst, struct page *src,
216 enum migrate_mode mode)
217 {
218 int rc = MIGRATEPAGE_SUCCESS;
219
220 VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
221 VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
222 rc = page_movable_ops(src)->migrate_page(dst, src, mode);
223 if (rc == MIGRATEPAGE_SUCCESS)
224 ClearPageMovableOpsIsolated(src);
225 return rc;
226 }
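
/*
 * Illustrative sketch (not part of this file, names hypothetical): a driver
 * that wants its pages handled by the movable_ops paths above provides the
 * three callbacks used here and keeps them registered for as long as the
 * page type is set, e.g.:
 *
 *	static const struct movable_operations my_driver_mops = {
 *		.isolate_page	= my_driver_isolate_page,
 *		.migrate_page	= my_driver_migrate_page,
 *		.putback_page	= my_driver_putback_page,
 *	};
 */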
227
228 /*
229 * Put previously isolated pages back onto the appropriate lists
230 * from where they were once taken off for compaction/migration.
231 *
232 * This function shall be used whenever the isolated pageset has been
233 * built from LRU, balloon, or hugetlbfs pages. See isolate_migratepages_range()
234 * and folio_isolate_hugetlb().
235 */
236 void putback_movable_pages(struct list_head *l)
237 {
238 struct folio *folio;
239 struct folio *folio2;
240
241 list_for_each_entry_safe(folio, folio2, l, lru) {
242 if (unlikely(folio_test_hugetlb(folio))) {
243 folio_putback_hugetlb(folio);
244 continue;
245 }
246 list_del(&folio->lru);
247 if (unlikely(page_has_movable_ops(&folio->page))) {
248 putback_movable_ops_page(&folio->page);
249 } else {
250 node_stat_mod_folio(folio, NR_ISOLATED_ANON +
251 folio_is_file_lru(folio), -folio_nr_pages(folio));
252 folio_putback_lru(folio);
253 }
254 }
255 }
256
257 /* Must be called with an elevated refcount on the non-hugetlb folio */
258 bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
259 {
260 if (folio_test_hugetlb(folio))
261 return folio_isolate_hugetlb(folio, list);
262
263 if (page_has_movable_ops(&folio->page)) {
264 if (!isolate_movable_ops_page(&folio->page,
265 ISOLATE_UNEVICTABLE))
266 return false;
267 } else {
268 if (!folio_isolate_lru(folio))
269 return false;
270 node_stat_add_folio(folio, NR_ISOLATED_ANON +
271 folio_is_file_lru(folio));
272 }
273 list_add(&folio->lru, list);
274 return true;
275 }
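
/*
 * Illustrative sketch (simplified, error handling omitted): a typical caller
 * pairs the isolation helper above with migrate_pages() and falls back to
 * putback_movable_pages() for anything left on the list. The allocation
 * callback and migration reason below are examples only.
 *
 *	LIST_HEAD(pagelist);
 *	struct migration_target_control mtc = {
 *		.nid		= NUMA_NO_NODE,
 *		.gfp_mask	= GFP_KERNEL,
 *	};
 *
 *	if (isolate_folio_to_list(folio, &pagelist)) {
 *		if (migrate_pages(&pagelist, alloc_migration_target, NULL,
 *				  (unsigned long)&mtc, MIGRATE_SYNC,
 *				  MR_MEMORY_HOTPLUG, NULL))
 *			putback_movable_pages(&pagelist);
 *	}
 */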
276
277 static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
278 struct folio *folio,
279 unsigned long idx)
280 {
281 struct page *page = folio_page(folio, idx);
282 bool contains_data;
283 pte_t newpte;
284 void *addr;
285
286 if (PageCompound(page))
287 return false;
288 VM_BUG_ON_PAGE(!PageAnon(page), page);
289 VM_BUG_ON_PAGE(!PageLocked(page), page);
290 VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
291
292 if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
293 mm_forbids_zeropage(pvmw->vma->vm_mm))
294 return false;
295
296 /*
297 * The pmd entry mapping the old thp was flushed and the pte mapping
298 * this subpage is now non-present. If the subpage contains only zeroes,
299 * then map it to the shared zeropage.
300 */
301 addr = kmap_local_page(page);
302 contains_data = memchr_inv(addr, 0, PAGE_SIZE);
303 kunmap_local(addr);
304
305 if (contains_data)
306 return false;
307
308 newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
309 pvmw->vma->vm_page_prot));
310 set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
311
312 dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
313 return true;
314 }
315
316 struct rmap_walk_arg {
317 struct folio *folio;
318 bool map_unused_to_zeropage;
319 };
320
321 /*
322 * Restore a potential migration pte to a working pte entry
323 */
324 static bool remove_migration_pte(struct folio *folio,
325 struct vm_area_struct *vma, unsigned long addr, void *arg)
326 {
327 struct rmap_walk_arg *rmap_walk_arg = arg;
328 DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
329
330 while (page_vma_mapped_walk(&pvmw)) {
331 rmap_t rmap_flags = RMAP_NONE;
332 pte_t old_pte;
333 pte_t pte;
334 swp_entry_t entry;
335 struct page *new;
336 unsigned long idx = 0;
337
338 /* pgoff is invalid for ksm pages, but they are never large */
339 if (folio_test_large(folio) && !folio_test_hugetlb(folio))
340 idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
341 new = folio_page(folio, idx);
342
343 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
344 /* PMD-mapped THP migration entry */
345 if (!pvmw.pte) {
346 VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
347 !folio_test_pmd_mappable(folio), folio);
348 remove_migration_pmd(&pvmw, new);
349 continue;
350 }
351 #endif
352 if (rmap_walk_arg->map_unused_to_zeropage &&
353 try_to_map_unused_to_zeropage(&pvmw, folio, idx))
354 continue;
355
356 folio_get(folio);
357 pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
358 old_pte = ptep_get(pvmw.pte);
359
360 entry = pte_to_swp_entry(old_pte);
361 if (!is_migration_entry_young(entry))
362 pte = pte_mkold(pte);
363 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
364 pte = pte_mkdirty(pte);
365 if (pte_swp_soft_dirty(old_pte))
366 pte = pte_mksoft_dirty(pte);
367 else
368 pte = pte_clear_soft_dirty(pte);
369
370 if (is_writable_migration_entry(entry))
371 pte = pte_mkwrite(pte, vma);
372 else if (pte_swp_uffd_wp(old_pte))
373 pte = pte_mkuffd_wp(pte);
374
375 if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
376 rmap_flags |= RMAP_EXCLUSIVE;
377
378 if (unlikely(is_device_private_page(new))) {
379 if (pte_write(pte))
380 entry = make_writable_device_private_entry(
381 page_to_pfn(new));
382 else
383 entry = make_readable_device_private_entry(
384 page_to_pfn(new));
385 pte = swp_entry_to_pte(entry);
386 if (pte_swp_soft_dirty(old_pte))
387 pte = pte_swp_mksoft_dirty(pte);
388 if (pte_swp_uffd_wp(old_pte))
389 pte = pte_swp_mkuffd_wp(pte);
390 }
391
392 #ifdef CONFIG_HUGETLB_PAGE
393 if (folio_test_hugetlb(folio)) {
394 struct hstate *h = hstate_vma(vma);
395 unsigned int shift = huge_page_shift(h);
396 unsigned long psize = huge_page_size(h);
397
398 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
399 if (folio_test_anon(folio))
400 hugetlb_add_anon_rmap(folio, vma, pvmw.address,
401 rmap_flags);
402 else
403 hugetlb_add_file_rmap(folio);
404 set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
405 psize);
406 } else
407 #endif
408 {
409 if (folio_test_anon(folio))
410 folio_add_anon_rmap_pte(folio, new, vma,
411 pvmw.address, rmap_flags);
412 else
413 folio_add_file_rmap_pte(folio, new, vma);
414 set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
415 }
416 if (READ_ONCE(vma->vm_flags) & VM_LOCKED)
417 mlock_drain_local();
418
419 trace_remove_migration_pte(pvmw.address, pte_val(pte),
420 compound_order(new));
421
422 /* No need to invalidate - it was non-present before */
423 update_mmu_cache(vma, pvmw.address, pvmw.pte);
424 }
425
426 return true;
427 }
428
429 /*
430 * Get rid of all migration entries and replace them by
431 * references to the indicated page.
432 */
433 void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
434 {
435 struct rmap_walk_arg rmap_walk_arg = {
436 .folio = src,
437 .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
438 };
439
440 struct rmap_walk_control rwc = {
441 .rmap_one = remove_migration_pte,
442 .arg = &rmap_walk_arg,
443 };
444
445 VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
446
447 if (flags & RMP_LOCKED)
448 rmap_walk_locked(dst, &rwc);
449 else
450 rmap_walk(dst, &rwc);
451 }
452
453 /*
454 * Something used the pte of a page under migration. We need to
455 * get to the page and wait until migration is finished.
456 * When we return from this function the fault will be retried.
457 */
458 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
459 unsigned long address)
460 {
461 spinlock_t *ptl;
462 pte_t *ptep;
463 pte_t pte;
464 swp_entry_t entry;
465
466 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
467 if (!ptep)
468 return;
469
470 pte = ptep_get(ptep);
471 pte_unmap(ptep);
472
473 if (!is_swap_pte(pte))
474 goto out;
475
476 entry = pte_to_swp_entry(pte);
477 if (!is_migration_entry(entry))
478 goto out;
479
480 migration_entry_wait_on_locked(entry, ptl);
481 return;
482 out:
483 spin_unlock(ptl);
484 }
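
/*
 * Illustrative sketch (simplified from the swap fault path): a fault that
 * finds a migration entry in the pte just waits here and lets the fault be
 * retried once migration has finished.
 *
 *	entry = pte_to_swp_entry(vmf->orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(vmf->vma->vm_mm, vmf->pmd, vmf->address);
 *		return 0;
 *	}
 */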
485
486 #ifdef CONFIG_HUGETLB_PAGE
487 /*
488 * The vma read lock must be held upon entry. Holding that lock prevents either
489 * the pte or the ptl from being freed.
490 *
491 * This function will release the vma lock before returning.
492 */
493 void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
494 {
495 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
496 pte_t pte;
497
498 hugetlb_vma_assert_locked(vma);
499 spin_lock(ptl);
500 pte = huge_ptep_get(vma->vm_mm, addr, ptep);
501
502 if (unlikely(!is_hugetlb_entry_migration(pte))) {
503 spin_unlock(ptl);
504 hugetlb_vma_unlock_read(vma);
505 } else {
506 /*
507 * If migration entry existed, safe to release vma lock
508 * here because the pgtable page won't be freed without the
509 * pgtable lock released. See comment right above pgtable
510 * lock release in migration_entry_wait_on_locked().
511 */
512 hugetlb_vma_unlock_read(vma);
513 migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
514 }
515 }
516 #endif
517
518 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
519 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
520 {
521 spinlock_t *ptl;
522
523 ptl = pmd_lock(mm, pmd);
524 if (!is_pmd_migration_entry(*pmd))
525 goto unlock;
526 migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
527 return;
528 unlock:
529 spin_unlock(ptl);
530 }
531 #endif
532
533 /*
534 * Replace the folio in the mapping.
535 *
536 * The number of remaining references must be:
537 * 1 for anonymous folios without a mapping
538 * 2 for folios with a mapping
539 * 3 for folios with a mapping and the private flag set.
540 */
541 static int __folio_migrate_mapping(struct address_space *mapping,
542 struct folio *newfolio, struct folio *folio, int expected_count)
543 {
544 XA_STATE(xas, &mapping->i_pages, folio_index(folio));
545 struct zone *oldzone, *newzone;
546 int dirty;
547 long nr = folio_nr_pages(folio);
548 long entries, i;
549
550 if (!mapping) {
551 /* Take off deferred split queue while frozen and memcg set */
552 if (folio_test_large(folio) &&
553 folio_test_large_rmappable(folio)) {
554 if (!folio_ref_freeze(folio, expected_count))
555 return -EAGAIN;
556 folio_unqueue_deferred_split(folio);
557 folio_ref_unfreeze(folio, expected_count);
558 }
559
560 /* No turning back from here */
561 newfolio->index = folio->index;
562 newfolio->mapping = folio->mapping;
563 if (folio_test_anon(folio) && folio_test_large(folio))
564 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
565 if (folio_test_swapbacked(folio))
566 __folio_set_swapbacked(newfolio);
567
568 return MIGRATEPAGE_SUCCESS;
569 }
570
571 oldzone = folio_zone(folio);
572 newzone = folio_zone(newfolio);
573
574 xas_lock_irq(&xas);
575 if (!folio_ref_freeze(folio, expected_count)) {
576 xas_unlock_irq(&xas);
577 return -EAGAIN;
578 }
579
580 /* Take off deferred split queue while frozen and memcg set */
581 folio_unqueue_deferred_split(folio);
582
583 /*
584 * Now we know that no one else is looking at the folio:
585 * no turning back from here.
586 */
587 newfolio->index = folio->index;
588 newfolio->mapping = folio->mapping;
589 if (folio_test_anon(folio) && folio_test_large(folio))
590 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
591 folio_ref_add(newfolio, nr); /* add cache reference */
592 if (folio_test_swapbacked(folio))
593 __folio_set_swapbacked(newfolio);
594 if (folio_test_swapcache(folio)) {
595 folio_set_swapcache(newfolio);
596 newfolio->private = folio_get_private(folio);
597 entries = nr;
598 } else {
599 entries = 1;
600 }
601
602 /* Move dirty while folio refs frozen and newfolio not yet exposed */
603 dirty = folio_test_dirty(folio);
604 if (dirty) {
605 folio_clear_dirty(folio);
606 folio_set_dirty(newfolio);
607 }
608
609 /* Swap cache still stores N entries instead of a high-order entry */
610 for (i = 0; i < entries; i++) {
611 xas_store(&xas, newfolio);
612 xas_next(&xas);
613 }
614
615 /*
616 * Drop cache reference from old folio by unfreezing
617 * to one less reference.
618 * We know this isn't the last reference.
619 */
620 folio_ref_unfreeze(folio, expected_count - nr);
621
622 xas_unlock(&xas);
623 /* Leave irq disabled to prevent preemption while updating stats */
624
625 /*
626 * If moved to a different zone then also account
627 * the folio for that zone. Other VM counters will be
628 * taken care of when we establish references to the
629 * new folio and drop references to the old folio.
630 *
631 * Note that anonymous folios are accounted for
632 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
633 * are mapped to swap space.
634 */
635 if (newzone != oldzone) {
636 struct lruvec *old_lruvec, *new_lruvec;
637 struct mem_cgroup *memcg;
638
639 memcg = folio_memcg(folio);
640 old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
641 new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
642
643 __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
644 __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
645 if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
646 __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
647 __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
648
649 if (folio_test_pmd_mappable(folio)) {
650 __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
651 __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
652 }
653 }
654 #ifdef CONFIG_SWAP
655 if (folio_test_swapcache(folio)) {
656 __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
657 __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
658 }
659 #endif
660 if (dirty && mapping_can_writeback(mapping)) {
661 __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
662 __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
663 __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
664 __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
665 }
666 }
667 local_irq_enable();
668
669 return MIGRATEPAGE_SUCCESS;
670 }
671
672 int folio_migrate_mapping(struct address_space *mapping,
673 struct folio *newfolio, struct folio *folio, int extra_count)
674 {
675 int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
676
677 if (folio_ref_count(folio) != expected_count)
678 return -EAGAIN;
679
680 return __folio_migrate_mapping(mapping, newfolio, folio, expected_count);
681 }
682 EXPORT_SYMBOL(folio_migrate_mapping);
683
684 /*
685 * The expected number of remaining references is the same as that
686 * of folio_migrate_mapping().
687 */
688 int migrate_huge_page_move_mapping(struct address_space *mapping,
689 struct folio *dst, struct folio *src)
690 {
691 XA_STATE(xas, &mapping->i_pages, folio_index(src));
692 int rc, expected_count = folio_expected_ref_count(src) + 1;
693
694 if (folio_ref_count(src) != expected_count)
695 return -EAGAIN;
696
697 rc = folio_mc_copy(dst, src);
698 if (unlikely(rc))
699 return rc;
700
701 xas_lock_irq(&xas);
702 if (!folio_ref_freeze(src, expected_count)) {
703 xas_unlock_irq(&xas);
704 return -EAGAIN;
705 }
706
707 dst->index = src->index;
708 dst->mapping = src->mapping;
709
710 folio_ref_add(dst, folio_nr_pages(dst));
711
712 xas_store(&xas, dst);
713
714 folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));
715
716 xas_unlock_irq(&xas);
717
718 return MIGRATEPAGE_SUCCESS;
719 }
720
721 /*
722 * Copy the flags and some other ancillary information
723 */
724 void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
725 {
726 int cpupid;
727
728 if (folio_test_referenced(folio))
729 folio_set_referenced(newfolio);
730 if (folio_test_uptodate(folio))
731 folio_mark_uptodate(newfolio);
732 if (folio_test_clear_active(folio)) {
733 VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
734 folio_set_active(newfolio);
735 } else if (folio_test_clear_unevictable(folio))
736 folio_set_unevictable(newfolio);
737 if (folio_test_workingset(folio))
738 folio_set_workingset(newfolio);
739 if (folio_test_checked(folio))
740 folio_set_checked(newfolio);
741 /*
742 * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
743 * migration entries. We can still have PG_anon_exclusive set on the
744 * effectively unmapped and unreferenced first sub-page of an
745 * anonymous THP: we can simply copy it here via PG_mappedtodisk.
746 */
747 if (folio_test_mappedtodisk(folio))
748 folio_set_mappedtodisk(newfolio);
749
750 /* Move dirty on pages not done by folio_migrate_mapping() */
751 if (folio_test_dirty(folio))
752 folio_set_dirty(newfolio);
753
754 if (folio_test_young(folio))
755 folio_set_young(newfolio);
756 if (folio_test_idle(folio))
757 folio_set_idle(newfolio);
758
759 folio_migrate_refs(newfolio, folio);
760 /*
761 * Copy NUMA information to the new page, to prevent over-eager
762 * future migrations of this same page.
763 */
764 cpupid = folio_xchg_last_cpupid(folio, -1);
765 /*
766 * For memory tiering mode, when migrating between slow and fast
767 * memory nodes, reset cpupid, because it is used to record
768 * page access time on slow memory nodes.
769 */
770 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
771 bool f_toptier = node_is_toptier(folio_nid(folio));
772 bool t_toptier = node_is_toptier(folio_nid(newfolio));
773
774 if (f_toptier != t_toptier)
775 cpupid = -1;
776 }
777 folio_xchg_last_cpupid(newfolio, cpupid);
778
779 folio_migrate_ksm(newfolio, folio);
780 /*
781 * Please do not reorder this without considering how mm/ksm.c's
782 * ksm_get_folio() depends upon ksm_migrate_page() and the
783 * swapcache flag.
784 */
785 if (folio_test_swapcache(folio))
786 folio_clear_swapcache(folio);
787 folio_clear_private(folio);
788
789 /* page->private contains hugetlb specific flags */
790 if (!folio_test_hugetlb(folio))
791 folio->private = NULL;
792
793 /*
794 * If any waiters have accumulated on the new page then
795 * wake them up.
796 */
797 if (folio_test_writeback(newfolio))
798 folio_end_writeback(newfolio);
799
800 /*
801 * PG_readahead shares the same bit with PG_reclaim. The above
802 * folio_end_writeback() may clear PG_readahead mistakenly, so set the
803 * bit after that.
804 */
805 if (folio_test_readahead(folio))
806 folio_set_readahead(newfolio);
807
808 folio_copy_owner(newfolio, folio);
809 pgalloc_tag_swap(newfolio, folio);
810
811 mem_cgroup_migrate(folio, newfolio);
812 }
813 EXPORT_SYMBOL(folio_migrate_flags);
814
815 /************************************************************
816 * Migration functions
817 ***********************************************************/
818
819 static int __migrate_folio(struct address_space *mapping, struct folio *dst,
820 struct folio *src, void *src_private,
821 enum migrate_mode mode)
822 {
823 int rc, expected_count = folio_expected_ref_count(src) + 1;
824
825 /* Check whether src does not have extra refs before we do more work */
826 if (folio_ref_count(src) != expected_count)
827 return -EAGAIN;
828
829 rc = folio_mc_copy(dst, src);
830 if (unlikely(rc))
831 return rc;
832
833 rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
834 if (rc != MIGRATEPAGE_SUCCESS)
835 return rc;
836
837 if (src_private)
838 folio_attach_private(dst, folio_detach_private(src));
839
840 folio_migrate_flags(dst, src);
841 return MIGRATEPAGE_SUCCESS;
842 }
843
844 /**
845 * migrate_folio() - Simple folio migration.
846 * @mapping: The address_space containing the folio.
847 * @dst: The folio to migrate the data to.
848 * @src: The folio containing the current data.
849 * @mode: How to migrate the page.
850 *
851 * Common logic to directly migrate a single LRU folio suitable for
852 * folios that do not have private data.
853 *
854 * Folios are locked upon entry and exit.
855 */
856 int migrate_folio(struct address_space *mapping, struct folio *dst,
857 struct folio *src, enum migrate_mode mode)
858 {
859 BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
860 return __migrate_folio(mapping, dst, src, NULL, mode);
861 }
862 EXPORT_SYMBOL(migrate_folio);
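
/*
 * Illustrative sketch (hypothetical filesystem): mappings without private
 * data can plug the exported helper above straight into their
 * address_space_operations.
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio	= myfs_read_folio,
 *		.writepages	= myfs_writepages,
 *		.migrate_folio	= migrate_folio,
 *	};
 */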
863
864 #ifdef CONFIG_BUFFER_HEAD
865 /* Returns true if all buffers are successfully locked */
866 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
867 enum migrate_mode mode)
868 {
869 struct buffer_head *bh = head;
870 struct buffer_head *failed_bh;
871
872 do {
873 if (!trylock_buffer(bh)) {
874 if (mode == MIGRATE_ASYNC)
875 goto unlock;
876 if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
877 goto unlock;
878 lock_buffer(bh);
879 }
880
881 bh = bh->b_this_page;
882 } while (bh != head);
883
884 return true;
885
886 unlock:
887 /* We failed to lock the buffer and cannot stall. */
888 failed_bh = bh;
889 bh = head;
890 while (bh != failed_bh) {
891 unlock_buffer(bh);
892 bh = bh->b_this_page;
893 }
894
895 return false;
896 }
897
898 static int __buffer_migrate_folio(struct address_space *mapping,
899 struct folio *dst, struct folio *src, enum migrate_mode mode,
900 bool check_refs)
901 {
902 struct buffer_head *bh, *head;
903 int rc;
904 int expected_count;
905
906 head = folio_buffers(src);
907 if (!head)
908 return migrate_folio(mapping, dst, src, mode);
909
910 /* Check whether page does not have extra refs before we do more work */
911 expected_count = folio_expected_ref_count(src) + 1;
912 if (folio_ref_count(src) != expected_count)
913 return -EAGAIN;
914
915 if (!buffer_migrate_lock_buffers(head, mode))
916 return -EAGAIN;
917
918 if (check_refs) {
919 bool busy, migrating;
920 bool invalidated = false;
921
922 migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
923 VM_WARN_ON_ONCE(migrating);
924 recheck_buffers:
925 busy = false;
926 spin_lock(&mapping->i_private_lock);
927 bh = head;
928 do {
929 if (atomic_read(&bh->b_count)) {
930 busy = true;
931 break;
932 }
933 bh = bh->b_this_page;
934 } while (bh != head);
935 spin_unlock(&mapping->i_private_lock);
936 if (busy) {
937 if (invalidated) {
938 rc = -EAGAIN;
939 goto unlock_buffers;
940 }
941 invalidate_bh_lrus();
942 invalidated = true;
943 goto recheck_buffers;
944 }
945 }
946
947 rc = filemap_migrate_folio(mapping, dst, src, mode);
948 if (rc != MIGRATEPAGE_SUCCESS)
949 goto unlock_buffers;
950
951 bh = head;
952 do {
953 folio_set_bh(bh, dst, bh_offset(bh));
954 bh = bh->b_this_page;
955 } while (bh != head);
956
957 unlock_buffers:
958 if (check_refs)
959 clear_bit_unlock(BH_Migrate, &head->b_state);
960 bh = head;
961 do {
962 unlock_buffer(bh);
963 bh = bh->b_this_page;
964 } while (bh != head);
965
966 return rc;
967 }
968
969 /**
970 * buffer_migrate_folio() - Migration function for folios with buffers.
971 * @mapping: The address space containing @src.
972 * @dst: The folio to migrate to.
973 * @src: The folio to migrate from.
974 * @mode: How to migrate the folio.
975 *
976 * This function can only be used if the underlying filesystem guarantees
977 * that no other references to @src exist. For example attached buffer
978 * heads are accessed only under the folio lock. If your filesystem cannot
979 * provide this guarantee, buffer_migrate_folio_norefs() may be more
980 * appropriate.
981 *
982 * Return: 0 on success or a negative errno on failure.
983 */
984 int buffer_migrate_folio(struct address_space *mapping,
985 struct folio *dst, struct folio *src, enum migrate_mode mode)
986 {
987 return __buffer_migrate_folio(mapping, dst, src, mode, false);
988 }
989 EXPORT_SYMBOL(buffer_migrate_folio);
990
991 /**
992 * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
993 * @mapping: The address space containing @src.
994 * @dst: The folio to migrate to.
995 * @src: The folio to migrate from.
996 * @mode: How to migrate the folio.
997 *
998 * Like buffer_migrate_folio() except that this variant is more careful
999 * and checks that there are also no buffer head references. This function
1000 * is the right one for mappings where buffer heads are directly looked
1001 * up and referenced (such as block device mappings).
1002 *
1003 * Return: 0 on success or a negative errno on failure.
1004 */
1005 int buffer_migrate_folio_norefs(struct address_space *mapping,
1006 struct folio *dst, struct folio *src, enum migrate_mode mode)
1007 {
1008 return __buffer_migrate_folio(mapping, dst, src, mode, true);
1009 }
1010 EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
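
/*
 * Illustrative sketch (hypothetical filesystem): classic buffer-head based
 * filesystems typically wire up buffer_migrate_folio(), while mappings whose
 * buffer heads are looked up and referenced directly (such as the block
 * device mapping) should use the _norefs variant above instead.
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio	= myfs_read_folio,
 *		.migrate_folio	= buffer_migrate_folio,
 *	};
 */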
1011 #endif /* CONFIG_BUFFER_HEAD */
1012
1013 int filemap_migrate_folio(struct address_space *mapping,
1014 struct folio *dst, struct folio *src, enum migrate_mode mode)
1015 {
1016 return __migrate_folio(mapping, dst, src, folio_get_private(src), mode);
1017 }
1018 EXPORT_SYMBOL_GPL(filemap_migrate_folio);
1019
1020 /*
1021 * Default handling if a filesystem does not provide a migration function.
1022 */
1023 static int fallback_migrate_folio(struct address_space *mapping,
1024 struct folio *dst, struct folio *src, enum migrate_mode mode)
1025 {
1026 WARN_ONCE(mapping->a_ops->writepages,
1027 "%ps does not implement migrate_folio\n",
1028 mapping->a_ops);
1029 if (folio_test_dirty(src))
1030 return -EBUSY;
1031
1032 /*
1033 * Filesystem may have private data at folio->private that we
1034 * can't migrate automatically.
1035 */
1036 if (!filemap_release_folio(src, GFP_KERNEL))
1037 return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
1038
1039 return migrate_folio(mapping, dst, src, mode);
1040 }
1041
1042 /*
1043 * Move a src folio to a newly allocated dst folio.
1044 *
1045 * The src and dst folios are locked and the src folio was unmapped from
1046 * the page tables.
1047 *
1048 * On success, the src folio was replaced by the dst folio.
1049 *
1050 * Return value:
1051 * < 0 - error code
1052 * MIGRATEPAGE_SUCCESS - success
1053 */
1054 static int move_to_new_folio(struct folio *dst, struct folio *src,
1055 enum migrate_mode mode)
1056 {
1057 struct address_space *mapping = folio_mapping(src);
1058 int rc = -EAGAIN;
1059
1060 VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
1061 VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
1062
1063 if (!mapping)
1064 rc = migrate_folio(mapping, dst, src, mode);
1065 else if (mapping_inaccessible(mapping))
1066 rc = -EOPNOTSUPP;
1067 else if (mapping->a_ops->migrate_folio)
1068 /*
1069 * Most folios have a mapping and most filesystems
1070 * provide a migrate_folio callback. Anonymous folios
1071 * are part of swap space which also has its own
1072 * migrate_folio callback. This is the most common path
1073 * for page migration.
1074 */
1075 rc = mapping->a_ops->migrate_folio(mapping, dst, src,
1076 mode);
1077 else
1078 rc = fallback_migrate_folio(mapping, dst, src, mode);
1079
1080 if (rc == MIGRATEPAGE_SUCCESS) {
1081 /*
1082 * For pagecache folios, src->mapping must be cleared before src
1083 * is freed. Anonymous folios must stay anonymous until freed.
1084 */
1085 if (!folio_test_anon(src))
1086 src->mapping = NULL;
1087
1088 if (likely(!folio_is_zone_device(dst)))
1089 flush_dcache_folio(dst);
1090 }
1091 return rc;
1092 }
1093
1094 /*
1095 * To record some information during migration, we use the otherwise
1096 * unused private field of the newly allocated destination folio.
1097 * This is safe because nobody is using it except us.
1098 */
1099 enum {
1100 PAGE_WAS_MAPPED = BIT(0),
1101 PAGE_WAS_MLOCKED = BIT(1),
1102 PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
1103 };
1104
1105 static void __migrate_folio_record(struct folio *dst,
1106 int old_page_state,
1107 struct anon_vma *anon_vma)
1108 {
1109 dst->private = (void *)anon_vma + old_page_state;
1110 }
1111
1112 static void __migrate_folio_extract(struct folio *dst,
1113 int *old_page_state,
1114 struct anon_vma **anon_vmap)
1115 {
1116 unsigned long private = (unsigned long)dst->private;
1117
1118 *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
1119 *old_page_state = private & PAGE_OLD_STATES;
1120 dst->private = NULL;
1121 }
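
/*
 * Illustrative sketch: the anon_vma pointer is at least word aligned, so its
 * low bits are free to carry the PAGE_OLD_STATES flags, and a record/extract
 * pair round-trips both values, e.g.:
 *
 *	__migrate_folio_record(dst, PAGE_WAS_MAPPED, anon_vma);
 *	__migrate_folio_extract(dst, &old_page_state, &anon_vma);
 *	// old_page_state == PAGE_WAS_MAPPED and anon_vma is unchanged
 */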
1122
1123 /* Restore the source folio to the original state upon failure */
1124 static void migrate_folio_undo_src(struct folio *src,
1125 int page_was_mapped,
1126 struct anon_vma *anon_vma,
1127 bool locked,
1128 struct list_head *ret)
1129 {
1130 if (page_was_mapped)
1131 remove_migration_ptes(src, src, 0);
1132 /* Drop an anon_vma reference if we took one */
1133 if (anon_vma)
1134 put_anon_vma(anon_vma);
1135 if (locked)
1136 folio_unlock(src);
1137 if (ret)
1138 list_move_tail(&src->lru, ret);
1139 }
1140
1141 /* Restore the destination folio to the original state upon failure */
1142 static void migrate_folio_undo_dst(struct folio *dst, bool locked,
1143 free_folio_t put_new_folio, unsigned long private)
1144 {
1145 if (locked)
1146 folio_unlock(dst);
1147 if (put_new_folio)
1148 put_new_folio(dst, private);
1149 else
1150 folio_put(dst);
1151 }
1152
1153 /* Cleanup src folio upon migration success */
1154 static void migrate_folio_done(struct folio *src,
1155 enum migrate_reason reason)
1156 {
1157 if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION)
1158 mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
1159 folio_is_file_lru(src), -folio_nr_pages(src));
1160
1161 if (reason != MR_MEMORY_FAILURE)
1162 /* We release the page in page_handle_poison. */
1163 folio_put(src);
1164 }
1165
1166 /* Obtain the lock on page, remove all ptes. */
1167 static int migrate_folio_unmap(new_folio_t get_new_folio,
1168 free_folio_t put_new_folio, unsigned long private,
1169 struct folio *src, struct folio **dstp, enum migrate_mode mode,
1170 enum migrate_reason reason, struct list_head *ret)
1171 {
1172 struct folio *dst;
1173 int rc = -EAGAIN;
1174 int old_page_state = 0;
1175 struct anon_vma *anon_vma = NULL;
1176 bool locked = false;
1177 bool dst_locked = false;
1178
1179 if (folio_ref_count(src) == 1) {
1180 /* Folio was freed from under us. So we are done. */
1181 folio_clear_active(src);
1182 folio_clear_unevictable(src);
1183 /* free_pages_prepare() will clear PG_isolated. */
1184 list_del(&src->lru);
1185 migrate_folio_done(src, reason);
1186 return MIGRATEPAGE_SUCCESS;
1187 }
1188
1189 dst = get_new_folio(src, private);
1190 if (!dst)
1191 return -ENOMEM;
1192 *dstp = dst;
1193
1194 dst->private = NULL;
1195
1196 if (!folio_trylock(src)) {
1197 if (mode == MIGRATE_ASYNC)
1198 goto out;
1199
1200 /*
1201 * It's not safe for direct compaction to call lock_page.
1202 * For example, during page readahead pages are added locked
1203 * to the LRU. Later, when the IO completes the pages are
1204 * marked uptodate and unlocked. However, the queueing
1205 * could be merging multiple pages for one bio (e.g.
1206 * mpage_readahead). If an allocation happens for the
1207 * second or third page, the process can end up locking
1208 * the same page twice and deadlocking. Rather than
1209 * trying to be clever about what pages can be locked,
1210 * avoid the use of lock_page for direct compaction
1211 * altogether.
1212 */
1213 if (current->flags & PF_MEMALLOC)
1214 goto out;
1215
1216 /*
1217 * In "light" mode, we can wait for transient locks (eg
1218 * inserting a page into the page table), but it's not
1219 * worth waiting for I/O.
1220 */
1221 if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
1222 goto out;
1223
1224 folio_lock(src);
1225 }
1226 locked = true;
1227 if (folio_test_mlocked(src))
1228 old_page_state |= PAGE_WAS_MLOCKED;
1229
1230 if (folio_test_writeback(src)) {
1231 /*
1232 * Only in the case of a full synchronous migration is it
1233 * necessary to wait for PageWriteback. In the async case,
1234 * the retry loop is too short and in the sync-light case,
1235 * the overhead of stalling is too high.
1236 */
1237 switch (mode) {
1238 case MIGRATE_SYNC:
1239 break;
1240 default:
1241 rc = -EBUSY;
1242 goto out;
1243 }
1244 folio_wait_writeback(src);
1245 }
1246
1247 /*
1248 * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
1249 * we cannot notice that anon_vma is freed while we migrate a page.
1250 * This folio_get_anon_vma() delays freeing the anon_vma pointer until
1251 * the end of migration. File cache pages are no problem because they
1252 * are protected by the page lock during migration, so only anonymous
1253 * pages need this special care here.
1254 *
1255 * Only folio_get_anon_vma() understands the subtleties of
1256 * getting a hold on an anon_vma from outside one of its mms.
1257 * But if we cannot get anon_vma, then we won't need it anyway,
1258 * because that implies that the anon page is no longer mapped
1259 * (and cannot be remapped so long as we hold the page lock).
1260 */
1261 if (folio_test_anon(src) && !folio_test_ksm(src))
1262 anon_vma = folio_get_anon_vma(src);
1263
1264 /*
1265 * Block others from accessing the new page when we get around to
1266 * establishing additional references. We are usually the only one
1267 * holding a reference to dst at this point. We used to have a BUG
1268 * here if folio_trylock(dst) fails, but would like to allow for
1269 * cases where there might be a race with the previous use of dst.
1270 * This is much like races on refcount of oldpage: just don't BUG().
1271 */
1272 if (unlikely(!folio_trylock(dst)))
1273 goto out;
1274 dst_locked = true;
1275
1276 if (unlikely(page_has_movable_ops(&src->page))) {
1277 __migrate_folio_record(dst, old_page_state, anon_vma);
1278 return MIGRATEPAGE_UNMAP;
1279 }
1280
1281 /*
1282 * Corner case handling:
1283 * 1. When a new swap-cache page is read in, it is added to the LRU
1284 * and treated as swapcache but it has no rmap yet.
1285 * Calling try_to_unmap() against a src->mapping==NULL page will
1286 * trigger a BUG. So handle it here.
1287 * 2. An orphaned page (see truncate_cleanup_page) might have
1288 * fs-private metadata. The page can be picked up due to memory
1289 * offlining. Everywhere else except page reclaim, the page is
1290 * invisible to the VM, so the page cannot be migrated. So try to
1291 * free the metadata, so the page can be freed.
1292 */
1293 if (!src->mapping) {
1294 if (folio_test_private(src)) {
1295 try_to_free_buffers(src);
1296 goto out;
1297 }
1298 } else if (folio_mapped(src)) {
1299 /* Establish migration ptes */
1300 VM_BUG_ON_FOLIO(folio_test_anon(src) &&
1301 !folio_test_ksm(src) && !anon_vma, src);
1302 try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
1303 old_page_state |= PAGE_WAS_MAPPED;
1304 }
1305
1306 if (!folio_mapped(src)) {
1307 __migrate_folio_record(dst, old_page_state, anon_vma);
1308 return MIGRATEPAGE_UNMAP;
1309 }
1310
1311 out:
1312 /*
1313 * A folio that has not been unmapped will be restored to
1314 * right list unless we want to retry.
1315 */
1316 if (rc == -EAGAIN)
1317 ret = NULL;
1318
1319 migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
1320 anon_vma, locked, ret);
1321 migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
1322
1323 return rc;
1324 }
1325
1326 /* Migrate the folio to the newly allocated folio in dst. */
1327 static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
1328 struct folio *src, struct folio *dst,
1329 enum migrate_mode mode, enum migrate_reason reason,
1330 struct list_head *ret)
1331 {
1332 int rc;
1333 int old_page_state = 0;
1334 struct anon_vma *anon_vma = NULL;
1335 struct list_head *prev;
1336
1337 __migrate_folio_extract(dst, &old_page_state, &anon_vma);
1338 prev = dst->lru.prev;
1339 list_del(&dst->lru);
1340
1341 if (unlikely(page_has_movable_ops(&src->page))) {
1342 rc = migrate_movable_ops_page(&dst->page, &src->page, mode);
1343 if (rc)
1344 goto out;
1345 goto out_unlock_both;
1346 }
1347
1348 rc = move_to_new_folio(dst, src, mode);
1349 if (rc)
1350 goto out;
1351
1352 /*
1353 * When successful, push dst to LRU immediately: so that if it
1354 * turns out to be an mlocked page, remove_migration_ptes() will
1355 * automatically build up the correct dst->mlock_count for it.
1356 *
1357 * We would like to do something similar for the old page, when
1358 * unsuccessful, and other cases when a page has been temporarily
1359 * isolated from the unevictable LRU: but this case is the easiest.
1360 */
1361 folio_add_lru(dst);
1362 if (old_page_state & PAGE_WAS_MLOCKED)
1363 lru_add_drain();
1364
1365 if (old_page_state & PAGE_WAS_MAPPED)
1366 remove_migration_ptes(src, dst, 0);
1367
1368 out_unlock_both:
1369 folio_unlock(dst);
1370 folio_set_owner_migrate_reason(dst, reason);
1371 /*
1372 * If migration is successful, decrease refcount of dst,
1373 * which will not free the page because new page owner increased
1374 * refcounter.
1375 */
1376 folio_put(dst);
1377
1378 /*
1379 * A folio that has been migrated has all references removed
1380 * and will be freed.
1381 */
1382 list_del(&src->lru);
1383 /* Drop an anon_vma reference if we took one */
1384 if (anon_vma)
1385 put_anon_vma(anon_vma);
1386 folio_unlock(src);
1387 migrate_folio_done(src, reason);
1388
1389 return rc;
1390 out:
1391 /*
1392 * A folio that has not been migrated will be restored to
1393 * right list unless we want to retry.
1394 */
1395 if (rc == -EAGAIN) {
1396 list_add(&dst->lru, prev);
1397 __migrate_folio_record(dst, old_page_state, anon_vma);
1398 return rc;
1399 }
1400
1401 migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
1402 anon_vma, true, ret);
1403 migrate_folio_undo_dst(dst, true, put_new_folio, private);
1404
1405 return rc;
1406 }
1407
1408 /*
1409 * Counterpart of migrate_folio_unmap()/migrate_folio_move() for hugepage migration.
1410 *
1411 * This function doesn't wait for the completion of hugepage I/O
1412 * because there is no race between I/O and migration for hugepages.
1413 * Note that currently hugepage I/O occurs only in direct I/O
1414 * where no lock is held and PG_writeback is irrelevant,
1415 * and the writeback status of all subpages is counted in the reference
1416 * count of the head page (i.e. if all subpages of a 2MB hugepage are
1417 * under direct I/O, the reference count of the head page is 512 and a bit more.)
1418 * This means that when we try to migrate a hugepage whose subpages are
1419 * doing direct I/O, some references remain after try_to_unmap() and
1420 * hugepage migration fails without data corruption.
1421 *
1422 * There is also no race when direct I/O is issued on the page under migration,
1423 * because then the pte is replaced with a migration swap entry and the direct I/O code
1424 * will wait in the page fault for migration to complete.
1425 */
1426 static int unmap_and_move_huge_page(new_folio_t get_new_folio,
1427 free_folio_t put_new_folio, unsigned long private,
1428 struct folio *src, int force, enum migrate_mode mode,
1429 int reason, struct list_head *ret)
1430 {
1431 struct folio *dst;
1432 int rc = -EAGAIN;
1433 int page_was_mapped = 0;
1434 struct anon_vma *anon_vma = NULL;
1435 struct address_space *mapping = NULL;
1436
1437 if (folio_ref_count(src) == 1) {
1438 /* page was freed from under us. So we are done. */
1439 folio_putback_hugetlb(src);
1440 return MIGRATEPAGE_SUCCESS;
1441 }
1442
1443 dst = get_new_folio(src, private);
1444 if (!dst)
1445 return -ENOMEM;
1446
1447 if (!folio_trylock(src)) {
1448 if (!force)
1449 goto out;
1450 switch (mode) {
1451 case MIGRATE_SYNC:
1452 break;
1453 default:
1454 goto out;
1455 }
1456 folio_lock(src);
1457 }
1458
1459 /*
1460 * Check for pages which are in the process of being freed. Without
1461 * folio_mapping() set, the hugetlbfs-specific move page routine will not
1462 * be called and we could leak usage counts for subpools.
1463 */
1464 if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
1465 rc = -EBUSY;
1466 goto out_unlock;
1467 }
1468
1469 if (folio_test_anon(src))
1470 anon_vma = folio_get_anon_vma(src);
1471
1472 if (unlikely(!folio_trylock(dst)))
1473 goto put_anon;
1474
1475 if (folio_mapped(src)) {
1476 enum ttu_flags ttu = 0;
1477
1478 if (!folio_test_anon(src)) {
1479 /*
1480 * In shared mappings, try_to_unmap could potentially
1481 * call huge_pmd_unshare. Because of this, take
1482 * semaphore in write mode here and set TTU_RMAP_LOCKED
1483 * to let lower levels know we have taken the lock.
1484 */
1485 mapping = hugetlb_folio_mapping_lock_write(src);
1486 if (unlikely(!mapping))
1487 goto unlock_put_anon;
1488
1489 ttu = TTU_RMAP_LOCKED;
1490 }
1491
1492 try_to_migrate(src, ttu);
1493 page_was_mapped = 1;
1494
1495 if (ttu & TTU_RMAP_LOCKED)
1496 i_mmap_unlock_write(mapping);
1497 }
1498
1499 if (!folio_mapped(src))
1500 rc = move_to_new_folio(dst, src, mode);
1501
1502 if (page_was_mapped)
1503 remove_migration_ptes(src,
1504 rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
1505
1506 unlock_put_anon:
1507 folio_unlock(dst);
1508
1509 put_anon:
1510 if (anon_vma)
1511 put_anon_vma(anon_vma);
1512
1513 if (rc == MIGRATEPAGE_SUCCESS) {
1514 move_hugetlb_state(src, dst, reason);
1515 put_new_folio = NULL;
1516 }
1517
1518 out_unlock:
1519 folio_unlock(src);
1520 out:
1521 if (rc == MIGRATEPAGE_SUCCESS)
1522 folio_putback_hugetlb(src);
1523 else if (rc != -EAGAIN)
1524 list_move_tail(&src->lru, ret);
1525
1526 /*
1527 * If migration was not successful and there's a freeing callback,
1528 * return the folio to that special allocator. Otherwise, simply drop
1529 * our additional reference.
1530 */
1531 if (put_new_folio)
1532 put_new_folio(dst, private);
1533 else
1534 folio_put(dst);
1535
1536 return rc;
1537 }
1538
1539 static inline int try_split_folio(struct folio *folio, struct list_head *split_folios,
1540 enum migrate_mode mode)
1541 {
1542 int rc;
1543
1544 if (mode == MIGRATE_ASYNC) {
1545 if (!folio_trylock(folio))
1546 return -EAGAIN;
1547 } else {
1548 folio_lock(folio);
1549 }
1550 rc = split_folio_to_list(folio, split_folios);
1551 folio_unlock(folio);
1552 if (!rc)
1553 list_move_tail(&folio->lru, split_folios);
1554
1555 return rc;
1556 }
1557
1558 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1559 #define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR
1560 #else
1561 #define NR_MAX_BATCHED_MIGRATION 512
1562 #endif
1563 #define NR_MAX_MIGRATE_PAGES_RETRY 10
1564 #define NR_MAX_MIGRATE_ASYNC_RETRY 3
1565 #define NR_MAX_MIGRATE_SYNC_RETRY \
1566 (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
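
/*
 * Worked example (based on how these limits are used by the callers of
 * migrate_pages_batch() below): a synchronous migrate_pages() call first
 * spends NR_MAX_MIGRATE_ASYNC_RETRY (3) lightweight passes and then up to
 * NR_MAX_MIGRATE_SYNC_RETRY (10 - 3 = 7) synchronous passes, so a folio is
 * retried at most NR_MAX_MIGRATE_PAGES_RETRY (10) times in total.
 */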
1567
1568 struct migrate_pages_stats {
1569 int nr_succeeded; /* Normal and large folios migrated successfully, in
1570 units of base pages */
1571 int nr_failed_pages; /* Normal and large folios failed to be migrated, in
1572 units of base pages. Untried folios aren't counted */
1573 int nr_thp_succeeded; /* THP migrated successfully */
1574 int nr_thp_failed; /* THP failed to be migrated */
1575 int nr_thp_split; /* THP split before migrating */
1576 int nr_split; /* Large folio (include THP) split before migrating */
1577 };
1578
1579 /*
1580 * Returns the number of hugetlb folios that were not migrated, or an error code
1581 * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
1582 * any more because the list has become empty or no retryable hugetlb folios
1583 * exist any more. It is the caller's responsibility to call putback_movable_pages()
1584 * only if ret != 0.
1585 */
1586 static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
1587 free_folio_t put_new_folio, unsigned long private,
1588 enum migrate_mode mode, int reason,
1589 struct migrate_pages_stats *stats,
1590 struct list_head *ret_folios)
1591 {
1592 int retry = 1;
1593 int nr_failed = 0;
1594 int nr_retry_pages = 0;
1595 int pass = 0;
1596 struct folio *folio, *folio2;
1597 int rc, nr_pages;
1598
1599 for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
1600 retry = 0;
1601 nr_retry_pages = 0;
1602
1603 list_for_each_entry_safe(folio, folio2, from, lru) {
1604 if (!folio_test_hugetlb(folio))
1605 continue;
1606
1607 nr_pages = folio_nr_pages(folio);
1608
1609 cond_resched();
1610
1611 /*
1612 * Migratability of hugepages depends on the architecture and
1613 * their size. This check is necessary because some callers
1614 * of hugepage migration like soft offline and memory
1615 * hotremove don't walk through page tables or check whether
1616 * the hugepage is pmd-based or not before kicking migration.
1617 */
1618 if (!hugepage_migration_supported(folio_hstate(folio))) {
1619 nr_failed++;
1620 stats->nr_failed_pages += nr_pages;
1621 list_move_tail(&folio->lru, ret_folios);
1622 continue;
1623 }
1624
1625 rc = unmap_and_move_huge_page(get_new_folio,
1626 put_new_folio, private,
1627 folio, pass > 2, mode,
1628 reason, ret_folios);
1629 /*
1630 * The rules are:
1631 * Success: hugetlb folio will be put back
1632 * -EAGAIN: stay on the from list
1633 * -ENOMEM: stay on the from list
1634 * Other errno: put on ret_folios list
1635 */
1636 switch(rc) {
1637 case -ENOMEM:
1638 /*
1639 * When memory is low, don't bother to try to migrate
1640 * other folios, just exit.
1641 */
1642 stats->nr_failed_pages += nr_pages + nr_retry_pages;
1643 return -ENOMEM;
1644 case -EAGAIN:
1645 retry++;
1646 nr_retry_pages += nr_pages;
1647 break;
1648 case MIGRATEPAGE_SUCCESS:
1649 stats->nr_succeeded += nr_pages;
1650 break;
1651 default:
1652 /*
1653 * Permanent failure (-EBUSY, etc.):
1654 * unlike -EAGAIN case, the failed folio is
1655 * removed from migration folio list and not
1656 * retried in the next outer loop.
1657 */
1658 nr_failed++;
1659 stats->nr_failed_pages += nr_pages;
1660 break;
1661 }
1662 }
1663 }
1664 /*
1665 * nr_failed is the number of hugetlb folios that failed to be migrated. After
1666 * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
1667 * folios as failed.
1668 */
1669 nr_failed += retry;
1670 stats->nr_failed_pages += nr_retry_pages;
1671
1672 return nr_failed;
1673 }
1674
1675 static void migrate_folios_move(struct list_head *src_folios,
1676 struct list_head *dst_folios,
1677 free_folio_t put_new_folio, unsigned long private,
1678 enum migrate_mode mode, int reason,
1679 struct list_head *ret_folios,
1680 struct migrate_pages_stats *stats,
1681 int *retry, int *thp_retry, int *nr_failed,
1682 int *nr_retry_pages)
1683 {
1684 struct folio *folio, *folio2, *dst, *dst2;
1685 bool is_thp;
1686 int nr_pages;
1687 int rc;
1688
1689 dst = list_first_entry(dst_folios, struct folio, lru);
1690 dst2 = list_next_entry(dst, lru);
1691 list_for_each_entry_safe(folio, folio2, src_folios, lru) {
1692 is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
1693 nr_pages = folio_nr_pages(folio);
1694
1695 cond_resched();
1696
1697 rc = migrate_folio_move(put_new_folio, private,
1698 folio, dst, mode,
1699 reason, ret_folios);
1700 /*
1701 * The rules are:
1702 * Success: folio will be freed
1703 * -EAGAIN: stay on the unmap_folios list
1704 * Other errno: put on ret_folios list
1705 */
1706 switch (rc) {
1707 case -EAGAIN:
1708 *retry += 1;
1709 *thp_retry += is_thp;
1710 *nr_retry_pages += nr_pages;
1711 break;
1712 case MIGRATEPAGE_SUCCESS:
1713 stats->nr_succeeded += nr_pages;
1714 stats->nr_thp_succeeded += is_thp;
1715 break;
1716 default:
1717 *nr_failed += 1;
1718 stats->nr_thp_failed += is_thp;
1719 stats->nr_failed_pages += nr_pages;
1720 break;
1721 }
1722 dst = dst2;
1723 dst2 = list_next_entry(dst, lru);
1724 }
1725 }
1726
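/*
 * Abort a batched migration: restore the remaining unmapped src folios
 * (re-mapping them if they were mapped before) and release the
 * corresponding dst folios. The src folios end up on ret_folios.
 */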
1727 static void migrate_folios_undo(struct list_head *src_folios,
1728 struct list_head *dst_folios,
1729 free_folio_t put_new_folio, unsigned long private,
1730 struct list_head *ret_folios)
1731 {
1732 struct folio *folio, *folio2, *dst, *dst2;
1733
1734 dst = list_first_entry(dst_folios, struct folio, lru);
1735 dst2 = list_next_entry(dst, lru);
1736 list_for_each_entry_safe(folio, folio2, src_folios, lru) {
1737 int old_page_state = 0;
1738 struct anon_vma *anon_vma = NULL;
1739
1740 __migrate_folio_extract(dst, &old_page_state, &anon_vma);
1741 migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
1742 anon_vma, true, ret_folios);
1743 list_del(&dst->lru);
1744 migrate_folio_undo_dst(dst, true, put_new_folio, private);
1745 dst = dst2;
1746 dst2 = list_next_entry(dst, lru);
1747 }
1748 }
1749
1750 /*
1751 * migrate_pages_batch() first unmaps as many folios in the from list as
1752 * possible, then moves the unmapped folios.
1753 *
1754 * We only batch migration if mode == MIGRATE_ASYNC, to avoid waiting on a
1755 * lock or bit while we have locked more than one folio, which may cause a
1756 * deadlock (e.g., for the loop device). So, if mode != MIGRATE_ASYNC, the
1757 * length of the from list must be <= 1.
1758 */
1759 static int migrate_pages_batch(struct list_head *from,
1760 new_folio_t get_new_folio, free_folio_t put_new_folio,
1761 unsigned long private, enum migrate_mode mode, int reason,
1762 struct list_head *ret_folios, struct list_head *split_folios,
1763 struct migrate_pages_stats *stats, int nr_pass)
1764 {
1765 int retry = 1;
1766 int thp_retry = 1;
1767 int nr_failed = 0;
1768 int nr_retry_pages = 0;
1769 int pass = 0;
1770 bool is_thp = false;
1771 bool is_large = false;
1772 struct folio *folio, *folio2, *dst = NULL;
1773 int rc, rc_saved = 0, nr_pages;
1774 LIST_HEAD(unmap_folios);
1775 LIST_HEAD(dst_folios);
1776 bool nosplit = (reason == MR_NUMA_MISPLACED);
1777
1778 VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
1779 !list_empty(from) && !list_is_singular(from));
1780
1781 for (pass = 0; pass < nr_pass && retry; pass++) {
1782 retry = 0;
1783 thp_retry = 0;
1784 nr_retry_pages = 0;
1785
1786 list_for_each_entry_safe(folio, folio2, from, lru) {
1787 is_large = folio_test_large(folio);
1788 is_thp = folio_test_pmd_mappable(folio);
1789 nr_pages = folio_nr_pages(folio);
1790
1791 cond_resched();
1792
1793 /*
1794 * The rare folio on the deferred split list should
1795 * be split now. It should not count as a failure:
1796 * but increment nr_failed because, without doing so,
1797 * migrate_pages() may report success with (split but
1798 * unmigrated) pages still on its fromlist; whereas it
1799 * always reports success when its fromlist is empty.
1800 * stats->nr_thp_failed should be increased too,
1801 * otherwise stats inconsistency will happen when
1802 * migrate_pages_batch is called via migrate_pages()
1803 * with MIGRATE_SYNC and MIGRATE_ASYNC.
1804 *
1805 * Only check it without removing it from the list,
1806 * since the folio can be on a deferred_split_scan()
1807 * local list and removing it there could corrupt that
1808 * local list. The folio split process below can handle it
1809 * with the help of folio_ref_freeze().
1810 *
1811 * nr_pages > 2 is needed to avoid checking order-1
1812 * page cache folios. They exist, in contrast to
1813 * non-existent order-1 anonymous folios, and do not
1814 * use _deferred_list.
1815 */
1816 if (nr_pages > 2 &&
1817 !list_empty(&folio->_deferred_list) &&
1818 folio_test_partially_mapped(folio)) {
1819 if (!try_split_folio(folio, split_folios, mode)) {
1820 nr_failed++;
1821 stats->nr_thp_failed += is_thp;
1822 stats->nr_thp_split += is_thp;
1823 stats->nr_split++;
1824 continue;
1825 }
1826 }
1827
1828 /*
1829 * Large folio migration might be unsupported or
1830 * the allocation might fail, so we should retry
1831 * the same folio with the large folio split
1832 * into normal folios.
1833 *
1834 * Split folios are put in split_folios, and
1835 * we will migrate them after the rest of the
1836 * list is processed.
1837 */
1838 if (!thp_migration_supported() && is_thp) {
1839 nr_failed++;
1840 stats->nr_thp_failed++;
1841 if (!try_split_folio(folio, split_folios, mode)) {
1842 stats->nr_thp_split++;
1843 stats->nr_split++;
1844 continue;
1845 }
1846 stats->nr_failed_pages += nr_pages;
1847 list_move_tail(&folio->lru, ret_folios);
1848 continue;
1849 }
1850
1851 rc = migrate_folio_unmap(get_new_folio, put_new_folio,
1852 private, folio, &dst, mode, reason,
1853 ret_folios);
1854 /*
1855 * The rules are:
1856 * Success: folio will be freed
1857 * Unmap: folio will be put on unmap_folios list,
1858 * dst folio put on dst_folios list
1859 * -EAGAIN: stay on the from list
1860 * -ENOMEM: stay on the from list
1861 * Other errno: put on ret_folios list
1862 */
1863 switch(rc) {
1864 case -ENOMEM:
1865 /*
1866 * When memory is low, don't bother trying to migrate
1867 * other folios; move the already-unmapped folios, then exit.
1868 */
1869 nr_failed++;
1870 stats->nr_thp_failed += is_thp;
1871 /* Large folio NUMA faulting doesn't split to retry. */
1872 if (is_large && !nosplit) {
1873 int ret = try_split_folio(folio, split_folios, mode);
1874
1875 if (!ret) {
1876 stats->nr_thp_split += is_thp;
1877 stats->nr_split++;
1878 break;
1879 } else if (reason == MR_LONGTERM_PIN &&
1880 ret == -EAGAIN) {
1881 /*
1882 * Try again to split large folio to
1883 * mitigate the failure of longterm pinning.
1884 */
1885 retry++;
1886 thp_retry += is_thp;
1887 nr_retry_pages += nr_pages;
1888 /* Undo duplicated failure counting. */
1889 nr_failed--;
1890 stats->nr_thp_failed -= is_thp;
1891 break;
1892 }
1893 }
1894
1895 stats->nr_failed_pages += nr_pages + nr_retry_pages;
1896 /* nr_failed isn't updated here as it is unused: -ENOMEM is returned instead */
1897 stats->nr_thp_failed += thp_retry;
1898 rc_saved = rc;
1899 if (list_empty(&unmap_folios))
1900 goto out;
1901 else
1902 goto move;
1903 case -EAGAIN:
1904 retry++;
1905 thp_retry += is_thp;
1906 nr_retry_pages += nr_pages;
1907 break;
1908 case MIGRATEPAGE_SUCCESS:
1909 stats->nr_succeeded += nr_pages;
1910 stats->nr_thp_succeeded += is_thp;
1911 break;
1912 case MIGRATEPAGE_UNMAP:
1913 list_move_tail(&folio->lru, &unmap_folios);
1914 list_add_tail(&dst->lru, &dst_folios);
1915 break;
1916 default:
1917 /*
1918 * Permanent failure (-EBUSY, etc.):
1919 * unlike -EAGAIN case, the failed folio is
1920 * removed from migration folio list and not
1921 * retried in the next outer loop.
1922 */
1923 nr_failed++;
1924 stats->nr_thp_failed += is_thp;
1925 stats->nr_failed_pages += nr_pages;
1926 break;
1927 }
1928 }
1929 }
1930 nr_failed += retry;
1931 stats->nr_thp_failed += thp_retry;
1932 stats->nr_failed_pages += nr_retry_pages;
1933 move:
1934 /* Flush TLBs for all unmapped folios */
1935 try_to_unmap_flush();
1936
1937 retry = 1;
1938 for (pass = 0; pass < nr_pass && retry; pass++) {
1939 retry = 0;
1940 thp_retry = 0;
1941 nr_retry_pages = 0;
1942
1943 /* Move the unmapped folios */
1944 migrate_folios_move(&unmap_folios, &dst_folios,
1945 put_new_folio, private, mode, reason,
1946 ret_folios, stats, &retry, &thp_retry,
1947 &nr_failed, &nr_retry_pages);
1948 }
1949 nr_failed += retry;
1950 stats->nr_thp_failed += thp_retry;
1951 stats->nr_failed_pages += nr_retry_pages;
1952
1953 rc = rc_saved ? : nr_failed;
1954 out:
1955 /* Cleanup remaining folios */
1956 migrate_folios_undo(&unmap_folios, &dst_folios,
1957 put_new_folio, private, ret_folios);
1958
1959 return rc;
1960 }
1961
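/*
 * First try to migrate the whole list in one MIGRATE_ASYNC batch. Any
 * folios that fail are then retried one by one in the caller-requested
 * (synchronous) mode; from the async pass only the splits are counted as
 * failures, since the failed folios themselves get retried.
 */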
1962 static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
1963 free_folio_t put_new_folio, unsigned long private,
1964 enum migrate_mode mode, int reason,
1965 struct list_head *ret_folios, struct list_head *split_folios,
1966 struct migrate_pages_stats *stats)
1967 {
1968 int rc, nr_failed = 0;
1969 LIST_HEAD(folios);
1970 struct migrate_pages_stats astats;
1971
1972 memset(&astats, 0, sizeof(astats));
1973 /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
1974 rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
1975 reason, &folios, split_folios, &astats,
1976 NR_MAX_MIGRATE_ASYNC_RETRY);
1977 stats->nr_succeeded += astats.nr_succeeded;
1978 stats->nr_thp_succeeded += astats.nr_thp_succeeded;
1979 stats->nr_thp_split += astats.nr_thp_split;
1980 stats->nr_split += astats.nr_split;
1981 if (rc < 0) {
1982 stats->nr_failed_pages += astats.nr_failed_pages;
1983 stats->nr_thp_failed += astats.nr_thp_failed;
1984 list_splice_tail(&folios, ret_folios);
1985 return rc;
1986 }
1987 stats->nr_thp_failed += astats.nr_thp_split;
1988 /*
1989 * Do not count rc, as pages will be retried below.
1990 * Count nr_split only, since it includes nr_thp_split.
1991 */
1992 nr_failed += astats.nr_split;
1993 /*
1994 * Fall back to migrate all failed folios one by one synchronously. All
1995 * failed folios except split THPs will be retried, so their failure
1996 * isn't counted.
1997 */
1998 list_splice_tail_init(&folios, from);
1999 while (!list_empty(from)) {
2000 list_move(from->next, &folios);
2001 rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
2002 private, mode, reason, ret_folios,
2003 split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
2004 list_splice_tail_init(&folios, ret_folios);
2005 if (rc < 0)
2006 return rc;
2007 nr_failed += rc;
2008 }
2009
2010 return nr_failed;
2011 }
2012
2013 /*
2014 * migrate_pages - migrate the folios specified in a list, to the free folios
2015 * supplied as the target for the page migration
2016 *
2017 * @from: The list of folios to be migrated.
2018 * @get_new_folio: The function used to allocate free folios to be used
2019 * as the target of the folio migration.
2020 * @put_new_folio: The function used to free target folios if migration
2021 * fails, or NULL if no special handling is necessary.
2022 * @private: Private data to be passed on to get_new_folio()
2023 * @mode: The migration mode that specifies the constraints for
2024 * folio migration, if any.
2025 * @reason: The reason for folio migration.
2026 * @ret_succeeded: Set to the number of folios migrated successfully if
2027 * the caller passes a non-NULL pointer.
2028 *
2029 * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
2030 * are movable any more because the list has become empty or no retryable folios
2031 * exist any more. It is the caller's responsibility to call putback_movable_pages()
2032 * only if ret != 0.
2033 *
2034 * Returns the number of {normal, large, hugetlb} folios that were not
2035 * migrated, or an error code. The number of large folio splits is
2036 * counted as the number of non-migrated large folios, no matter how many
2037 * split folios of the large folio are migrated successfully.
2038 */
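/*
 * A minimal usage sketch (mirroring do_move_pages_to_node() below), under
 * the assumption that the folios on @pagelist are already isolated and
 * target_nid is the destination node:
 *
 *	struct migration_target_control mtc = {
 *		.nid = target_nid,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
 *		.reason = MR_SYSCALL,
 *	};
 *	int err = migrate_pages(&pagelist, alloc_migration_target, NULL,
 *			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
 *	if (err)
 *		putback_movable_pages(&pagelist);
 */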
2039 int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
2040 free_folio_t put_new_folio, unsigned long private,
2041 enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
2042 {
2043 int rc, rc_gather;
2044 int nr_pages;
2045 struct folio *folio, *folio2;
2046 LIST_HEAD(folios);
2047 LIST_HEAD(ret_folios);
2048 LIST_HEAD(split_folios);
2049 struct migrate_pages_stats stats;
2050
2051 trace_mm_migrate_pages_start(mode, reason);
2052
2053 memset(&stats, 0, sizeof(stats));
2054
2055 rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
2056 mode, reason, &stats, &ret_folios);
2057 if (rc_gather < 0)
2058 goto out;
2059
2060 again:
2061 nr_pages = 0;
2062 list_for_each_entry_safe(folio, folio2, from, lru) {
2063 /* Retried hugetlb folios will be kept in the list */
2064 if (folio_test_hugetlb(folio)) {
2065 list_move_tail(&folio->lru, &ret_folios);
2066 continue;
2067 }
2068
2069 nr_pages += folio_nr_pages(folio);
2070 if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
2071 break;
2072 }
2073 if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
2074 list_cut_before(&folios, from, &folio2->lru);
2075 else
2076 list_splice_init(from, &folios);
2077 if (mode == MIGRATE_ASYNC)
2078 rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
2079 private, mode, reason, &ret_folios,
2080 &split_folios, &stats,
2081 NR_MAX_MIGRATE_PAGES_RETRY);
2082 else
2083 rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
2084 private, mode, reason, &ret_folios,
2085 &split_folios, &stats);
2086 list_splice_tail_init(&folios, &ret_folios);
2087 if (rc < 0) {
2088 rc_gather = rc;
2089 list_splice_tail(&split_folios, &ret_folios);
2090 goto out;
2091 }
2092 if (!list_empty(&split_folios)) {
2093 /*
2094 * Failure isn't counted since all split folios of a large folio
2095 * are already counted as 1 failure. And we only try to migrate
2096 * with minimal effort: force MIGRATE_ASYNC mode and retry once.
2097 */
2098 migrate_pages_batch(&split_folios, get_new_folio,
2099 put_new_folio, private, MIGRATE_ASYNC, reason,
2100 &ret_folios, NULL, &stats, 1);
2101 list_splice_tail_init(&split_folios, &ret_folios);
2102 }
2103 rc_gather += rc;
2104 if (!list_empty(from))
2105 goto again;
2106 out:
2107 /*
2108 * Put the permanently failed folios back on the migration list; they
2109 * will be put back on the right list by the caller.
2110 */
2111 list_splice(&ret_folios, from);
2112
2113 /*
2114 * Return 0 in case all split folios of fail-to-migrate large folios
2115 * are migrated successfully.
2116 */
2117 if (list_empty(from))
2118 rc_gather = 0;
2119
2120 count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
2121 count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
2122 count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
2123 count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
2124 count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
2125 trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
2126 stats.nr_thp_succeeded, stats.nr_thp_failed,
2127 stats.nr_thp_split, stats.nr_split, mode,
2128 reason);
2129
2130 if (ret_succeeded)
2131 *ret_succeeded = stats.nr_succeeded;
2132
2133 return rc_gather;
2134 }
2135
2136 struct folio *alloc_migration_target(struct folio *src, unsigned long private)
2137 {
2138 struct migration_target_control *mtc;
2139 gfp_t gfp_mask;
2140 unsigned int order = 0;
2141 int nid;
2142 int zidx;
2143
2144 mtc = (struct migration_target_control *)private;
2145 gfp_mask = mtc->gfp_mask;
2146 nid = mtc->nid;
2147 if (nid == NUMA_NO_NODE)
2148 nid = folio_nid(src);
2149
2150 if (folio_test_hugetlb(src)) {
2151 struct hstate *h = folio_hstate(src);
2152
2153 gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
2154 return alloc_hugetlb_folio_nodemask(h, nid,
2155 mtc->nmask, gfp_mask,
2156 htlb_allow_alloc_fallback(mtc->reason));
2157 }
2158
2159 if (folio_test_large(src)) {
2160 /*
2161 * clear __GFP_RECLAIM to make the migration callback
2162 * consistent with regular THP allocations.
2163 */
2164 gfp_mask &= ~__GFP_RECLAIM;
2165 gfp_mask |= GFP_TRANSHUGE;
2166 order = folio_order(src);
2167 }
2168 zidx = zone_idx(folio_zone(src));
2169 if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
2170 gfp_mask |= __GFP_HIGHMEM;
2171
2172 return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
2173 }
2174
2175 #ifdef CONFIG_NUMA
2176
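/*
 * Store @value into @nr consecutive entries of the user @status array,
 * starting at index @start. Returns -EFAULT if the user memory cannot be
 * written.
 */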
2177 static int store_status(int __user *status, int start, int value, int nr)
2178 {
2179 while (nr-- > 0) {
2180 if (put_user(value, status + start))
2181 return -EFAULT;
2182 start++;
2183 }
2184
2185 return 0;
2186 }
2187
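/*
 * Migrate all isolated folios on @pagelist to @node using MIGRATE_SYNC;
 * __GFP_THISNODE ensures target folios are allocated on that node only.
 * Folios that could not be migrated are put back.
 */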
2188 static int do_move_pages_to_node(struct list_head *pagelist, int node)
2189 {
2190 int err;
2191 struct migration_target_control mtc = {
2192 .nid = node,
2193 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
2194 .reason = MR_SYSCALL,
2195 };
2196
2197 err = migrate_pages(pagelist, alloc_migration_target, NULL,
2198 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
2199 if (err)
2200 putback_movable_pages(pagelist);
2201 return err;
2202 }
2203
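/*
 * Check whether @folio may be moved to @node and, if so, isolate it and
 * add it to @pagelist. Returns 1 when queued, 0 when the folio already
 * resides on @node, and a negative errno otherwise (same convention as
 * add_folio_for_migration()).
 */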
2204 static int __add_folio_for_migration(struct folio *folio, int node,
2205 struct list_head *pagelist, bool migrate_all)
2206 {
2207 if (is_zero_folio(folio) || is_huge_zero_folio(folio))
2208 return -EFAULT;
2209
2210 if (folio_is_zone_device(folio))
2211 return -ENOENT;
2212
2213 if (folio_nid(folio) == node)
2214 return 0;
2215
2216 if (folio_maybe_mapped_shared(folio) && !migrate_all)
2217 return -EACCES;
2218
2219 if (folio_test_hugetlb(folio)) {
2220 if (folio_isolate_hugetlb(folio, pagelist))
2221 return 1;
2222 } else if (folio_isolate_lru(folio)) {
2223 list_add_tail(&folio->lru, pagelist);
2224 node_stat_mod_folio(folio,
2225 NR_ISOLATED_ANON + folio_is_file_lru(folio),
2226 folio_nr_pages(folio));
2227 return 1;
2228 }
2229 return -EBUSY;
2230 }
2231
2232 /*
2233 * Resolves the given address to a struct folio, isolates it from the LRU and
2234 * puts it on the given pagelist.
2235 * Returns:
2236 * errno - if the folio cannot be found/isolated
2237 * 0 - when it doesn't have to be migrated because it is already on the
2238 * target node
2239 * 1 - when it has been queued
2240 */
2241 static int add_folio_for_migration(struct mm_struct *mm, const void __user *p,
2242 int node, struct list_head *pagelist, bool migrate_all)
2243 {
2244 struct vm_area_struct *vma;
2245 struct folio_walk fw;
2246 struct folio *folio;
2247 unsigned long addr;
2248 int err = -EFAULT;
2249
2250 mmap_read_lock(mm);
2251 addr = (unsigned long)untagged_addr_remote(mm, p);
2252
2253 vma = vma_lookup(mm, addr);
2254 if (vma && vma_migratable(vma)) {
2255 folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
2256 if (folio) {
2257 err = __add_folio_for_migration(folio, node, pagelist,
2258 migrate_all);
2259 folio_walk_end(&fw, vma);
2260 } else {
2261 err = -ENOENT;
2262 }
2263 }
2264 mmap_read_unlock(mm);
2265 return err;
2266 }
2267
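/*
 * Flush the current @pagelist to @node and, on success, record @node in the
 * user @status array for the entries [@start, @i). A positive return value
 * is the number of pages that could not be migrated, including the
 * nr_pages - i entries that were never attempted.
 */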
2268 static int move_pages_and_store_status(int node,
2269 struct list_head *pagelist, int __user *status,
2270 int start, int i, unsigned long nr_pages)
2271 {
2272 int err;
2273
2274 if (list_empty(pagelist))
2275 return 0;
2276
2277 err = do_move_pages_to_node(pagelist, node);
2278 if (err) {
2279 /*
2280 * A positive err means the number of pages
2281 * that failed to migrate. Since we are going to
2282 * abort and return the number of non-migrated
2283 * pages, we need to include the rest of the
2284 * nr_pages that have not been attempted as
2285 * well.
2286 */
2287 if (err > 0)
2288 err += nr_pages - i;
2289 return err;
2290 }
2291 return store_status(status, start, node, i - start);
2292 }
2293
2294 /*
2295 * Migrate an array of page addresses onto an array of nodes and fill
2296 * in the corresponding status array.
2297 */
2298 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
2299 unsigned long nr_pages,
2300 const void __user * __user *pages,
2301 const int __user *nodes,
2302 int __user *status, int flags)
2303 {
2304 compat_uptr_t __user *compat_pages = (void __user *)pages;
2305 int current_node = NUMA_NO_NODE;
2306 LIST_HEAD(pagelist);
2307 int start, i;
2308 int err = 0, err1;
2309
2310 lru_cache_disable();
2311
2312 for (i = start = 0; i < nr_pages; i++) {
2313 const void __user *p;
2314 int node;
2315
2316 err = -EFAULT;
2317 if (in_compat_syscall()) {
2318 compat_uptr_t cp;
2319
2320 if (get_user(cp, compat_pages + i))
2321 goto out_flush;
2322
2323 p = compat_ptr(cp);
2324 } else {
2325 if (get_user(p, pages + i))
2326 goto out_flush;
2327 }
2328 if (get_user(node, nodes + i))
2329 goto out_flush;
2330
2331 err = -ENODEV;
2332 if (node < 0 || node >= MAX_NUMNODES)
2333 goto out_flush;
2334 if (!node_state(node, N_MEMORY))
2335 goto out_flush;
2336
2337 err = -EACCES;
2338 if (!node_isset(node, task_nodes))
2339 goto out_flush;
2340
2341 if (current_node == NUMA_NO_NODE) {
2342 current_node = node;
2343 start = i;
2344 } else if (node != current_node) {
2345 err = move_pages_and_store_status(current_node,
2346 &pagelist, status, start, i, nr_pages);
2347 if (err)
2348 goto out;
2349 start = i;
2350 current_node = node;
2351 }
2352
2353 /*
2354 * Errors in the page lookup or isolation are not fatal and we simply
2355 * report them via status.
2356 */
2357 err = add_folio_for_migration(mm, p, current_node, &pagelist,
2358 flags & MPOL_MF_MOVE_ALL);
2359
2360 if (err > 0) {
2361 /* The page is successfully queued for migration */
2362 continue;
2363 }
2364
2365 /*
2366 * If the page is already on the target node (!err), store the
2367 * node, otherwise, store the err.
2368 */
2369 err = store_status(status, i, err ? : current_node, 1);
2370 if (err)
2371 goto out_flush;
2372
2373 err = move_pages_and_store_status(current_node, &pagelist,
2374 status, start, i, nr_pages);
2375 if (err) {
2376 /* We have accounted for page i */
2377 if (err > 0)
2378 err--;
2379 goto out;
2380 }
2381 current_node = NUMA_NO_NODE;
2382 }
2383 out_flush:
2384 /* Make sure we do not overwrite the existing error */
2385 err1 = move_pages_and_store_status(current_node, &pagelist,
2386 status, start, i, nr_pages);
2387 if (err >= 0)
2388 err = err1;
2389 out:
2390 lru_cache_enable();
2391 return err;
2392 }
2393
2394 /*
2395 * Determine the nodes of an array of pages and store them in an array of status.
2396 */
2397 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
2398 const void __user **pages, int *status)
2399 {
2400 unsigned long i;
2401
2402 mmap_read_lock(mm);
2403
2404 for (i = 0; i < nr_pages; i++) {
2405 unsigned long addr = (unsigned long)(*pages);
2406 struct vm_area_struct *vma;
2407 struct folio_walk fw;
2408 struct folio *folio;
2409 int err = -EFAULT;
2410
2411 vma = vma_lookup(mm, addr);
2412 if (!vma)
2413 goto set_status;
2414
2415 folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
2416 if (folio) {
2417 if (is_zero_folio(folio) || is_huge_zero_folio(folio))
2418 err = -EFAULT;
2419 else if (folio_is_zone_device(folio))
2420 err = -ENOENT;
2421 else
2422 err = folio_nid(folio);
2423 folio_walk_end(&fw, vma);
2424 } else {
2425 err = -ENOENT;
2426 }
2427 set_status:
2428 *status = err;
2429
2430 pages++;
2431 status++;
2432 }
2433
2434 mmap_read_unlock(mm);
2435 }
2436
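/*
 * For compat tasks the "pages" array holds 32-bit pointers: read @chunk_nr
 * of them starting at @chunk_offset and widen them into @chunk_pages.
 */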
2437 static int get_compat_pages_array(const void __user *chunk_pages[],
2438 const void __user * __user *pages,
2439 unsigned long chunk_offset,
2440 unsigned long chunk_nr)
2441 {
2442 compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
2443 compat_uptr_t p;
2444 int i;
2445
2446 for (i = 0; i < chunk_nr; i++) {
2447 if (get_user(p, pages32 + chunk_offset + i))
2448 return -EFAULT;
2449 chunk_pages[i] = compat_ptr(p);
2450 }
2451
2452 return 0;
2453 }
2454
2455 /*
2456 * Determine the nodes of a user array of pages and store them in
2457 * a user status array.
2458 */
2459 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
2460 const void __user * __user *pages,
2461 int __user *status)
2462 {
2463 #define DO_PAGES_STAT_CHUNK_NR 16UL
2464 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
2465 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
2466 unsigned long chunk_offset = 0;
2467
2468 while (nr_pages) {
2469 unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
2470
2471 if (in_compat_syscall()) {
2472 if (get_compat_pages_array(chunk_pages, pages,
2473 chunk_offset, chunk_nr))
2474 break;
2475 } else {
2476 if (copy_from_user(chunk_pages, pages + chunk_offset,
2477 chunk_nr * sizeof(*chunk_pages)))
2478 break;
2479 }
2480
2481 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
2482
2483 if (copy_to_user(status + chunk_offset, chunk_status,
2484 chunk_nr * sizeof(*status)))
2485 break;
2486
2487 chunk_offset += chunk_nr;
2488 nr_pages -= chunk_nr;
2489 }
2490 return nr_pages ? -EFAULT : 0;
2491 }
2492
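/*
 * Look up the mm of the task identified by @pid (the current task if @pid
 * is 0), after checking that the caller may modify it, and return the
 * task's cpuset-allowed nodes in @mem_nodes. Returns an ERR_PTR() on
 * failure.
 */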
2493 static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
2494 {
2495 struct task_struct *task;
2496 struct mm_struct *mm;
2497
2498 /*
2499 * There is no need to check if the current process has the right to modify
2500 * the specified process when they are the same.
2501 */
2502 if (!pid) {
2503 mmget(current->mm);
2504 *mem_nodes = cpuset_mems_allowed(current);
2505 return current->mm;
2506 }
2507
2508 task = find_get_task_by_vpid(pid);
2509 if (!task)
2510 return ERR_PTR(-ESRCH);
2512
2513 /*
2514 * Check if this process has the right to modify the specified
2515 * process. Use the regular "ptrace_may_access()" checks.
2516 */
2517 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
2518 mm = ERR_PTR(-EPERM);
2519 goto out;
2520 }
2521
2522 mm = ERR_PTR(security_task_movememory(task));
2523 if (IS_ERR(mm))
2524 goto out;
2525 *mem_nodes = cpuset_mems_allowed(task);
2526 mm = get_task_mm(task);
2527 out:
2528 put_task_struct(task);
2529 if (!mm)
2530 mm = ERR_PTR(-EINVAL);
2531 return mm;
2532 }
2533
2534 /*
2535 * Move a list of pages in the address space of the specified process.
2536 * process.
2537 */
2538 static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
2539 const void __user * __user *pages,
2540 const int __user *nodes,
2541 int __user *status, int flags)
2542 {
2543 struct mm_struct *mm;
2544 int err;
2545 nodemask_t task_nodes;
2546
2547 /* Check flags */
2548 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2549 return -EINVAL;
2550
2551 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2552 return -EPERM;
2553
2554 mm = find_mm_struct(pid, &task_nodes);
2555 if (IS_ERR(mm))
2556 return PTR_ERR(mm);
2557
2558 if (nodes)
2559 err = do_pages_move(mm, task_nodes, nr_pages, pages,
2560 nodes, status, flags);
2561 else
2562 err = do_pages_stat(mm, nr_pages, pages, status);
2563
2564 mmput(mm);
2565 return err;
2566 }
2567
2568 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
2569 const void __user * __user *, pages,
2570 const int __user *, nodes,
2571 int __user *, status, int, flags)
2572 {
2573 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
2574 }
2575
2576 #ifdef CONFIG_NUMA_BALANCING
2577 /*
2578 * Returns true if this is a safe migration target node for misplaced NUMA
2579 * pages. Currently it only checks the watermarks, which is crude.
2580 */
2581 static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
2582 unsigned long nr_migrate_pages)
2583 {
2584 int z;
2585
2586 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2587 struct zone *zone = pgdat->node_zones + z;
2588
2589 if (!managed_zone(zone))
2590 continue;
2591
2592 /* Avoid waking kswapd by allocating nr_migrate_pages pages. */
2593 if (!zone_watermark_ok(zone, 0,
2594 high_wmark_pages(zone) +
2595 nr_migrate_pages,
2596 ZONE_MOVABLE, ALLOC_CMA))
2597 continue;
2598 return true;
2599 }
2600 return false;
2601 }
2602
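/*
 * Allocate the target folio for NUMA hint fault migration. __GFP_THISNODE
 * restricts the allocation to the destination node, and reclaim/retries are
 * avoided (GFP_TRANSHUGE_LIGHT for large folios, no __GFP_RECLAIM and
 * __GFP_NORETRY for base pages) so that a hinting fault never stalls.
 */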
2603 static struct folio *alloc_misplaced_dst_folio(struct folio *src,
2604 unsigned long data)
2605 {
2606 int nid = (int) data;
2607 int order = folio_order(src);
2608 gfp_t gfp = __GFP_THISNODE;
2609
2610 if (order > 0)
2611 gfp |= GFP_TRANSHUGE_LIGHT;
2612 else {
2613 gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
2614 __GFP_NOWARN;
2615 gfp &= ~__GFP_RECLAIM;
2616 }
2617 return __folio_alloc_node(gfp, order, nid);
2618 }
2619
2620 /*
2621 * Prepare for calling migrate_misplaced_folio() by isolating the folio if
2622 * permitted. Must be called with the PTL still held.
2623 */
2624 int migrate_misplaced_folio_prepare(struct folio *folio,
2625 struct vm_area_struct *vma, int node)
2626 {
2627 int nr_pages = folio_nr_pages(folio);
2628 pg_data_t *pgdat = NODE_DATA(node);
2629
2630 if (folio_is_file_lru(folio)) {
2631 /*
2632 * Do not migrate file folios that are mapped in multiple
2633 * processes with execute permissions as they are probably
2634 * shared libraries.
2635 *
2636 * See folio_maybe_mapped_shared() on possible imprecision
2637 * when we cannot easily detect if a folio is shared.
2638 */
2639 if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio))
2640 return -EACCES;
2641
2642 /*
2643 * Do not migrate dirty folios, as not all filesystems can move
2644 * dirty folios in MIGRATE_ASYNC mode, which would be a waste of
2645 * cycles.
2646 */
2647 if (folio_test_dirty(folio))
2648 return -EAGAIN;
2649 }
2650
2651 /* Avoid migrating to a node that is nearly full */
2652 if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
2653 int z;
2654
2655 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
2656 return -EAGAIN;
2657 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2658 if (managed_zone(pgdat->node_zones + z))
2659 break;
2660 }
2661
2662 /*
2663 * If there are no managed zones, it should not proceed
2664 * further.
2665 */
2666 if (z < 0)
2667 return -EAGAIN;
2668
2669 wakeup_kswapd(pgdat->node_zones + z, 0,
2670 folio_order(folio), ZONE_MOVABLE);
2671 return -EAGAIN;
2672 }
2673
2674 if (!folio_isolate_lru(folio))
2675 return -EAGAIN;
2676
2677 node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio),
2678 nr_pages);
2679 return 0;
2680 }
2681
2682 /*
2683 * Attempt to migrate a misplaced folio to the specified destination
2684 * node. Caller is expected to have isolated the folio by calling
2685 * migrate_misplaced_folio_prepare(), which will result in an
2686 * elevated reference count on the folio. This function will un-isolate the
2687 * folio, dropping that reference before returning.
2688 */
2689 int migrate_misplaced_folio(struct folio *folio, int node)
2690 {
2691 pg_data_t *pgdat = NODE_DATA(node);
2692 int nr_remaining;
2693 unsigned int nr_succeeded;
2694 LIST_HEAD(migratepages);
2695 struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
2696 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
2697
2698 list_add(&folio->lru, &migratepages);
2699 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
2700 NULL, node, MIGRATE_ASYNC,
2701 MR_NUMA_MISPLACED, &nr_succeeded);
2702 if (nr_remaining && !list_empty(&migratepages))
2703 putback_movable_pages(&migratepages);
2704 if (nr_succeeded) {
2705 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
2706 count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
2707 if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
2708 && !node_is_toptier(folio_nid(folio))
2709 && node_is_toptier(node))
2710 mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded);
2711 }
2712 mem_cgroup_put(memcg);
2713 BUG_ON(!list_empty(&migratepages));
2714 return nr_remaining ? -EAGAIN : 0;
2715 }
2716 #endif /* CONFIG_NUMA_BALANCING */
2717 #endif /* CONFIG_NUMA */
2718