1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Device Memory Migration functionality. 4 * 5 * Originally written by Jérôme Glisse. 6 */ 7 #include <linux/export.h> 8 #include <linux/memremap.h> 9 #include <linux/migrate.h> 10 #include <linux/mm.h> 11 #include <linux/mm_inline.h> 12 #include <linux/mmu_notifier.h> 13 #include <linux/oom.h> 14 #include <linux/pagewalk.h> 15 #include <linux/rmap.h> 16 #include <linux/swapops.h> 17 #include <asm/tlbflush.h> 18 #include "internal.h" 19 20 static int migrate_vma_collect_skip(unsigned long start, 21 unsigned long end, 22 struct mm_walk *walk) 23 { 24 struct migrate_vma *migrate = walk->private; 25 unsigned long addr; 26 27 for (addr = start; addr < end; addr += PAGE_SIZE) { 28 migrate->dst[migrate->npages] = 0; 29 migrate->src[migrate->npages++] = 0; 30 } 31 32 return 0; 33 } 34 35 static int migrate_vma_collect_hole(unsigned long start, 36 unsigned long end, 37 __always_unused int depth, 38 struct mm_walk *walk) 39 { 40 struct migrate_vma *migrate = walk->private; 41 unsigned long addr; 42 43 /* Only allow populating anonymous memory. */ 44 if (!vma_is_anonymous(walk->vma)) 45 return migrate_vma_collect_skip(start, end, walk); 46 47 for (addr = start; addr < end; addr += PAGE_SIZE) { 48 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 49 migrate->dst[migrate->npages] = 0; 50 migrate->npages++; 51 migrate->cpages++; 52 } 53 54 return 0; 55 } 56 57 static int migrate_vma_collect_pmd(pmd_t *pmdp, 58 unsigned long start, 59 unsigned long end, 60 struct mm_walk *walk) 61 { 62 struct migrate_vma *migrate = walk->private; 63 struct folio *fault_folio = migrate->fault_page ? 
64 page_folio(migrate->fault_page) : NULL; 65 struct vm_area_struct *vma = walk->vma; 66 struct mm_struct *mm = vma->vm_mm; 67 unsigned long addr = start, unmapped = 0; 68 spinlock_t *ptl; 69 pte_t *ptep; 70 71 again: 72 if (pmd_none(*pmdp)) 73 return migrate_vma_collect_hole(start, end, -1, walk); 74 75 if (pmd_trans_huge(*pmdp)) { 76 struct folio *folio; 77 78 ptl = pmd_lock(mm, pmdp); 79 if (unlikely(!pmd_trans_huge(*pmdp))) { 80 spin_unlock(ptl); 81 goto again; 82 } 83 84 folio = pmd_folio(*pmdp); 85 if (is_huge_zero_folio(folio)) { 86 spin_unlock(ptl); 87 split_huge_pmd(vma, pmdp, addr); 88 } else { 89 int ret; 90 91 folio_get(folio); 92 spin_unlock(ptl); 93 /* FIXME: we don't expect THP for fault_folio */ 94 if (WARN_ON_ONCE(fault_folio == folio)) 95 return migrate_vma_collect_skip(start, end, 96 walk); 97 if (unlikely(!folio_trylock(folio))) 98 return migrate_vma_collect_skip(start, end, 99 walk); 100 ret = split_folio(folio); 101 if (fault_folio != folio) 102 folio_unlock(folio); 103 folio_put(folio); 104 if (ret) 105 return migrate_vma_collect_skip(start, end, 106 walk); 107 } 108 } 109 110 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 111 if (!ptep) 112 goto again; 113 arch_enter_lazy_mmu_mode(); 114 115 for (; addr < end; addr += PAGE_SIZE, ptep++) { 116 struct dev_pagemap *pgmap; 117 unsigned long mpfn = 0, pfn; 118 struct folio *folio; 119 struct page *page; 120 swp_entry_t entry; 121 pte_t pte; 122 123 pte = ptep_get(ptep); 124 125 if (pte_none(pte)) { 126 if (vma_is_anonymous(vma)) { 127 mpfn = MIGRATE_PFN_MIGRATE; 128 migrate->cpages++; 129 } 130 goto next; 131 } 132 133 if (!pte_present(pte)) { 134 /* 135 * Only care about unaddressable device page special 136 * page table entry. Other special swap entries are not 137 * migratable, and we ignore regular swapped page. 
138 */ 139 entry = pte_to_swp_entry(pte); 140 if (!is_device_private_entry(entry)) 141 goto next; 142 143 page = pfn_swap_entry_to_page(entry); 144 pgmap = page_pgmap(page); 145 if (!(migrate->flags & 146 MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || 147 pgmap->owner != migrate->pgmap_owner) 148 goto next; 149 150 mpfn = migrate_pfn(page_to_pfn(page)) | 151 MIGRATE_PFN_MIGRATE; 152 if (is_writable_device_private_entry(entry)) 153 mpfn |= MIGRATE_PFN_WRITE; 154 } else { 155 pfn = pte_pfn(pte); 156 if (is_zero_pfn(pfn) && 157 (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { 158 mpfn = MIGRATE_PFN_MIGRATE; 159 migrate->cpages++; 160 goto next; 161 } 162 page = vm_normal_page(migrate->vma, addr, pte); 163 if (page && !is_zone_device_page(page) && 164 !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { 165 goto next; 166 } else if (page && is_device_coherent_page(page)) { 167 pgmap = page_pgmap(page); 168 169 if (!(migrate->flags & 170 MIGRATE_VMA_SELECT_DEVICE_COHERENT) || 171 pgmap->owner != migrate->pgmap_owner) 172 goto next; 173 } 174 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 175 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 176 } 177 178 /* FIXME support THP */ 179 if (!page || !page->mapping || PageTransCompound(page)) { 180 mpfn = 0; 181 goto next; 182 } 183 184 /* 185 * By getting a reference on the folio we pin it and that blocks 186 * any kind of migration. Side effect is that it "freezes" the 187 * pte. 188 * 189 * We drop this reference after isolating the folio from the lru 190 * for non device folio (device folio are not on the lru and thus 191 * can't be dropped from it). 192 */ 193 folio = page_folio(page); 194 folio_get(folio); 195 196 /* 197 * We rely on folio_trylock() to avoid deadlock between 198 * concurrent migrations where each is waiting on the others 199 * folio lock. If we can't immediately lock the folio we fail this 200 * migration as it is only best effort anyway. 
201 * 202 * If we can lock the folio it's safe to set up a migration entry 203 * now. In the common case where the folio is mapped once in a 204 * single process setting up the migration entry now is an 205 * optimisation to avoid walking the rmap later with 206 * try_to_migrate(). 207 */ 208 if (fault_folio == folio || folio_trylock(folio)) { 209 bool anon_exclusive; 210 pte_t swp_pte; 211 212 flush_cache_page(vma, addr, pte_pfn(pte)); 213 anon_exclusive = folio_test_anon(folio) && 214 PageAnonExclusive(page); 215 if (anon_exclusive) { 216 pte = ptep_clear_flush(vma, addr, ptep); 217 218 if (folio_try_share_anon_rmap_pte(folio, page)) { 219 set_pte_at(mm, addr, ptep, pte); 220 if (fault_folio != folio) 221 folio_unlock(folio); 222 folio_put(folio); 223 mpfn = 0; 224 goto next; 225 } 226 } else { 227 pte = ptep_get_and_clear(mm, addr, ptep); 228 } 229 230 migrate->cpages++; 231 232 /* Set the dirty flag on the folio now the pte is gone. */ 233 if (pte_dirty(pte)) 234 folio_mark_dirty(folio); 235 236 /* Setup special migration page table entry */ 237 if (mpfn & MIGRATE_PFN_WRITE) 238 entry = make_writable_migration_entry( 239 page_to_pfn(page)); 240 else if (anon_exclusive) 241 entry = make_readable_exclusive_migration_entry( 242 page_to_pfn(page)); 243 else 244 entry = make_readable_migration_entry( 245 page_to_pfn(page)); 246 if (pte_present(pte)) { 247 if (pte_young(pte)) 248 entry = make_migration_entry_young(entry); 249 if (pte_dirty(pte)) 250 entry = make_migration_entry_dirty(entry); 251 } 252 swp_pte = swp_entry_to_pte(entry); 253 if (pte_present(pte)) { 254 if (pte_soft_dirty(pte)) 255 swp_pte = pte_swp_mksoft_dirty(swp_pte); 256 if (pte_uffd_wp(pte)) 257 swp_pte = pte_swp_mkuffd_wp(swp_pte); 258 } else { 259 if (pte_swp_soft_dirty(pte)) 260 swp_pte = pte_swp_mksoft_dirty(swp_pte); 261 if (pte_swp_uffd_wp(pte)) 262 swp_pte = pte_swp_mkuffd_wp(swp_pte); 263 } 264 set_pte_at(mm, addr, ptep, swp_pte); 265 266 /* 267 * This is like regular unmap: we remove the 
rmap and 268 * drop the folio refcount. The folio won't be freed, as 269 * we took a reference just above. 270 */ 271 folio_remove_rmap_pte(folio, page, vma); 272 folio_put(folio); 273 274 if (pte_present(pte)) 275 unmapped++; 276 } else { 277 folio_put(folio); 278 mpfn = 0; 279 } 280 281 next: 282 migrate->dst[migrate->npages] = 0; 283 migrate->src[migrate->npages++] = mpfn; 284 } 285 286 /* Only flush the TLB if we actually modified any entries */ 287 if (unmapped) 288 flush_tlb_range(walk->vma, start, end); 289 290 arch_leave_lazy_mmu_mode(); 291 pte_unmap_unlock(ptep - 1, ptl); 292 293 return 0; 294 } 295 296 static const struct mm_walk_ops migrate_vma_walk_ops = { 297 .pmd_entry = migrate_vma_collect_pmd, 298 .pte_hole = migrate_vma_collect_hole, 299 .walk_lock = PGWALK_RDLOCK, 300 }; 301 302 /* 303 * migrate_vma_collect() - collect pages over a range of virtual addresses 304 * @migrate: migrate struct containing all migration information 305 * 306 * This will walk the CPU page table. For each virtual address backed by a 307 * valid page, it updates the src array and takes a reference on the page, in 308 * order to pin the page until we lock it and unmap it. 309 */ 310 static void migrate_vma_collect(struct migrate_vma *migrate) 311 { 312 struct mmu_notifier_range range; 313 314 /* 315 * Note that the pgmap_owner is passed to the mmu notifier callback so 316 * that the registered device driver can skip invalidating device 317 * private page mappings that won't be migrated. 
318 */ 319 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, 320 migrate->vma->vm_mm, migrate->start, migrate->end, 321 migrate->pgmap_owner); 322 mmu_notifier_invalidate_range_start(&range); 323 324 walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, 325 &migrate_vma_walk_ops, migrate); 326 327 mmu_notifier_invalidate_range_end(&range); 328 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 329 } 330 331 /* 332 * migrate_vma_check_page() - check if page is pinned or not 333 * @page: struct page to check 334 * 335 * Pinned pages cannot be migrated. This is the same test as in 336 * folio_migrate_mapping(), except that here we allow migration of a 337 * ZONE_DEVICE page. 338 */ 339 static bool migrate_vma_check_page(struct page *page, struct page *fault_page) 340 { 341 struct folio *folio = page_folio(page); 342 343 /* 344 * One extra ref because caller holds an extra reference, either from 345 * folio_isolate_lru() for a regular folio, or migrate_vma_collect() for 346 * a device folio. 347 */ 348 int extra = 1 + (page == fault_page); 349 350 /* 351 * FIXME support THP (transparent huge page), it is bit more complex to 352 * check them than regular pages, because they can be mapped with a pmd 353 * or with a pte (split pte mapping). 354 */ 355 if (folio_test_large(folio)) 356 return false; 357 358 /* Page from ZONE_DEVICE have one extra reference */ 359 if (folio_is_zone_device(folio)) 360 extra++; 361 362 /* For file back page */ 363 if (folio_mapping(folio)) 364 extra += 1 + folio_has_private(folio); 365 366 if ((folio_ref_count(folio) - extra) > folio_mapcount(folio)) 367 return false; 368 369 return true; 370 } 371 372 /* 373 * Unmaps pages for migration. Returns number of source pfns marked as 374 * migrating. 375 */ 376 static unsigned long migrate_device_unmap(unsigned long *src_pfns, 377 unsigned long npages, 378 struct page *fault_page) 379 { 380 struct folio *fault_folio = fault_page ? 
381 page_folio(fault_page) : NULL; 382 unsigned long i, restore = 0; 383 bool allow_drain = true; 384 unsigned long unmapped = 0; 385 386 lru_add_drain(); 387 388 for (i = 0; i < npages; i++) { 389 struct page *page = migrate_pfn_to_page(src_pfns[i]); 390 struct folio *folio; 391 392 if (!page) { 393 if (src_pfns[i] & MIGRATE_PFN_MIGRATE) 394 unmapped++; 395 continue; 396 } 397 398 folio = page_folio(page); 399 /* ZONE_DEVICE folios are not on LRU */ 400 if (!folio_is_zone_device(folio)) { 401 if (!folio_test_lru(folio) && allow_drain) { 402 /* Drain CPU's lru cache */ 403 lru_add_drain_all(); 404 allow_drain = false; 405 } 406 407 if (!folio_isolate_lru(folio)) { 408 src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 409 restore++; 410 continue; 411 } 412 413 /* Drop the reference we took in collect */ 414 folio_put(folio); 415 } 416 417 if (folio_mapped(folio)) 418 try_to_migrate(folio, 0); 419 420 if (folio_mapped(folio) || 421 !migrate_vma_check_page(page, fault_page)) { 422 if (!folio_is_zone_device(folio)) { 423 folio_get(folio); 424 folio_putback_lru(folio); 425 } 426 427 src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 428 restore++; 429 continue; 430 } 431 432 unmapped++; 433 } 434 435 for (i = 0; i < npages && restore; i++) { 436 struct page *page = migrate_pfn_to_page(src_pfns[i]); 437 struct folio *folio; 438 439 if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE)) 440 continue; 441 442 folio = page_folio(page); 443 remove_migration_ptes(folio, folio, 0); 444 445 src_pfns[i] = 0; 446 if (fault_folio != folio) 447 folio_unlock(folio); 448 folio_put(folio); 449 restore--; 450 } 451 452 return unmapped; 453 } 454 455 /* 456 * migrate_vma_unmap() - replace page mapping with special migration pte entry 457 * @migrate: migrate struct containing all migration information 458 * 459 * Isolate pages from the LRU and replace mappings (CPU page table pte) with a 460 * special migration pte entry and check if it has been pinned. Pinned pages are 461 * restored because we cannot migrate them. 
462 * 463 * This is the last step before we call the device driver callback to allocate 464 * destination memory and copy contents of original page over to new page. 465 */ 466 static void migrate_vma_unmap(struct migrate_vma *migrate) 467 { 468 migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages, 469 migrate->fault_page); 470 } 471 472 /** 473 * migrate_vma_setup() - prepare to migrate a range of memory 474 * @args: contains the vma, start, and pfns arrays for the migration 475 * 476 * Returns: negative errno on failures, 0 when 0 or more pages were migrated 477 * without an error. 478 * 479 * Prepare to migrate a range of memory virtual address range by collecting all 480 * the pages backing each virtual address in the range, saving them inside the 481 * src array. Then lock those pages and unmap them. Once the pages are locked 482 * and unmapped, check whether each page is pinned or not. Pages that aren't 483 * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the 484 * corresponding src array entry. Then restores any pages that are pinned, by 485 * remapping and unlocking those pages. 486 * 487 * The caller should then allocate destination memory and copy source memory to 488 * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE 489 * flag set). Once these are allocated and copied, the caller must update each 490 * corresponding entry in the dst array with the pfn value of the destination 491 * page and with MIGRATE_PFN_VALID. Destination pages must be locked via 492 * lock_page(). 493 * 494 * Note that the caller does not have to migrate all the pages that are marked 495 * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from 496 * device memory to system memory. If the caller cannot migrate a device page 497 * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe 498 * consequences for the userspace process, so it must be avoided if at all 499 * possible. 
500 * 501 * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we 502 * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus 503 * allowing the caller to allocate device memory for those unbacked virtual 504 * addresses. For this the caller simply has to allocate device memory and 505 * properly set the destination entry like for regular migration. Note that 506 * this can still fail, and thus inside the device driver you must check if the 507 * migration was successful for those entries after calling migrate_vma_pages(), 508 * just like for regular migration. 509 * 510 * After that, the callers must call migrate_vma_pages() to go over each entry 511 * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag 512 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, 513 * then migrate_vma_pages() to migrate struct page information from the source 514 * struct page to the destination struct page. If it fails to migrate the 515 * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the 516 * src array. 517 * 518 * At this point all successfully migrated pages have an entry in the src 519 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst 520 * array entry with MIGRATE_PFN_VALID flag set. 521 * 522 * Once migrate_vma_pages() returns the caller may inspect which pages were 523 * successfully migrated, and which were not. Successfully migrated pages will 524 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. 525 * 526 * It is safe to update device page table after migrate_vma_pages() because 527 * both destination and source page are still locked, and the mmap_lock is held 528 * in read mode (hence no one can unmap the range being migrated). 
529 * 530 * Once the caller is done cleaning up things and updating its page table (if it 531 * chose to do so, this is not an obligation) it finally calls 532 * migrate_vma_finalize() to update the CPU page table to point to new pages 533 * for successfully migrated pages or otherwise restore the CPU page table to 534 * point to the original source pages. 535 */ 536 int migrate_vma_setup(struct migrate_vma *args) 537 { 538 long nr_pages = (args->end - args->start) >> PAGE_SHIFT; 539 540 args->start &= PAGE_MASK; 541 args->end &= PAGE_MASK; 542 if (!args->vma || is_vm_hugetlb_page(args->vma) || 543 (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) 544 return -EINVAL; 545 if (nr_pages <= 0) 546 return -EINVAL; 547 if (args->start < args->vma->vm_start || 548 args->start >= args->vma->vm_end) 549 return -EINVAL; 550 if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) 551 return -EINVAL; 552 if (!args->src || !args->dst) 553 return -EINVAL; 554 if (args->fault_page && !is_device_private_page(args->fault_page)) 555 return -EINVAL; 556 if (args->fault_page && !PageLocked(args->fault_page)) 557 return -EINVAL; 558 559 memset(args->src, 0, sizeof(*args->src) * nr_pages); 560 args->cpages = 0; 561 args->npages = 0; 562 563 migrate_vma_collect(args); 564 565 if (args->cpages) 566 migrate_vma_unmap(args); 567 568 /* 569 * At this point pages are locked and unmapped, and thus they have 570 * stable content and can safely be copied to destination memory that 571 * is allocated by the drivers. 572 */ 573 return 0; 574 575 } 576 EXPORT_SYMBOL(migrate_vma_setup); 577 578 /* 579 * This code closely matches the code in: 580 * __handle_mm_fault() 581 * handle_pte_fault() 582 * do_anonymous_page() 583 * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE 584 * private or coherent page. 
585 */ 586 static void migrate_vma_insert_page(struct migrate_vma *migrate, 587 unsigned long addr, 588 struct page *page, 589 unsigned long *src) 590 { 591 struct folio *folio = page_folio(page); 592 struct vm_area_struct *vma = migrate->vma; 593 struct mm_struct *mm = vma->vm_mm; 594 bool flush = false; 595 spinlock_t *ptl; 596 pte_t entry; 597 pgd_t *pgdp; 598 p4d_t *p4dp; 599 pud_t *pudp; 600 pmd_t *pmdp; 601 pte_t *ptep; 602 pte_t orig_pte; 603 604 /* Only allow populating anonymous memory */ 605 if (!vma_is_anonymous(vma)) 606 goto abort; 607 608 pgdp = pgd_offset(mm, addr); 609 p4dp = p4d_alloc(mm, pgdp, addr); 610 if (!p4dp) 611 goto abort; 612 pudp = pud_alloc(mm, p4dp, addr); 613 if (!pudp) 614 goto abort; 615 pmdp = pmd_alloc(mm, pudp, addr); 616 if (!pmdp) 617 goto abort; 618 if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 619 goto abort; 620 if (pte_alloc(mm, pmdp)) 621 goto abort; 622 if (unlikely(anon_vma_prepare(vma))) 623 goto abort; 624 if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) 625 goto abort; 626 627 /* 628 * The memory barrier inside __folio_mark_uptodate makes sure that 629 * preceding stores to the folio contents become visible before 630 * the set_pte_at() write. 
631 */ 632 __folio_mark_uptodate(folio); 633 634 if (folio_is_device_private(folio)) { 635 swp_entry_t swp_entry; 636 637 if (vma->vm_flags & VM_WRITE) 638 swp_entry = make_writable_device_private_entry( 639 page_to_pfn(page)); 640 else 641 swp_entry = make_readable_device_private_entry( 642 page_to_pfn(page)); 643 entry = swp_entry_to_pte(swp_entry); 644 } else { 645 if (folio_is_zone_device(folio) && 646 !folio_is_device_coherent(folio)) { 647 pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); 648 goto abort; 649 } 650 entry = mk_pte(page, vma->vm_page_prot); 651 if (vma->vm_flags & VM_WRITE) 652 entry = pte_mkwrite(pte_mkdirty(entry), vma); 653 } 654 655 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 656 if (!ptep) 657 goto abort; 658 orig_pte = ptep_get(ptep); 659 660 if (check_stable_address_space(mm)) 661 goto unlock_abort; 662 663 if (pte_present(orig_pte)) { 664 unsigned long pfn = pte_pfn(orig_pte); 665 666 if (!is_zero_pfn(pfn)) 667 goto unlock_abort; 668 flush = true; 669 } else if (!pte_none(orig_pte)) 670 goto unlock_abort; 671 672 /* 673 * Check for userfaultfd but do not deliver the fault. Instead, 674 * just back off. 
675 */ 676 if (userfaultfd_missing(vma)) 677 goto unlock_abort; 678 679 inc_mm_counter(mm, MM_ANONPAGES); 680 folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); 681 if (!folio_is_zone_device(folio)) 682 folio_add_lru_vma(folio, vma); 683 folio_get(folio); 684 685 if (flush) { 686 flush_cache_page(vma, addr, pte_pfn(orig_pte)); 687 ptep_clear_flush(vma, addr, ptep); 688 } 689 set_pte_at(mm, addr, ptep, entry); 690 update_mmu_cache(vma, addr, ptep); 691 692 pte_unmap_unlock(ptep, ptl); 693 *src = MIGRATE_PFN_MIGRATE; 694 return; 695 696 unlock_abort: 697 pte_unmap_unlock(ptep, ptl); 698 abort: 699 *src &= ~MIGRATE_PFN_MIGRATE; 700 } 701 702 static void __migrate_device_pages(unsigned long *src_pfns, 703 unsigned long *dst_pfns, unsigned long npages, 704 struct migrate_vma *migrate) 705 { 706 struct mmu_notifier_range range; 707 unsigned long i; 708 bool notified = false; 709 710 for (i = 0; i < npages; i++) { 711 struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); 712 struct page *page = migrate_pfn_to_page(src_pfns[i]); 713 struct address_space *mapping; 714 struct folio *newfolio, *folio; 715 int r, extra_cnt = 0; 716 717 if (!newpage) { 718 src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 719 continue; 720 } 721 722 if (!page) { 723 unsigned long addr; 724 725 if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) 726 continue; 727 728 /* 729 * The only time there is no vma is when called from 730 * migrate_device_coherent_folio(). However this isn't 731 * called if the page could not be unmapped. 
732 */ 733 VM_BUG_ON(!migrate); 734 addr = migrate->start + i*PAGE_SIZE; 735 if (!notified) { 736 notified = true; 737 738 mmu_notifier_range_init_owner(&range, 739 MMU_NOTIFY_MIGRATE, 0, 740 migrate->vma->vm_mm, addr, migrate->end, 741 migrate->pgmap_owner); 742 mmu_notifier_invalidate_range_start(&range); 743 } 744 migrate_vma_insert_page(migrate, addr, newpage, 745 &src_pfns[i]); 746 continue; 747 } 748 749 newfolio = page_folio(newpage); 750 folio = page_folio(page); 751 mapping = folio_mapping(folio); 752 753 if (folio_is_device_private(newfolio) || 754 folio_is_device_coherent(newfolio)) { 755 if (mapping) { 756 /* 757 * For now only support anonymous memory migrating to 758 * device private or coherent memory. 759 * 760 * Try to get rid of swap cache if possible. 761 */ 762 if (!folio_test_anon(folio) || 763 !folio_free_swap(folio)) { 764 src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 765 continue; 766 } 767 } 768 } else if (folio_is_zone_device(newfolio)) { 769 /* 770 * Other types of ZONE_DEVICE page are not supported. 771 */ 772 src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 773 continue; 774 } 775 776 BUG_ON(folio_test_writeback(folio)); 777 778 if (migrate && migrate->fault_page == page) 779 extra_cnt = 1; 780 r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); 781 if (r != MIGRATEPAGE_SUCCESS) 782 src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 783 else 784 folio_migrate_flags(newfolio, folio); 785 } 786 787 if (notified) 788 mmu_notifier_invalidate_range_end(&range); 789 } 790 791 /** 792 * migrate_device_pages() - migrate meta-data from src page to dst page 793 * @src_pfns: src_pfns returned from migrate_device_range() 794 * @dst_pfns: array of pfns allocated by the driver to migrate memory to 795 * @npages: number of pages in the range 796 * 797 * Equivalent to migrate_vma_pages(). This is called to migrate struct page 798 * meta-data from source struct page to destination. 
799 */ 800 void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, 801 unsigned long npages) 802 { 803 __migrate_device_pages(src_pfns, dst_pfns, npages, NULL); 804 } 805 EXPORT_SYMBOL(migrate_device_pages); 806 807 /** 808 * migrate_vma_pages() - migrate meta-data from src page to dst page 809 * @migrate: migrate struct containing all migration information 810 * 811 * This migrates struct page meta-data from source struct page to destination 812 * struct page. This effectively finishes the migration from source page to the 813 * destination page. 814 */ 815 void migrate_vma_pages(struct migrate_vma *migrate) 816 { 817 __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); 818 } 819 EXPORT_SYMBOL(migrate_vma_pages); 820 821 static void __migrate_device_finalize(unsigned long *src_pfns, 822 unsigned long *dst_pfns, 823 unsigned long npages, 824 struct page *fault_page) 825 { 826 struct folio *fault_folio = fault_page ? 827 page_folio(fault_page) : NULL; 828 unsigned long i; 829 830 for (i = 0; i < npages; i++) { 831 struct folio *dst = NULL, *src = NULL; 832 struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); 833 struct page *page = migrate_pfn_to_page(src_pfns[i]); 834 835 if (newpage) 836 dst = page_folio(newpage); 837 838 if (!page) { 839 if (dst) { 840 WARN_ON_ONCE(fault_folio == dst); 841 folio_unlock(dst); 842 folio_put(dst); 843 } 844 continue; 845 } 846 847 src = page_folio(page); 848 849 if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) { 850 if (dst) { 851 WARN_ON_ONCE(fault_folio == dst); 852 folio_unlock(dst); 853 folio_put(dst); 854 } 855 dst = src; 856 } 857 858 if (!folio_is_zone_device(dst)) 859 folio_add_lru(dst); 860 remove_migration_ptes(src, dst, 0); 861 if (fault_folio != src) 862 folio_unlock(src); 863 folio_put(src); 864 865 if (dst != src) { 866 WARN_ON_ONCE(fault_folio == dst); 867 folio_unlock(dst); 868 folio_put(dst); 869 } 870 } 871 } 872 873 /* 874 * migrate_device_finalize() - complete 
page migration 875 * @src_pfns: src_pfns returned from migrate_device_range() 876 * @dst_pfns: array of pfns allocated by the driver to migrate memory to 877 * @npages: number of pages in the range 878 * 879 * Completes migration of the page by removing special migration entries. 880 * Drivers must ensure copying of page data is complete and visible to the CPU 881 * before calling this. 882 */ 883 void migrate_device_finalize(unsigned long *src_pfns, 884 unsigned long *dst_pfns, unsigned long npages) 885 { 886 return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL); 887 } 888 EXPORT_SYMBOL(migrate_device_finalize); 889 890 /** 891 * migrate_vma_finalize() - restore CPU page table entry 892 * @migrate: migrate struct containing all migration information 893 * 894 * This replaces the special migration pte entry with either a mapping to the 895 * new page if migration was successful for that page, or to the original page 896 * otherwise. 897 * 898 * This also unlocks the pages and puts them back on the lru, or drops the extra 899 * refcount, for device pages. 900 */ 901 void migrate_vma_finalize(struct migrate_vma *migrate) 902 { 903 __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages, 904 migrate->fault_page); 905 } 906 EXPORT_SYMBOL(migrate_vma_finalize); 907 908 static unsigned long migrate_device_pfn_lock(unsigned long pfn) 909 { 910 struct folio *folio; 911 912 folio = folio_get_nontail_page(pfn_to_page(pfn)); 913 if (!folio) 914 return 0; 915 916 if (!folio_trylock(folio)) { 917 folio_put(folio); 918 return 0; 919 } 920 921 return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 922 } 923 924 /** 925 * migrate_device_range() - migrate device private pfns to normal memory. 926 * @src_pfns: array large enough to hold migrating source device private pfns. 927 * @start: starting pfn in the range to migrate. 928 * @npages: number of pages to migrate. 
929 * 930 * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that 931 * instead of looking up pages based on virtual address mappings a range of 932 * device pfns that should be migrated to system memory is used instead. 933 * 934 * This is useful when a driver needs to free device memory but doesn't know the 935 * virtual mappings of every page that may be in device memory. For example this 936 * is often the case when a driver is being unloaded or unbound from a device. 937 * 938 * Like migrate_vma_setup() this function will take a reference and lock any 939 * migrating pages that aren't free before unmapping them. Drivers may then 940 * allocate destination pages and start copying data from the device to CPU 941 * memory before calling migrate_device_pages(). 942 */ 943 int migrate_device_range(unsigned long *src_pfns, unsigned long start, 944 unsigned long npages) 945 { 946 unsigned long i, pfn; 947 948 for (pfn = start, i = 0; i < npages; pfn++, i++) 949 src_pfns[i] = migrate_device_pfn_lock(pfn); 950 951 migrate_device_unmap(src_pfns, npages, NULL); 952 953 return 0; 954 } 955 EXPORT_SYMBOL(migrate_device_range); 956 957 /** 958 * migrate_device_pfns() - migrate device private pfns to normal memory. 959 * @src_pfns: pre-popluated array of source device private pfns to migrate. 960 * @npages: number of pages to migrate. 961 * 962 * Similar to migrate_device_range() but supports non-contiguous pre-popluated 963 * array of device pages to migrate. 964 */ 965 int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) 966 { 967 unsigned long i; 968 969 for (i = 0; i < npages; i++) 970 src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); 971 972 migrate_device_unmap(src_pfns, npages, NULL); 973 974 return 0; 975 } 976 EXPORT_SYMBOL(migrate_device_pfns); 977 978 /* 979 * Migrate a device coherent folio back to normal memory. 
The caller should have
 * a reference on folio which will be copied to the new folio if migration is
 * successful or dropped on failure.
 *
 * Returns 0 if the folio was migrated, -EBUSY otherwise.
 */
int migrate_device_coherent_folio(struct folio *folio)
{
	unsigned long src_pfn, dst_pfn = 0;
	struct folio *new_folio;

	WARN_ON_ONCE(folio_test_large(folio));

	folio_lock(folio);
	src_pfn = migrate_pfn(folio_pfn(folio)) | MIGRATE_PFN_MIGRATE;

	/*
	 * We don't have a VMA and don't need to walk the page tables to find
	 * the source folio. So call migrate_device_unmap() directly to unmap
	 * the folio as migrate_vma_setup() will fail if args.vma == NULL.
	 */
	migrate_device_unmap(&src_pfn, 1, NULL);
	if (!(src_pfn & MIGRATE_PFN_MIGRATE))
		return -EBUSY;

	/*
	 * If the allocation fails dst_pfn stays 0 and migrate_device_pages()
	 * clears MIGRATE_PFN_MIGRATE in src_pfn.
	 */
	new_folio = folio_alloc(GFP_USER | __GFP_NOWARN, 0);
	if (new_folio) {
		folio_lock(new_folio);
		dst_pfn = migrate_pfn(folio_pfn(new_folio));
	}

	migrate_device_pages(&src_pfn, &dst_pfn, 1);
	if (src_pfn & MIGRATE_PFN_MIGRATE)
		folio_copy(new_folio, folio);
	migrate_device_finalize(&src_pfn, &dst_pfn, 1);

	return (src_pfn & MIGRATE_PFN_MIGRATE) ? 0 : -EBUSY;
}