
1 // SPDX-License-Identifier: GPL-2.0-only
9 * demand-loading started 01.12.91 - seems it is high on the list of
10 * things wanted, and it should be easy to implement. - Linus
14 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
15 * pages started 02.12.91, seems to work. - Linus.
21 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
27 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
29 * 20.12.91 - Ok, making the swap-device changeable like the root.
33 * 05.04.94 - Multi-page memory management added for v1.1.
36 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
69 #include <linux/memory-tiers.h>
80 #include <trace/events/kmem.h>
89 #include "pgalloc-track.h"
94 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
102 * Return true if the original pte was a uffd-wp pte marker (so the pte was
103 * wr-protected).
107 if (!userfaultfd_wp(vmf->vma)) in vmf_orig_pte_uffd_wp()
109 if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) in vmf_orig_pte_uffd_wp()
112 return pte_marker_uffd_wp(vmf->orig_pte); in vmf_orig_pte_uffd_wp()
177 mm_dec_nr_ptes(tlb->mm); in free_pte_range()
205 if (end - 1 > ceiling - 1) in free_pmd_range()
211 mm_dec_nr_pmds(tlb->mm); in free_pmd_range()
239 if (end - 1 > ceiling - 1) in free_pud_range()
245 mm_dec_nr_puds(tlb->mm); in free_pud_range()
273 if (end - 1 > ceiling - 1) in free_p4d_range()
282 * This function frees user-level page tables of a process.
298 * Why all these "- 1"s? Because 0 represents both the bottom in free_pgd_range()
299 * of the address space and the top of it (using -1 for the in free_pgd_range()
303 * Comparisons need to use "end - 1" and "ceiling - 1" (though in free_pgd_range()
314 * bother to round floor or end up - the tests don't need that. in free_pgd_range()
328 if (end - 1 > ceiling - 1) in free_pgd_range()
329 end -= PMD_SIZE; in free_pgd_range()
330 if (addr > end - 1) in free_pgd_range()
337 pgd = pgd_offset(tlb->mm, addr); in free_pgd_range()
353 unsigned long addr = vma->vm_start; in free_pgtables()
360 next = mas_find(mas, ceiling - 1); in free_pgtables()
374 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, in free_pgtables()
375 floor, next ? next->vm_start : ceiling); in free_pgtables()
383 while (next && next->vm_start <= vma->vm_end + PMD_SIZE in free_pgtables()
386 next = mas_find(mas, ceiling - 1); in free_pgtables()
395 free_pgd_range(tlb, addr, vma->vm_end, in free_pgtables()
396 floor, next ? next->vm_start : ceiling); in free_pgtables()
416 * of a chain of data-dependent loads, meaning most CPUs (alpha in pmd_install()
418 * seen in-order. See the alpha page table accessors for the in pmd_install()
432 return -ENOMEM; in __pte_alloc()
444 return -ENOMEM; in __pte_alloc_kernel()
474 * is found. For example, we might have a PFN-mapped pte in
482 pgd_t *pgd = pgd_offset(vma->vm_mm, addr); in print_bad_pte()
511 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; in print_bad_pte()
515 current->comm, in print_bad_pte()
520 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); in print_bad_pte()
522 vma->vm_file, in print_bad_pte()
523 vma->vm_ops ? vma->vm_ops->fault : NULL, in print_bad_pte()
524 vma->vm_file ? vma->vm_file->f_op->mmap : NULL, in print_bad_pte()
525 mapping ? mapping->a_ops->read_folio : NULL); in print_bad_pte()
531 * vm_normal_page -- This function gets the "struct page" associated with a pte.
551 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
583 if (vma->vm_ops && vma->vm_ops->find_special_page) in vm_normal_page()
584 return vma->vm_ops->find_special_page(vma, addr); in vm_normal_page()
585 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) in vm_normal_page()
606 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { in vm_normal_page()
607 if (vma->vm_flags & VM_MIXEDMAP) { in vm_normal_page()
615 off = (addr - vma->vm_start) >> PAGE_SHIFT; in vm_normal_page()
616 if (pfn == vma->vm_pgoff + off) in vm_normal_page()
618 if (!is_cow_mapping(vma->vm_flags)) in vm_normal_page()
661 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { in vm_normal_page_pmd()
662 if (vma->vm_flags & VM_MIXEDMAP) { in vm_normal_page_pmd()
668 off = (addr - vma->vm_start) >> PAGE_SHIFT; in vm_normal_page_pmd()
669 if (pfn == vma->vm_pgoff + off) in vm_normal_page_pmd()
671 if (!is_cow_mapping(vma->vm_flags)) in vm_normal_page_pmd()
703 * restore_exclusive_pte - Restore a device-exclusive entry
711 * Restore a device-exclusive non-swap entry to an ordinary present pte.
717 * a device-exclusive entry can map it into the device to make forward
736 pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); in restore_exclusive_pte()
743 if ((vma->vm_flags & VM_WRITE) && in restore_exclusive_pte()
749 set_pte_at(vma->vm_mm, address, ptep, pte); in restore_exclusive_pte()
752 * No need to invalidate - it was non-present before. However in restore_exclusive_pte()
774 return -EBUSY; in try_restore_exclusive_pte()
788 unsigned long vm_flags = dst_vma->vm_flags; in copy_nonpresent_pte()
797 return -EIO; in copy_nonpresent_pte()
800 if (unlikely(list_empty(&dst_mm->mmlist))) { in copy_nonpresent_pte()
802 if (list_empty(&dst_mm->mmlist)) in copy_nonpresent_pte()
803 list_add(&dst_mm->mmlist, in copy_nonpresent_pte()
804 &src_mm->mmlist); in copy_nonpresent_pte()
853 * We do not preserve soft-dirty information, because so in copy_nonpresent_pte()
875 VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); in copy_nonpresent_pte()
877 return -EBUSY; in copy_nonpresent_pte()
878 return -ENOENT; in copy_nonpresent_pte()
898 * and re-use the pte the traditional way.
900 * And if we need a pre-allocated page but don't yet have
915 return -EAGAIN; in copy_present_page()
922 if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) in copy_present_page()
923 return -EHWPOISON; in copy_present_page()
932 pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); in copy_present_page()
935 /* Uffd-wp needs to be delivered to dest pte as well */ in copy_present_page()
937 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); in copy_present_page()
945 struct mm_struct *src_mm = src_vma->vm_mm; in __copy_present_ptes()
948 if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { in __copy_present_ptes()
954 if (src_vma->vm_flags & VM_SHARED) in __copy_present_ptes()
961 set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); in __copy_present_ptes()
965 * Copy one present PTE, trying to batch-process subsequent PTEs that map
968 * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
994 if (src_vma->vm_flags & VM_SHARED) in copy_present_ptes()
1006 return -EAGAIN; in copy_present_ptes()
1075 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_pte_range()
1076 struct mm_struct *src_mm = src_vma->vm_mm; in copy_pte_range()
1096 * protected by mmap_lock-less collapse skipping areas with anon_vma in copy_pte_range()
1102 ret = -ENOMEM; in copy_pte_range()
1108 * retract_page_tables() are using vma->anon_vma to be exclusive, so in copy_pte_range()
1128 * We are holding two locks at this point - either of them in copy_pte_range()
1147 if (ret == -EIO) { in copy_pte_range()
1150 } else if (ret == -EBUSY) { in copy_pte_range()
1163 WARN_ON_ONCE(ret != -ENOENT); in copy_pte_range()
1166 max_nr = (end - addr) / PAGE_SIZE; in copy_pte_range()
1170 * If we need a pre-allocated page for this pte, drop the in copy_pte_range()
1174 if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) in copy_pte_range()
1178 * pre-alloc page cannot be reused by next time so as in copy_pte_range()
1197 if (ret == -EIO) { in copy_pte_range()
1200 ret = -ENOMEM; in copy_pte_range()
1204 } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { in copy_pte_range()
1206 } else if (ret == -EAGAIN) { in copy_pte_range()
1209 return -ENOMEM; in copy_pte_range()
1230 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_pmd_range()
1231 struct mm_struct *src_mm = src_vma->vm_mm; in copy_pmd_range()
1237 return -ENOMEM; in copy_pmd_range()
1244 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); in copy_pmd_range()
1247 if (err == -ENOMEM) in copy_pmd_range()
1248 return -ENOMEM; in copy_pmd_range()
1257 return -ENOMEM; in copy_pmd_range()
1267 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_pud_range()
1268 struct mm_struct *src_mm = src_vma->vm_mm; in copy_pud_range()
1274 return -ENOMEM; in copy_pud_range()
1281 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); in copy_pud_range()
1284 if (err == -ENOMEM) in copy_pud_range()
1285 return -ENOMEM; in copy_pud_range()
1294 return -ENOMEM; in copy_pud_range()
1304 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_p4d_range()
1310 return -ENOMEM; in copy_p4d_range()
1318 return -ENOMEM; in copy_p4d_range()
1332 * Always copy pgtables when dst_vma has uffd-wp enabled even if it's in vma_needs_copy()
1333 * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable in vma_needs_copy()
1334 * contains uffd-wp protection information, that's something we can't in vma_needs_copy()
1340 if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) in vma_needs_copy()
1343 if (src_vma->anon_vma) in vma_needs_copy()
1359 unsigned long addr = src_vma->vm_start; in copy_page_range()
1360 unsigned long end = src_vma->vm_end; in copy_page_range()
1361 struct mm_struct *dst_mm = dst_vma->vm_mm; in copy_page_range()
1362 struct mm_struct *src_mm = src_vma->vm_mm; in copy_page_range()
1374 if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { in copy_page_range()
1386 is_cow = is_cow_mapping(src_vma->vm_flags); in copy_page_range()
1400 raw_write_seqcount_begin(&src_mm->write_protect_seq); in copy_page_range()
1412 ret = -ENOMEM; in copy_page_range()
1418 raw_write_seqcount_end(&src_mm->write_protect_seq); in copy_page_range()
1421 if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) in copy_page_range()
1430 if (!details || details->reclaim_pt) in should_zap_cows()
1434 return details->even_cows; in should_zap_cows()
1445 /* Otherwise we should only zap non-anon folios */ in should_zap_folio()
1454 return details->zap_flags & ZAP_FLAG_DROP_MARKER; in zap_drop_markers()
1458 * This function makes sure that we'll replace the none pte with an uffd-wp
1461 * Returns true if uffd-wp ptes were installed, false otherwise.
1482 if (--nr == 0) in zap_install_uffd_wp_if_needed()
1497 struct mm_struct *mm = tlb->mm; in zap_present_folio_ptes()
1501 ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); in zap_present_folio_ptes()
1511 rss[mm_counter(folio)] -= nr; in zap_present_folio_ptes()
1513 /* We don't need up-to-date accessed/dirty bits. */ in zap_present_folio_ptes()
1514 clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); in zap_present_folio_ptes()
1515 rss[MM_ANONPAGES] -= nr; in zap_present_folio_ptes()
1537 * Zap or skip at least one present PTE, trying to batch-process subsequent
1549 struct mm_struct *mm = tlb->mm; in zap_present_ptes()
1556 /* We don't need up-to-date accessed/dirty bits. */ in zap_present_ptes()
1557 ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); in zap_present_ptes()
1611 * consider uffd-wp bit when zap. For more information, in zap_nonpresent_ptes()
1615 rss[mm_counter(folio)]--; in zap_nonpresent_ptes()
1624 rss[MM_SWAPENTS] -= nr; in zap_nonpresent_ptes()
1631 rss[mm_counter(folio)]--; in zap_nonpresent_ptes()
1655 clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); in zap_nonpresent_ptes()
1669 int max_nr = (end - addr) / PAGE_SIZE; in do_zap_pte_range()
1679 max_nr -= nr; in do_zap_pte_range()
1703 struct mm_struct *mm = tlb->mm; in zap_pte_range()
1801 if (next - addr != HPAGE_PMD_SIZE) in zap_pmd_range()
1808 } else if (details && details->single_folio && in zap_pmd_range()
1809 folio_test_pmd_mappable(details->single_folio) && in zap_pmd_range()
1810 next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { in zap_pmd_range()
1811 spinlock_t *ptl = pmd_lock(tlb->mm, pmd); in zap_pmd_range()
1825 pmd--; in zap_pmd_range()
1843 if (next - addr != HPAGE_PUD_SIZE) { in zap_pud_range()
1844 mmap_assert_locked(tlb->mm); in zap_pud_range()
1889 pgd = pgd_offset(vma->vm_mm, addr); in unmap_page_range()
1905 unsigned long start = max(vma->vm_start, start_addr); in unmap_single_vma()
1908 if (start >= vma->vm_end) in unmap_single_vma()
1910 end = min(vma->vm_end, end_addr); in unmap_single_vma()
1911 if (end <= vma->vm_start) in unmap_single_vma()
1914 if (vma->vm_file) in unmap_single_vma()
1917 if (unlikely(vma->vm_flags & VM_PFNMAP)) in unmap_single_vma()
1923 * It is undesirable to test vma->vm_file as it in unmap_single_vma()
1924 * should be non-null for valid hugetlb area. in unmap_single_vma()
1927 * hugetlbfs ->mmap method fails, in unmap_single_vma()
1928 * mmap_region() nullifies vma->vm_file in unmap_single_vma()
1933 if (vma->vm_file) { in unmap_single_vma()
1935 details->zap_flags : 0; in unmap_single_vma()
1945 * unmap_vmas - unmap a range of memory covered by a list of vma's
1962 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1973 /* Careful - we need to zap private pages too! */ in unmap_vmas()
1977 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, in unmap_vmas()
1987 vma = mas_find(mas, tree_end - 1); in unmap_vmas()
1993 * zap_page_range_single - remove user pages in a given range
2008 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, in zap_page_range_single()
2011 tlb_gather_mmu(&tlb, vma->vm_mm); in zap_page_range_single()
2012 update_hiwater_rss(vma->vm_mm); in zap_page_range_single()
2015 * unmap 'address-end' not 'range.start-range.end' as range in zap_page_range_single()
2025 * zap_vma_ptes - remove ptes mapping the vma
2039 !(vma->vm_flags & VM_PFNMAP)) in zap_vma_ptes()
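/*
 * Illustrative usage sketch, not part of memory.c: a driver that earlier set
 * up a VM_PFNMAP mapping tears the PTEs down again when the backing resource
 * is revoked; later accesses fault and can then be refused in ->fault.
 * my_drv_revoke_mapping and my_size are assumed names for this example.
 */
#include <linux/mm.h>

static void my_drv_revoke_mapping(struct vm_area_struct *vma, unsigned long my_size)
{
	zap_vma_ptes(vma, vma->vm_start, my_size);
}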
2080 VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); in vm_mixed_zeropage_allowed()
2087 if (mm_forbids_zeropage(vma->vm_mm)) in vm_mixed_zeropage_allowed()
2090 if (is_cow_mapping(vma->vm_flags)) in vm_mixed_zeropage_allowed()
2093 if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) in vm_mixed_zeropage_allowed()
2096 * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could in vm_mixed_zeropage_allowed()
2097 * find the shared zeropage and longterm-pin it, which would in vm_mixed_zeropage_allowed()
2099 * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would in vm_mixed_zeropage_allowed()
2104 return vma->vm_ops && vma->vm_ops->pfn_mkwrite && in vm_mixed_zeropage_allowed()
2105 (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); in vm_mixed_zeropage_allowed()
2114 return -EINVAL; in validate_page_before_insert()
2117 return -EINVAL; in validate_page_before_insert()
2122 return -EINVAL; in validate_page_before_insert()
2136 return -EBUSY; in insert_page_into_pte_locked()
2141 return -EFAULT; in insert_page_into_pte_locked()
2161 inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); in insert_page_into_pte_locked()
2164 set_pte_at(vma->vm_mm, addr, pte, pteval); in insert_page_into_pte_locked()
2178 retval = -ENOMEM; in insert_page()
2179 pte = get_locked_pte(vma->vm_mm, addr, &ptl); in insert_page()
2209 struct mm_struct *const mm = vma->vm_mm; in insert_pages()
2215 ret = -EFAULT; in insert_pages()
2221 remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); in insert_pages()
2224 ret = -ENOMEM; in insert_pages()
2234 ret = -EFAULT; in insert_pages()
2243 remaining_pages_total -= pte_idx; in insert_pages()
2250 pages_to_write_in_pmd -= batch_size; in insert_pages()
2251 remaining_pages_total -= batch_size; in insert_pages()
2262 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
2279 const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; in vm_insert_pages()
2281 if (addr < vma->vm_start || end_addr >= vma->vm_end) in vm_insert_pages()
2282 return -EFAULT; in vm_insert_pages()
2283 if (!(vma->vm_flags & VM_MIXEDMAP)) { in vm_insert_pages()
2284 BUG_ON(mmap_read_trylock(vma->vm_mm)); in vm_insert_pages()
2285 BUG_ON(vma->vm_flags & VM_PFNMAP); in vm_insert_pages()
2289 return insert_pages(vma, addr, pages, num, vma->vm_page_prot); in vm_insert_pages()
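/*
 * Illustrative usage sketch, not part of memory.c: insert an array of driver
 * pages in one call so the page-table lock is taken per batch rather than
 * once per page. my_drv_mmap_array, my_pages and my_npages are assumed names.
 */
#include <linux/mm.h>

static int my_drv_mmap_array(struct vm_area_struct *vma, struct page **my_pages,
			     unsigned long my_npages)
{
	unsigned long num = my_npages;
	int err;

	err = vm_insert_pages(vma, vma->vm_start, my_pages, &num);
	/* On return, num is the count of pages that could not be mapped. */
	return err;
}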
2294 * vm_insert_page - insert single page into user vma
2316 * Usually this function is called from f_op->mmap() handler
2317 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
2319 * function from other places, for example from page-fault handler.
2326 if (addr < vma->vm_start || addr >= vma->vm_end) in vm_insert_page()
2327 return -EFAULT; in vm_insert_page()
2328 if (!(vma->vm_flags & VM_MIXEDMAP)) { in vm_insert_page()
2329 BUG_ON(mmap_read_trylock(vma->vm_mm)); in vm_insert_page()
2330 BUG_ON(vma->vm_flags & VM_PFNMAP); in vm_insert_page()
2333 return insert_page(vma, addr, page, vma->vm_page_prot, false); in vm_insert_page()
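/*
 * Illustrative usage sketch, not part of memory.c: a minimal ->mmap handler
 * that exposes one refcounted kernel page to userspace with vm_insert_page().
 * my_page is assumed to have been allocated elsewhere with alloc_page().
 */
#include <linux/fs.h>
#include <linux/mm.h>

static struct page *my_page;

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma->vm_end - vma->vm_start < PAGE_SIZE)
		return -EINVAL;
	return vm_insert_page(vma, vma->vm_start, my_page);
}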
2338 * __vm_map_pages - maps range of kernel pages into user vma
2354 unsigned long uaddr = vma->vm_start; in __vm_map_pages()
2359 return -ENXIO; in __vm_map_pages()
2362 if (count > num - offset) in __vm_map_pages()
2363 return -ENXIO; in __vm_map_pages()
2376 * vm_map_pages - maps range of kernel pages starts with non zero offset
2396 return __vm_map_pages(vma, pages, num, vma->vm_pgoff); in vm_map_pages()
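/*
 * Illustrative usage sketch, not part of memory.c: export a pre-allocated
 * buffer described by a page array; vm_map_pages() uses vma->vm_pgoff as an
 * index into the array, so userspace can map a sub-range via the mmap offset.
 * my_buf_pages and my_buf_npages are assumed names for this example.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static struct page **my_buf_pages;	/* filled when the buffer is allocated */
static unsigned long my_buf_npages;

static int my_drv_mmap_buf(struct file *file, struct vm_area_struct *vma)
{
	return vm_map_pages(vma, my_buf_pages, my_buf_npages);
}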
2401 * vm_map_pages_zero - map range of kernel pages starts with zero offset
2423 struct mm_struct *mm = vma->vm_mm; in insert_pfn()
2475 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
2482 * to override pgprot on a per-page basis.
2489 * pgprot typically only differs from @vma->vm_page_prot when drivers set
2490 * caching- and encryption bits different than those of @vma->vm_page_prot,
2491 * because the caching- or encryption mode may not be known at mmap() time.
2493 * This is ok as long as @vma->vm_page_prot is not used by the core vm
2496 * functions that don't touch caching- or encryption bits, using pte_modify()
2499 * Also when new page-table entries are created, this is only done using the
2500 * fault() callback, and never using the value of vma->vm_page_prot,
2501 * except for page-table entries that point to anonymous pages as the result
2516 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); in vmf_insert_pfn_prot()
2517 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == in vmf_insert_pfn_prot()
2519 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); in vmf_insert_pfn_prot()
2520 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); in vmf_insert_pfn_prot()
2522 if (addr < vma->vm_start || addr >= vma->vm_end) in vmf_insert_pfn_prot()
2536 * vmf_insert_pfn - insert single pfn into user vma
2544 * This function should only be called from a vm_ops->fault handler, and
2558 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); in vmf_insert_pfn()
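/*
 * Illustrative usage sketch, not part of memory.c: a ->fault handler for a
 * VM_PFNMAP VMA that maps device pages on demand instead of up front in
 * ->mmap. my_bar_phys is an assumed device physical base address.
 */
#include <linux/mm.h>

static phys_addr_t my_bar_phys;

static vm_fault_t my_drv_fault(struct vm_fault *vmf)
{
	unsigned long pfn = (my_bar_phys >> PAGE_SHIFT) + vmf->pgoff;

	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}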
2568 if (vma->vm_flags & VM_MIXEDMAP) in vm_mixed_ok()
2582 pgprot_t pgprot = vma->vm_page_prot; in __vm_insert_mixed()
2588 if (addr < vma->vm_start || addr >= vma->vm_end) in __vm_insert_mixed()
2618 if (err == -ENOMEM) in __vm_insert_mixed()
2620 if (err < 0 && err != -EBUSY) in __vm_insert_mixed()
2629 pgprot_t pgprot = vmf->vma->vm_page_prot; in vmf_insert_page_mkwrite()
2630 unsigned long addr = vmf->address; in vmf_insert_page_mkwrite()
2633 if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) in vmf_insert_page_mkwrite()
2636 err = insert_page(vmf->vma, addr, page, pgprot, write); in vmf_insert_page_mkwrite()
2637 if (err == -ENOMEM) in vmf_insert_page_mkwrite()
2639 if (err < 0 && err != -EBUSY) in vmf_insert_page_mkwrite()
2667 * in null mappings (currently treated as "copy-on-access")
2679 return -ENOMEM; in remap_pte_range()
2684 err = -EACCES; in remap_pte_range()
2703 pfn -= addr >> PAGE_SHIFT; in remap_pmd_range()
2706 return -ENOMEM; in remap_pmd_range()
2726 pfn -= addr >> PAGE_SHIFT; in remap_pud_range()
2729 return -ENOMEM; in remap_pud_range()
2748 pfn -= addr >> PAGE_SHIFT; in remap_p4d_range()
2751 return -ENOMEM; in remap_p4d_range()
2768 struct mm_struct *mm = vma->vm_mm; in remap_pfn_range_internal()
2772 return -EINVAL; in remap_pfn_range_internal()
2787 * There's a horrible special case to handle copy-on-write in remap_pfn_range_internal()
2789 * un-COW'ed pages by matching them up with "vma->vm_pgoff". in remap_pfn_range_internal()
2792 if (is_cow_mapping(vma->vm_flags)) { in remap_pfn_range_internal()
2793 if (addr != vma->vm_start || end != vma->vm_end) in remap_pfn_range_internal()
2794 return -EINVAL; in remap_pfn_range_internal()
2795 vma->vm_pgoff = pfn; in remap_pfn_range_internal()
2801 pfn -= addr >> PAGE_SHIFT; in remap_pfn_range_internal()
2817 * must have pre-validated the caching bits of the pgprot_t.
2837 * remap_pfn_range - remap kernel memory to userspace
2855 return -EINVAL; in remap_pfn_range()
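/*
 * Illustrative usage sketch, not part of memory.c: the classic ->mmap handler
 * for a device region, mapping the requested range up front. my_reg_phys and
 * my_reg_len are assumed names for this example.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_reg_phys;
static unsigned long my_reg_len;

static int my_drv_mmap_reg(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;

	if (off >= my_reg_len || size > my_reg_len - off)
		return -EINVAL;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return remap_pfn_range(vma, vma->vm_start,
			       (my_reg_phys + off) >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}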
2865 * vm_iomap_memory - remap memory to userspace
2874 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2875 * whatever write-combining details or similar.
2885 return -EINVAL; in vm_iomap_memory()
2887 * You *really* shouldn't map things that aren't page-aligned, in vm_iomap_memory()
2895 return -EINVAL; in vm_iomap_memory()
2898 if (vma->vm_pgoff > pages) in vm_iomap_memory()
2899 return -EINVAL; in vm_iomap_memory()
2900 pfn += vma->vm_pgoff; in vm_iomap_memory()
2901 pages -= vma->vm_pgoff; in vm_iomap_memory()
2904 vm_len = vma->vm_end - vma->vm_start; in vm_iomap_memory()
2906 return -EINVAL; in vm_iomap_memory()
2909 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); in vm_iomap_memory()
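/*
 * Illustrative usage sketch, not part of memory.c: vm_iomap_memory() performs
 * the offset and size checks of the open-coded remap_pfn_range() pattern
 * itself, so the handler reduces to one call. my_io_phys and my_io_len are
 * assumed names for this example.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_io_phys;
static unsigned long my_io_len;

static int my_drv_mmap_io(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return vm_iomap_memory(vma, my_io_phys, my_io_len);
}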
2927 return -ENOMEM; in apply_to_pte_range()
2933 return -EINVAL; in apply_to_pte_range()
2970 return -ENOMEM; in apply_to_pmd_range()
2979 return -EINVAL; in apply_to_pmd_range()
3006 return -ENOMEM; in apply_to_pud_range()
3015 return -EINVAL; in apply_to_pud_range()
3042 return -ENOMEM; in apply_to_p4d_range()
3051 return -EINVAL; in apply_to_p4d_range()
3077 return -EINVAL; in __apply_to_page_range()
3085 err = -EINVAL; in __apply_to_page_range()
3131 * read non-atomically. Before making any commitment, on those architectures
3142 spin_lock(vmf->ptl); in pte_unmap_same()
3143 same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); in pte_unmap_same()
3144 spin_unlock(vmf->ptl); in pte_unmap_same()
3147 pte_unmap(vmf->pte); in pte_unmap_same()
3148 vmf->pte = NULL; in pte_unmap_same()
3155 * -EHWPOISON: copy failed due to hwpoison in source page
3156 * -EAGAIN: copy failed (some other reason)
3164 struct vm_area_struct *vma = vmf->vma; in __wp_page_copy_user()
3165 struct mm_struct *mm = vma->vm_mm; in __wp_page_copy_user()
3166 unsigned long addr = vmf->address; in __wp_page_copy_user()
3170 return -EHWPOISON; in __wp_page_copy_user()
3176 * a "struct page" for it. We do a best-effort copy by in __wp_page_copy_user()
3178 * fails, we just zero-fill it. Live with it. in __wp_page_copy_user()
3188 vmf->pte = NULL; in __wp_page_copy_user()
3189 if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { in __wp_page_copy_user()
3192 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); in __wp_page_copy_user()
3193 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in __wp_page_copy_user()
3198 if (vmf->pte) in __wp_page_copy_user()
3199 update_mmu_tlb(vma, addr, vmf->pte); in __wp_page_copy_user()
3200 ret = -EAGAIN; in __wp_page_copy_user()
3204 entry = pte_mkyoung(vmf->orig_pte); in __wp_page_copy_user()
3205 if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) in __wp_page_copy_user()
3206 update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); in __wp_page_copy_user()
3216 if (vmf->pte) in __wp_page_copy_user()
3219 /* Re-validate under PTL if the page is still mapped */ in __wp_page_copy_user()
3220 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); in __wp_page_copy_user()
3221 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in __wp_page_copy_user()
3223 if (vmf->pte) in __wp_page_copy_user()
3224 update_mmu_tlb(vma, addr, vmf->pte); in __wp_page_copy_user()
3225 ret = -EAGAIN; in __wp_page_copy_user()
3236 * use-case in __wp_page_copy_user()
3247 if (vmf->pte) in __wp_page_copy_user()
3248 pte_unmap_unlock(vmf->pte, vmf->ptl); in __wp_page_copy_user()
3258 struct file *vm_file = vma->vm_file; in __get_fault_gfp_mask()
3261 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; in __get_fault_gfp_mask()
3279 unsigned int old_flags = vmf->flags; in do_page_mkwrite()
3281 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; in do_page_mkwrite()
3283 if (vmf->vma->vm_file && in do_page_mkwrite()
3284 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) in do_page_mkwrite()
3287 ret = vmf->vma->vm_ops->page_mkwrite(vmf); in do_page_mkwrite()
3289 vmf->flags = old_flags; in do_page_mkwrite()
3294 if (!folio->mapping) { in do_page_mkwrite()
3311 struct vm_area_struct *vma = vmf->vma; in fault_dirty_shared_page()
3313 struct folio *folio = page_folio(vmf->page); in fault_dirty_shared_page()
3315 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; in fault_dirty_shared_page()
3320 * Take a local copy of the address_space - folio.mapping may be zeroed in fault_dirty_shared_page()
3322 * pinned by vma->vm_file's reference. We rely on folio_unlock()'s in fault_dirty_shared_page()
3329 file_update_time(vma->vm_file); in fault_dirty_shared_page()
3360 * any related book-keeping.
3363 __releases(vmf->ptl) in wp_page_reuse()
3365 struct vm_area_struct *vma = vmf->vma; in wp_page_reuse()
3368 VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); in wp_page_reuse()
3369 VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte))); in wp_page_reuse()
3373 !PageAnonExclusive(vmf->page)); in wp_page_reuse()
3379 folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); in wp_page_reuse()
3382 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); in wp_page_reuse()
3383 entry = pte_mkyoung(vmf->orig_pte); in wp_page_reuse()
3385 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) in wp_page_reuse()
3386 update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); in wp_page_reuse()
3387 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_reuse()
3393 * vm_ops that have a ->map_pages have been audited and don't need
3398 struct vm_area_struct *vma = vmf->vma; in vmf_can_call_fault()
3400 if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) in vmf_can_call_fault()
3407 * __vmf_anon_prepare - Prepare to handle an anonymous fault.
3413 * only protected by the per-VMA lock, the caller must retry with the
3416 * do with only the per-VMA lock held for this VMA.
3423 struct vm_area_struct *vma = vmf->vma; in __vmf_anon_prepare()
3426 if (likely(vma->anon_vma)) in __vmf_anon_prepare()
3428 if (vmf->flags & FAULT_FLAG_VMA_LOCK) { in __vmf_anon_prepare()
3429 if (!mmap_read_trylock(vma->vm_mm)) in __vmf_anon_prepare()
3434 if (vmf->flags & FAULT_FLAG_VMA_LOCK) in __vmf_anon_prepare()
3435 mmap_read_unlock(vma->vm_mm); in __vmf_anon_prepare()
3448 * - Allocate a page, copy the content of the old page to the new one.
3449 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
3450 * - Take the PTL. If the pte changed, bail out and release the allocated page
3451 * - If the pte is still the way we remember it, update the page table and all
3452 * relevant references. This includes dropping the reference the page-table
3454 * - In any case, unlock the PTL and drop the reference we took to the old page.
3458 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; in wp_page_copy()
3459 struct vm_area_struct *vma = vmf->vma; in wp_page_copy()
3460 struct mm_struct *mm = vma->vm_mm; in wp_page_copy()
3471 if (vmf->page) in wp_page_copy()
3472 old_folio = page_folio(vmf->page); in wp_page_copy()
3477 pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); in wp_page_copy()
3478 new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); in wp_page_copy()
3485 err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); in wp_page_copy()
3489 * it's fine. If not, userspace would re-fault on in wp_page_copy()
3492 * The -EHWPOISON case will not be retried. in wp_page_copy()
3499 return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; in wp_page_copy()
3501 kmsan_copy_page_meta(&new_folio->page, vmf->page); in wp_page_copy()
3507 vmf->address & PAGE_MASK, in wp_page_copy()
3508 (vmf->address & PAGE_MASK) + PAGE_SIZE); in wp_page_copy()
3512 * Re-check the pte - we dropped the lock in wp_page_copy()
3514 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); in wp_page_copy()
3515 if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in wp_page_copy()
3522 ksm_might_unmap_zero_page(mm, vmf->orig_pte); in wp_page_copy()
3525 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); in wp_page_copy()
3526 entry = mk_pte(&new_folio->page, vma->vm_page_prot); in wp_page_copy()
3529 if (pte_soft_dirty(vmf->orig_pte)) in wp_page_copy()
3531 if (pte_uffd_wp(vmf->orig_pte)) in wp_page_copy()
3544 ptep_clear_flush(vma, vmf->address, vmf->pte); in wp_page_copy()
3545 folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); in wp_page_copy()
3548 set_pte_at(mm, vmf->address, vmf->pte, entry); in wp_page_copy()
3549 update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); in wp_page_copy()
3573 folio_remove_rmap_pte(old_folio, vmf->page, vma); in wp_page_copy()
3579 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_copy()
3580 } else if (vmf->pte) { in wp_page_copy()
3581 update_mmu_tlb(vma, vmf->address, vmf->pte); in wp_page_copy()
3582 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_copy()
3608 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3612 * @folio: the folio of vmf->page
3615 * shared mapping due to PTE being read-only once the mapped page is prepared.
3626 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); in finish_mkwrite_fault()
3627 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, in finish_mkwrite_fault()
3628 &vmf->ptl); in finish_mkwrite_fault()
3629 if (!vmf->pte) in finish_mkwrite_fault()
3635 if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { in finish_mkwrite_fault()
3636 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); in finish_mkwrite_fault()
3637 pte_unmap_unlock(vmf->pte, vmf->ptl); in finish_mkwrite_fault()
3650 struct vm_area_struct *vma = vmf->vma; in wp_pfn_shared()
3652 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { in wp_pfn_shared()
3655 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_pfn_shared()
3660 vmf->flags |= FAULT_FLAG_MKWRITE; in wp_pfn_shared()
3661 ret = vma->vm_ops->pfn_mkwrite(vmf); in wp_pfn_shared()
3671 __releases(vmf->ptl) in wp_page_shared()
3673 struct vm_area_struct *vma = vmf->vma; in wp_page_shared()
3678 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { in wp_page_shared()
3681 pte_unmap_unlock(vmf->pte, vmf->ptl); in wp_page_shared()
3763 VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && in __wp_can_reuse_large_anon_folio()
3764 folio_mm_id(folio, 1) != vma->vm_mm->mm_id); in __wp_can_reuse_large_anon_folio()
3832 * shared-page counter for the old page.
3835 * done by the caller (the low-level page fault routine in most cases).
3843 * We enter with non-exclusive mmap_lock (to exclude vma changes,
3848 __releases(vmf->ptl) in do_wp_page()
3850 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; in do_wp_page()
3851 struct vm_area_struct *vma = vmf->vma; in do_wp_page()
3856 if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { in do_wp_page()
3858 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_wp_page()
3864 * etc.) because we're only removing the uffd-wp bit, in do_wp_page()
3867 pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); in do_wp_page()
3869 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); in do_wp_page()
3874 vmf->orig_pte = pte; in do_wp_page()
3878 * Userfaultfd write-protect can defer flushes. Ensure the TLB in do_wp_page()
3881 if (unlikely(userfaultfd_wp(vmf->vma) && in do_wp_page()
3882 mm_tlb_flush_pending(vmf->vma->vm_mm))) in do_wp_page()
3883 flush_tlb_page(vmf->vma, vmf->address); in do_wp_page()
3886 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); in do_wp_page()
3888 if (vmf->page) in do_wp_page()
3889 folio = page_folio(vmf->page); in do_wp_page()
3895 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { in do_wp_page()
3898 * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. in do_wp_page()
3901 * Just mark the pages writable and/or call ops->pfn_mkwrite. in do_wp_page()
3903 if (!vmf->page || is_fsdax_page(vmf->page)) { in do_wp_page()
3904 vmf->page = NULL; in do_wp_page()
3918 (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { in do_wp_page()
3919 if (!PageAnonExclusive(vmf->page)) in do_wp_page()
3920 SetPageAnonExclusive(vmf->page); in do_wp_page()
3922 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_wp_page()
3934 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_wp_page()
3946 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); in unmap_mapping_range_vma()
3958 vba = vma->vm_pgoff; in unmap_mapping_range_tree()
3959 vea = vba + vma_pages(vma) - 1; in unmap_mapping_range_tree()
3964 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, in unmap_mapping_range_tree()
3965 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, in unmap_mapping_range_tree()
3971 * unmap_mapping_folio() - Unmap single folio from processes.
3983 struct address_space *mapping = folio->mapping; in unmap_mapping_folio()
3990 first_index = folio->index; in unmap_mapping_folio()
3991 last_index = folio_next_index(folio) - 1; in unmap_mapping_folio()
3998 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) in unmap_mapping_folio()
3999 unmap_mapping_range_tree(&mapping->i_mmap, first_index, in unmap_mapping_folio()
4005 * unmap_mapping_pages() - Unmap pages from processes.
4021 pgoff_t last_index = start + nr - 1; in unmap_mapping_pages()
4028 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) in unmap_mapping_pages()
4029 unmap_mapping_range_tree(&mapping->i_mmap, first_index, in unmap_mapping_pages()
4036 * unmap_mapping_range - unmap the portion of all mmaps in the specified
4056 pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; in unmap_mapping_range()
4061 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; in unmap_mapping_range()
4063 hlen = ULONG_MAX - hba + 1; in unmap_mapping_range()
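/*
 * Illustrative usage sketch, not part of memory.c: a filesystem punching a
 * hole unmaps the affected, page-aligned range from every process before it
 * releases the blocks. my_punch_hole is an assumed name for this example.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static void my_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	/* even_cows == 1: also zap private COW copies of the affected pages. */
	unmap_mapping_range(inode->i_mapping, offset, len, 1);
	/* ... then truncate the page cache and free the underlying blocks. */
}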
4075 struct folio *folio = page_folio(vmf->page); in remove_device_exclusive_entry()
4076 struct vm_area_struct *vma = vmf->vma; in remove_device_exclusive_entry()
4082 * the PTL so a racing thread can remove the device-exclusive in remove_device_exclusive_entry()
4085 * been re-allocated after being freed all we do is lock and in remove_device_exclusive_entry()
4097 vma->vm_mm, vmf->address & PAGE_MASK, in remove_device_exclusive_entry()
4098 (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); in remove_device_exclusive_entry()
4101 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, in remove_device_exclusive_entry()
4102 &vmf->ptl); in remove_device_exclusive_entry()
4103 if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) in remove_device_exclusive_entry()
4104 restore_exclusive_pte(vma, folio, vmf->page, vmf->address, in remove_device_exclusive_entry()
4105 vmf->pte, vmf->orig_pte); in remove_device_exclusive_entry()
4107 if (vmf->pte) in remove_device_exclusive_entry()
4108 pte_unmap_unlock(vmf->pte, vmf->ptl); in remove_device_exclusive_entry()
4122 if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || in should_try_to_free_swap()
4137 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, in pte_marker_clear()
4138 vmf->address, &vmf->ptl); in pte_marker_clear()
4139 if (!vmf->pte) in pte_marker_clear()
4142 * Be careful so that we will only recover a special uffd-wp pte into a in pte_marker_clear()
4149 if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) in pte_marker_clear()
4150 pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); in pte_marker_clear()
4151 pte_unmap_unlock(vmf->pte, vmf->ptl); in pte_marker_clear()
4157 if (vma_is_anonymous(vmf->vma)) in do_pte_missing()
4164 * This is actually a page-missing access, but with uffd-wp special pte
4165 * installed. It means this pte was wr-protected before being unmapped.
4171 * got unregistered - we can simply clear them. in pte_marker_handle_uffd_wp()
4173 if (unlikely(!userfaultfd_wp(vmf->vma))) in pte_marker_handle_uffd_wp()
4181 swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); in handle_pte_marker()
4191 /* Higher priority than uffd-wp when data corrupted */ in handle_pte_marker()
4208 struct vm_area_struct *vma = vmf->vma; in __alloc_swap_folio()
4212 folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); in __alloc_swap_folio()
4216 entry = pte_to_swp_entry(vmf->orig_pte); in __alloc_swap_folio()
4217 if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, in __alloc_swap_folio()
4240 if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) in non_swapcache_batch()
4258 addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); in can_swapin_thp()
4259 idx = (vmf->address - addr) / PAGE_SIZE; in can_swapin_thp()
4262 if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) in can_swapin_thp()
4270 * from different backends. And they are likely corner cases. Similar in can_swapin_thp()
4306 struct vm_area_struct *vma = vmf->vma; in alloc_swap_folio()
4317 * If uffd is active for the vma we need per-page fault fidelity to in alloc_swap_folio()
4325 * lack handling for such cases, so fall back to swapping in order-0 in alloc_swap_folio()
4331 entry = pte_to_swp_entry(vmf->orig_pte); in alloc_swap_folio()
4336 orders = thp_vma_allowable_orders(vma, vma->vm_flags, in alloc_swap_folio()
4337 TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); in alloc_swap_folio()
4338 orders = thp_vma_suitable_orders(vma, vmf->address, orders); in alloc_swap_folio()
4340 vmf->address, orders); in alloc_swap_folio()
4345 pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, in alloc_swap_folio()
4346 vmf->address & PMD_MASK, &ptl); in alloc_swap_folio()
4356 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_swap_folio()
4367 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_swap_folio()
4370 if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, in alloc_swap_folio()
4393 * We enter with non-exclusive mmap_lock (to exclude vma changes,
4402 struct vm_area_struct *vma = vmf->vma; in do_swap_page()
4422 entry = pte_to_swp_entry(vmf->orig_pte); in do_swap_page()
4425 migration_entry_wait(vma->vm_mm, vmf->pmd, in do_swap_page()
4426 vmf->address); in do_swap_page()
4428 vmf->page = pfn_swap_entry_to_page(entry); in do_swap_page()
4431 if (vmf->flags & FAULT_FLAG_VMA_LOCK) { in do_swap_page()
4441 vmf->page = pfn_swap_entry_to_page(entry); in do_swap_page()
4442 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_swap_page()
4443 vmf->address, &vmf->ptl); in do_swap_page()
4444 if (unlikely(!vmf->pte || in do_swap_page()
4445 !pte_same(ptep_get(vmf->pte), in do_swap_page()
4446 vmf->orig_pte))) in do_swap_page()
4453 if (trylock_page(vmf->page)) { in do_swap_page()
4456 get_page(vmf->page); in do_swap_page()
4457 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
4458 pgmap = page_pgmap(vmf->page); in do_swap_page()
4459 ret = pgmap->ops->migrate_to_ram(vmf); in do_swap_page()
4460 unlock_page(vmf->page); in do_swap_page()
4461 put_page(vmf->page); in do_swap_page()
4463 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
4470 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); in do_swap_page()
4481 folio = swap_cache_get_folio(entry, vma, vmf->address); in do_swap_page()
4487 if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && in do_swap_page()
4527 folio->swap = entry; in do_swap_page()
4529 folio->private = NULL; in do_swap_page()
4542 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_swap_page()
4543 vmf->address, &vmf->ptl); in do_swap_page()
4544 if (likely(vmf->pte && in do_swap_page()
4545 pte_same(ptep_get(vmf->pte), vmf->orig_pte))) in do_swap_page()
4553 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); in do_swap_page()
4582 * page->index of !PageKSM() pages would be nonlinear inside the in do_swap_page()
4583 * anon VMA -- PageKSM() is lost on actual swapout. in do_swap_page()
4585 folio = ksm_might_need_to_copy(folio, vma, vmf->address); in do_swap_page()
4590 } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { in do_swap_page()
4604 if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && in do_swap_page()
4614 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, in do_swap_page()
4615 &vmf->ptl); in do_swap_page()
4616 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) in do_swap_page()
4627 unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); in do_swap_page()
4628 unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; in do_swap_page()
4629 pte_t *folio_ptep = vmf->pte - idx; in do_swap_page()
4632 if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || in do_swap_page()
4644 address = vmf->address; in do_swap_page()
4645 ptep = vmf->pte; in do_swap_page()
4649 unsigned long folio_start = address - idx * PAGE_SIZE; in do_swap_page()
4654 if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) in do_swap_page()
4656 if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) in do_swap_page()
4659 folio_ptep = vmf->pte - idx; in do_swap_page()
4661 if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || in do_swap_page()
4669 entry = folio->swap; in do_swap_page()
4670 page = &folio->page; in do_swap_page()
4690 exclusive = pte_swp_exclusive(vmf->orig_pte); in do_swap_page()
4694 * swapcache -> certainly exclusive. in do_swap_page()
4698 data_race(si->flags & SWP_STABLE_WRITES)) { in do_swap_page()
4700 * This is tricky: not all swap backends support in do_swap_page()
4708 * For these problematic swap backends, simply drop the in do_swap_page()
4734 if (should_try_to_free_swap(folio, vma, vmf->flags)) in do_swap_page()
4737 add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); in do_swap_page()
4738 add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); in do_swap_page()
4739 pte = mk_pte(page, vma->vm_page_prot); in do_swap_page()
4740 if (pte_swp_soft_dirty(vmf->orig_pte)) in do_swap_page()
4742 if (pte_swp_uffd_wp(vmf->orig_pte)) in do_swap_page()
4753 if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && in do_swap_page()
4756 if (vmf->flags & FAULT_FLAG_WRITE) { in do_swap_page()
4758 vmf->flags &= ~FAULT_FLAG_WRITE; in do_swap_page()
4763 folio_ref_add(folio, nr_pages - 1); in do_swap_page()
4765 vmf->orig_pte = pte_advance_pfn(pte, page_idx); in do_swap_page()
4788 set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); in do_swap_page()
4789 arch_do_swap_page_nr(vma->vm_mm, vma, address, in do_swap_page()
4806 if (vmf->flags & FAULT_FLAG_WRITE) { in do_swap_page()
4813 /* No need to invalidate - it was non-present before */ in do_swap_page()
4816 if (vmf->pte) in do_swap_page()
4817 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
4829 if (vmf->pte) in do_swap_page()
4830 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_swap_page()
4863 struct vm_area_struct *vma = vmf->vma; in alloc_anon_folio()
4873 * If uffd is active for the vma we need per-page fault fidelity to in alloc_anon_folio()
4884 orders = thp_vma_allowable_orders(vma, vma->vm_flags, in alloc_anon_folio()
4885 TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); in alloc_anon_folio()
4886 orders = thp_vma_suitable_orders(vma, vmf->address, orders); in alloc_anon_folio()
4891 pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); in alloc_anon_folio()
4893 return ERR_PTR(-EAGAIN); in alloc_anon_folio()
4902 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_anon_folio()
4916 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); in alloc_anon_folio()
4919 if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { in alloc_anon_folio()
4933 folio_zero_user(folio, vmf->address); in alloc_anon_folio()
4943 return folio_prealloc(vma->vm_mm, vma, vmf->address, true); in alloc_anon_folio()
4947 * We enter with non-exclusive mmap_lock (to exclude vma changes,
4953 struct vm_area_struct *vma = vmf->vma; in do_anonymous_page()
4954 unsigned long addr = vmf->address; in do_anonymous_page()
4960 /* File mapping without ->vm_ops ? */ in do_anonymous_page()
4961 if (vma->vm_flags & VM_SHARED) in do_anonymous_page()
4968 if (pte_alloc(vma->vm_mm, vmf->pmd)) in do_anonymous_page()
4971 /* Use the zero-page for reads */ in do_anonymous_page()
4972 if (!(vmf->flags & FAULT_FLAG_WRITE) && in do_anonymous_page()
4973 !mm_forbids_zeropage(vma->vm_mm)) { in do_anonymous_page()
4974 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), in do_anonymous_page()
4975 vma->vm_page_prot)); in do_anonymous_page()
4976 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_anonymous_page()
4977 vmf->address, &vmf->ptl); in do_anonymous_page()
4978 if (!vmf->pte) in do_anonymous_page()
4981 update_mmu_tlb(vma, vmf->address, vmf->pte); in do_anonymous_page()
4984 ret = check_stable_address_space(vma->vm_mm); in do_anonymous_page()
4989 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_anonymous_page()
4999 /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ in do_anonymous_page()
5007 addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); in do_anonymous_page()
5016 entry = mk_pte(&folio->page, vma->vm_page_prot); in do_anonymous_page()
5018 if (vma->vm_flags & VM_WRITE) in do_anonymous_page()
5021 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); in do_anonymous_page()
5022 if (!vmf->pte) in do_anonymous_page()
5025 update_mmu_tlb(vma, addr, vmf->pte); in do_anonymous_page()
5027 } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { in do_anonymous_page()
5028 update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); in do_anonymous_page()
5032 ret = check_stable_address_space(vma->vm_mm); in do_anonymous_page()
5038 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_anonymous_page()
5043 folio_ref_add(folio, nr_pages - 1); in do_anonymous_page()
5044 add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); in do_anonymous_page()
5051 set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); in do_anonymous_page()
5053 /* No need to invalidate - it was non-present before */ in do_anonymous_page()
5054 update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); in do_anonymous_page()
5056 if (vmf->pte) in do_anonymous_page()
5057 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_anonymous_page()
5068 * released depending on flags and vma->vm_ops->fault() return value.
5073 struct vm_area_struct *vma = vmf->vma; in __do_fault()
5092 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { in __do_fault()
5093 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); in __do_fault()
5094 if (!vmf->prealloc_pte) in __do_fault()
5098 ret = vma->vm_ops->fault(vmf); in __do_fault()
5103 folio = page_folio(vmf->page); in __do_fault()
5104 if (unlikely(PageHWPoison(vmf->page))) { in __do_fault()
5107 if (page_mapped(vmf->page)) in __do_fault()
5110 if (mapping_evict_folio(folio->mapping, folio)) in __do_fault()
5115 vmf->page = NULL; in __do_fault()
5122 VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); in __do_fault()
5130 struct vm_area_struct *vma = vmf->vma; in deposit_prealloc_pte()
5132 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); in deposit_prealloc_pte()
5137 mm_inc_nr_ptes(vma->vm_mm); in deposit_prealloc_pte()
5138 vmf->prealloc_pte = NULL; in deposit_prealloc_pte()
5144 struct vm_area_struct *vma = vmf->vma; in do_set_pmd()
5145 bool write = vmf->flags & FAULT_FLAG_WRITE; in do_set_pmd()
5146 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; in do_set_pmd()
5153 * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any in do_set_pmd()
5156 if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) in do_set_pmd()
5164 page = &folio->page; in do_set_pmd()
5179 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { in do_set_pmd()
5180 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); in do_set_pmd()
5181 if (!vmf->prealloc_pte) in do_set_pmd()
5185 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); in do_set_pmd()
5186 if (unlikely(!pmd_none(*vmf->pmd))) in do_set_pmd()
5191 entry = mk_huge_pmd(page, vma->vm_page_prot); in do_set_pmd()
5195 add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); in do_set_pmd()
5204 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); in do_set_pmd()
5206 update_mmu_cache_pmd(vma, haddr, vmf->pmd); in do_set_pmd()
5212 spin_unlock(vmf->ptl); in do_set_pmd()
5223 * set_pte_range - Set a range of PTEs to point to pages in a folio.
5233 struct vm_area_struct *vma = vmf->vma; in set_pte_range()
5234 bool write = vmf->flags & FAULT_FLAG_WRITE; in set_pte_range()
5235 bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); in set_pte_range()
5239 entry = mk_pte(page, vma->vm_page_prot); in set_pte_range()
5250 /* copy-on-write page */ in set_pte_range()
5251 if (write && !(vma->vm_flags & VM_SHARED)) { in set_pte_range()
5258 set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); in set_pte_range()
5260 /* no need to invalidate: a not-present page won't be cached */ in set_pte_range()
5261 update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); in set_pte_range()
5266 if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) in vmf_pte_changed()
5267 return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); in vmf_pte_changed()
5269 return !pte_none(ptep_get(vmf->pte)); in vmf_pte_changed()
5273 * finish_fault - finish page fault once we have prepared the page to fault
5289 struct vm_area_struct *vma = vmf->vma; in finish_fault()
5293 bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && in finish_fault()
5294 !(vma->vm_flags & VM_SHARED); in finish_fault()
5300 addr = vmf->address; in finish_fault()
5304 page = vmf->cow_page; in finish_fault()
5306 page = vmf->page; in finish_fault()
5312 if (!(vma->vm_flags & VM_SHARED)) { in finish_fault()
5313 ret = check_stable_address_space(vma->vm_mm); in finish_fault()
5318 if (pmd_none(*vmf->pmd)) { in finish_fault()
5325 if (vmf->prealloc_pte) in finish_fault()
5326 pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); in finish_fault()
5327 else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) in finish_fault()
5335 * Using per-page fault to maintain the uffd semantics, and same in finish_fault()
5336 * approach also applies to non-anonymous-shmem faults to avoid in finish_fault()
5344 /* The page offset of vmf->address within the VMA. */ in finish_fault()
5345 pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; in finish_fault()
5347 pgoff_t pte_off = pte_index(vmf->address); in finish_fault()
5350 * Fallback to per-page fault in case the folio size in page in finish_fault()
5354 vma_off + (nr_pages - idx) > vma_pages(vma) || in finish_fault()
5356 pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { in finish_fault()
5360 addr = vmf->address - idx * PAGE_SIZE; in finish_fault()
5361 page = &folio->page; in finish_fault()
5365 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in finish_fault()
5366 addr, &vmf->ptl); in finish_fault()
5367 if (!vmf->pte) in finish_fault()
5370 /* Re-check under ptl */ in finish_fault()
5372 update_mmu_tlb(vma, addr, vmf->pte); in finish_fault()
5375 } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { in finish_fault()
5377 pte_unmap_unlock(vmf->pte, vmf->ptl); in finish_fault()
5381 folio_ref_add(folio, nr_pages - 1); in finish_fault()
5384 add_mm_counter(vma->vm_mm, type, nr_pages); in finish_fault()
5388 pte_unmap_unlock(vmf->pte, vmf->ptl); in finish_fault()
5409 return -EINVAL; in fault_around_bytes_set()
5412 * The minimum value is 1 page, however this results in no fault-around in fault_around_bytes_set()
5437 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
5438 * not ready to be mapped: not up-to-date, locked, etc.
5455 pgoff_t pte_off = pte_index(vmf->address); in do_fault_around()
5456 /* The page offset of vmf->address within the VMA. */ in do_fault_around()
5457 pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; in do_fault_around()
5463 pte_off - min(pte_off, vma_off)); in do_fault_around()
5467 pte_off + vma_pages(vmf->vma) - vma_off) - 1; in do_fault_around()
5469 if (pmd_none(*vmf->pmd)) { in do_fault_around()
5470 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); in do_fault_around()
5471 if (!vmf->prealloc_pte) in do_fault_around()
5476 ret = vmf->vma->vm_ops->map_pages(vmf, in do_fault_around()
5477 vmf->pgoff + from_pte - pte_off, in do_fault_around()
5478 vmf->pgoff + to_pte - pte_off); in do_fault_around()
5484 /* Return true if we should do read fault-around, false otherwise */
5487 /* No ->map_pages? No way to fault around... */ in should_fault_around()
5488 if (!vmf->vma->vm_ops->map_pages) in should_fault_around()
5491 if (uffd_disable_fault_around(vmf->vma)) in should_fault_around()
5504 * Let's call ->map_pages() first and use ->fault() as fallback in do_read_fault()
5523 folio = page_folio(vmf->page); in do_read_fault()
5532 struct vm_area_struct *vma = vmf->vma; in do_cow_fault()
5542 folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); in do_cow_fault()
5546 vmf->cow_page = &folio->page; in do_cow_fault()
5554 if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { in do_cow_fault()
5562 unlock_page(vmf->page); in do_cow_fault()
5563 put_page(vmf->page); in do_cow_fault()
5574 struct vm_area_struct *vma = vmf->vma; in do_shared_fault()
5586 folio = page_folio(vmf->page); in do_shared_fault()
5592 if (vma->vm_ops->page_mkwrite) { in do_shared_fault()
5615 * We enter with non-exclusive mmap_lock (to exclude vma changes,
5624 struct vm_area_struct *vma = vmf->vma; in do_fault()
5625 struct mm_struct *vm_mm = vma->vm_mm; in do_fault()
5631 if (!vma->vm_ops->fault) { in do_fault()
5632 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, in do_fault()
5633 vmf->address, &vmf->ptl); in do_fault()
5634 if (unlikely(!vmf->pte)) in do_fault()
5644 if (unlikely(pte_none(ptep_get(vmf->pte)))) in do_fault()
5649 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_fault()
5651 } else if (!(vmf->flags & FAULT_FLAG_WRITE)) in do_fault()
5653 else if (!(vma->vm_flags & VM_SHARED)) in do_fault()
5659 if (vmf->prealloc_pte) { in do_fault()
5660 pte_free(vm_mm, vmf->prealloc_pte); in do_fault()
5661 vmf->prealloc_pte = NULL; in do_fault()
5670 struct vm_area_struct *vma = vmf->vma; in numa_migrate_check()
5687 if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) in numa_migrate_check()
5694 *last_cpupid = (-1 & LAST_CPUPID_MASK); in numa_migrate_check()
5720 pte = pte_modify(old_pte, vma->vm_page_prot); in numa_rebuild_single_mapping()
5732 int nr = pte_pfn(fault_pte) - folio_pfn(folio); in numa_rebuild_large_mapping()
5733 unsigned long start, end, addr = vmf->address; in numa_rebuild_large_mapping()
5734 unsigned long addr_start = addr - (nr << PAGE_SHIFT); in numa_rebuild_large_mapping()
5739 start = max3(addr_start, pt_start, vma->vm_start); in numa_rebuild_large_mapping()
5741 vma->vm_end); in numa_rebuild_large_mapping()
5742 start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); in numa_rebuild_large_mapping()
5756 ptent = pte_modify(ptent, vma->vm_page_prot); in numa_rebuild_large_mapping()
5769 struct vm_area_struct *vma = vmf->vma; in do_numa_page()
5783 spin_lock(vmf->ptl); in do_numa_page()
5785 old_pte = ptep_get(vmf->pte); in do_numa_page()
5787 if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { in do_numa_page()
5788 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
5792 pte = pte_modify(old_pte, vma->vm_page_prot); in do_numa_page()
5800 can_change_pte_writable(vma, vmf->address, pte)) in do_numa_page()
5803 folio = vm_normal_folio(vma, vmf->address, pte); in do_numa_page()
5810 target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, in do_numa_page()
5819 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
5832 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, in do_numa_page()
5833 vmf->address, &vmf->ptl); in do_numa_page()
5834 if (unlikely(!vmf->pte)) in do_numa_page()
5836 if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { in do_numa_page()
5837 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
5843 * non-accessible ptes, some can allow access by kernel mode. in do_numa_page()
5849 numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, in do_numa_page()
5851 pte_unmap_unlock(vmf->pte, vmf->ptl); in do_numa_page()
5860 struct vm_area_struct *vma = vmf->vma; in create_huge_pmd()
5863 if (vma->vm_ops->huge_fault) in create_huge_pmd()
5864 return vma->vm_ops->huge_fault(vmf, PMD_ORDER); in create_huge_pmd()
5871 struct vm_area_struct *vma = vmf->vma; in wp_huge_pmd()
5872 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; in wp_huge_pmd()
5877 userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { in wp_huge_pmd()
5878 if (userfaultfd_wp_async(vmf->vma)) in wp_huge_pmd()
5885 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { in wp_huge_pmd()
5886 if (vma->vm_ops->huge_fault) { in wp_huge_pmd()
5887 ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); in wp_huge_pmd()
5894 /* COW or write-notify handled on pte level: split pmd. */ in wp_huge_pmd()
5895 __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); in wp_huge_pmd()
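/*
 * Editor's illustrative sketch: create_huge_pmd() and wp_huge_pmd() above
 * both call ->huge_fault(vmf, order) and fall back to PTE-granular
 * handling when it returns VM_FAULT_FALLBACK. A handler that only knows
 * how to install PMD-sized mappings can simply decline everything else:
 */
static vm_fault_t my_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	if (order != PMD_ORDER)
		return VM_FAULT_FALLBACK;
	/*
	 * A real implementation would install a PMD-sized mapping here
	 * (a DAX-style driver typically uses vmf_insert_pfn_pmd()).
	 * Returning VM_FAULT_FALLBACK instead makes the core split the
	 * fault into base-page faults, as wp_huge_pmd() does via
	 * __split_huge_pmd().
	 */
	return VM_FAULT_FALLBACK;
}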
5904 struct vm_area_struct *vma = vmf->vma; in create_huge_pud()
5908 if (vma->vm_ops->huge_fault) in create_huge_pud()
5909 return vma->vm_ops->huge_fault(vmf, PUD_ORDER); in create_huge_pud()
5918 struct vm_area_struct *vma = vmf->vma; in wp_huge_pud()
5924 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { in wp_huge_pud()
5925 if (vma->vm_ops->huge_fault) { in wp_huge_pud()
5926 ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); in wp_huge_pud()
5932 	/* COW or write-notify not handled on PUD level: split pud. */ in wp_huge_pud()
5933 __split_huge_pud(vma, vmf->pud, vmf->address); in wp_huge_pud()
5947 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
5957 if (unlikely(pmd_none(*vmf->pmd))) { in handle_pte_fault()
5959 * Leave __pte_alloc() until later: because vm_ops->fault may in handle_pte_fault()
5964 vmf->pte = NULL; in handle_pte_fault()
5965 vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; in handle_pte_fault()
5975 * Use the maywrite version to indicate that vmf->pte may be in handle_pte_fault()
5982 vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, in handle_pte_fault()
5983 vmf->address, &dummy_pmdval, in handle_pte_fault()
5984 &vmf->ptl); in handle_pte_fault()
5985 if (unlikely(!vmf->pte)) in handle_pte_fault()
5987 vmf->orig_pte = ptep_get_lockless(vmf->pte); in handle_pte_fault()
5988 vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; in handle_pte_fault()
5990 if (pte_none(vmf->orig_pte)) { in handle_pte_fault()
5991 pte_unmap(vmf->pte); in handle_pte_fault()
5992 vmf->pte = NULL; in handle_pte_fault()
5996 if (!vmf->pte) in handle_pte_fault()
5999 if (!pte_present(vmf->orig_pte)) in handle_pte_fault()
6002 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) in handle_pte_fault()
6005 spin_lock(vmf->ptl); in handle_pte_fault()
6006 entry = vmf->orig_pte; in handle_pte_fault()
6007 if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { in handle_pte_fault()
6008 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); in handle_pte_fault()
6011 if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { in handle_pte_fault()
6014 else if (likely(vmf->flags & FAULT_FLAG_WRITE)) in handle_pte_fault()
6018 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, in handle_pte_fault()
6019 vmf->flags & FAULT_FLAG_WRITE)) { in handle_pte_fault()
6020 update_mmu_cache_range(vmf, vmf->vma, vmf->address, in handle_pte_fault()
6021 vmf->pte, 1); in handle_pte_fault()
6024 if (vmf->flags & FAULT_FLAG_TRIED) in handle_pte_fault()
6032 if (vmf->flags & FAULT_FLAG_WRITE) in handle_pte_fault()
6033 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, in handle_pte_fault()
6034 vmf->pte); in handle_pte_fault()
6037 pte_unmap_unlock(vmf->pte, vmf->ptl); in handle_pte_fault()
6058 struct mm_struct *mm = vma->vm_mm; in __handle_mm_fault()
6059 unsigned long vm_flags = vma->vm_flags; in __handle_mm_fault()
6144 * mm_account_fault - Do page fault accounting
6147 * of perf event counters, but we'll still do the per-task accounting to
6156 * still be in per-arch page fault handlers at the entry of page fault.
6193 current->maj_flt++; in mm_account_fault()
6195 current->min_flt++; in mm_account_fault()
6215 current->in_lru_fault = vma_has_recency(vma); in lru_gen_enter_fault()
6220 current->in_lru_fault = false; in lru_gen_exit_fault()
6240 * just treat it like an ordinary read-fault otherwise. in sanitize_fault_flags()
6242 if (!is_cow_mapping(vma->vm_flags)) in sanitize_fault_flags()
6245 /* Write faults on read-only mappings are impossible ... */ in sanitize_fault_flags()
6246 if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) in sanitize_fault_flags()
6249 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && in sanitize_fault_flags()
6250 !is_cow_mapping(vma->vm_flags))) in sanitize_fault_flags()
6255 * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of in sanitize_fault_flags()
6278 struct mm_struct *mm = vma->vm_mm; in handle_mm_fault()
6295 is_droppable = !!(vma->vm_flags & VM_DROPPABLE); in handle_mm_fault()
6298 * Enable the memcg OOM handling for faults triggered in user in handle_mm_fault()
6307 ret = hugetlb_fault(vma->vm_mm, vma, address, flags); in handle_mm_fault()
6312 * Warning: It is no longer safe to dereference vma-> after this point, in handle_mm_fault()
6365 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but in mmap_upgrade_trylock()
6410 if (likely(vma && (vma->vm_start <= addr))) in lock_mm_and_find_vma()
6417 if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { in lock_mm_and_find_vma()
6428 * re-take it, and also look up the vma again, in lock_mm_and_find_vma()
6429 * re-checking it. in lock_mm_and_find_vma()
6438 if (vma->vm_start <= addr) in lock_mm_and_find_vma()
6440 if (!(vma->vm_flags & VM_GROWSDOWN)) in lock_mm_and_find_vma()
6470 if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) in __vma_enter_locked()
6473 rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); in __vma_enter_locked()
6474 rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, in __vma_enter_locked()
6475 refcount_read(&vma->vm_refcnt) == tgt_refcnt, in __vma_enter_locked()
6477 lock_acquired(&vma->vmlock_dep_map, _RET_IP_); in __vma_enter_locked()
6484 *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); in __vma_exit_locked()
6485 rwsem_release(&vma->vmlock_dep_map, _RET_IP_); in __vma_exit_locked()
6505 WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); in __vma_start_write()
6524 * write-locked and readers can increment vm_refcnt only temporarily in vma_mark_detached()
6529 if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { in vma_mark_detached()
6548 MA_STATE(mas, &mm->mm_mt, address, address); in lock_vma_under_rcu()
6560 if (PTR_ERR(vma) == -EAGAIN) { in lock_vma_under_rcu()
6577 if (unlikely(vma->vm_mm != mm || in lock_vma_under_rcu()
6578 address < vma->vm_start || address >= vma->vm_end)) in lock_vma_under_rcu()
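/*
 * Editor's illustrative sketch: how an architecture fault handler is
 * expected to combine lock_vma_under_rcu() with the mmap_lock slow path
 * above (modelled on the pattern used in arch fault code). Signal
 * handling, VM_FAULT_ERROR handling and the FAULT_FLAG_TRIED retry are
 * trimmed; "my_handle_user_fault" is hypothetical.
 */
static void my_handle_user_fault(struct pt_regs *regs, unsigned long address,
				 bool is_write)
{
	unsigned int flags = FAULT_FLAG_DEFAULT | FAULT_FLAG_USER;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	if (is_write)
		flags |= FAULT_FLAG_WRITE;

	/* Fast path: per-VMA lock, no mmap_lock taken. */
	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		fault = handle_mm_fault(vma, address,
					flags | FAULT_FLAG_VMA_LOCK, regs);
		if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
			vma_end_read(vma);
		if (!(fault & VM_FAULT_RETRY))
			return;
		/* Otherwise fall back to the mmap_lock path below. */
	}

	/* Slow path: mmap_lock held across the fault. */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (!vma)
		return;		/* a real handler raises SIGSEGV here */

	fault = handle_mm_fault(vma, address, flags, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
}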
6596 * We've already handled the fast-path in-line.
6602 return -ENOMEM; in __p4d_alloc()
6604 spin_lock(&mm->page_table_lock); in __p4d_alloc()
6611 spin_unlock(&mm->page_table_lock); in __p4d_alloc()
6619 * We've already handled the fast-path in-line.
6625 return -ENOMEM; in __pud_alloc()
6627 spin_lock(&mm->page_table_lock); in __pud_alloc()
6634 spin_unlock(&mm->page_table_lock); in __pud_alloc()
6642 * We've already handled the fast-path in-line.
6649 return -ENOMEM; in __pmd_alloc()
6670 args->lock = lock; in pfnmap_args_setup()
6671 args->ptep = ptep; in pfnmap_args_setup()
6672 args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); in pfnmap_args_setup()
6673 args->addr_mask = addr_mask; in pfnmap_args_setup()
6674 args->pgprot = pgprot; in pfnmap_args_setup()
6675 args->writable = writable; in pfnmap_args_setup()
6676 args->special = special; in pfnmap_args_setup()
6682 struct file *file = vma->vm_file; in pfnmap_lockdep_assert()
6683 struct address_space *mapping = file ? file->f_mapping : NULL; in pfnmap_lockdep_assert()
6686 lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || in pfnmap_lockdep_assert()
6687 lockdep_is_held(&vma->vm_mm->mmap_lock)); in pfnmap_lockdep_assert()
6689 lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); in pfnmap_lockdep_assert()
6694 * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
6697 * The caller needs to setup args->vma and args->address to point to the
6714 * a later point in time can trigger use-after-free.
6726 struct vm_area_struct *vma = args->vma; in follow_pfnmap_start()
6727 unsigned long address = args->address; in follow_pfnmap_start()
6728 struct mm_struct *mm = vma->vm_mm; in follow_pfnmap_start()
6738 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) in follow_pfnmap_start()
6741 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) in follow_pfnmap_start()
6796 return -EINVAL; in follow_pfnmap_start()
6809 if (args->lock) in follow_pfnmap_end()
6810 spin_unlock(args->lock); in follow_pfnmap_end()
6811 if (args->ptep) in follow_pfnmap_end()
6812 pte_unmap(args->ptep); in follow_pfnmap_end()
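/*
 * Editor's illustrative sketch: the calling pattern the two helpers above
 * expect (generic_access_phys() below is the in-tree user). The caller
 * must hold a lock that keeps the VMA stable, must not sleep between
 * start and end, and must not use the PFN after follow_pfnmap_end().
 * "my_peek_pfn" is hypothetical.
 */
static int my_peek_pfn(struct vm_area_struct *vma, unsigned long addr,
		       unsigned long *pfn)
{
	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
	int ret;

	ret = follow_pfnmap_start(&args);
	if (ret)
		return ret;
	*pfn = args.pfn;	/* only meaningful until ..._end() below */
	follow_pfnmap_end(&args);
	return 0;
}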
6818 * generic_access_phys - generic implementation for iomem mmap access
6836 int ret = -EINVAL; in generic_access_phys()
6842 return -EINVAL; in generic_access_phys()
6849 return -EINVAL; in generic_access_phys()
6853 return -ENOMEM; in generic_access_phys()
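/*
 * Editor's illustrative sketch: generic_access_phys() above is meant to be
 * plugged into the vm_ops of a VM_IO/VM_PFNMAP mapping so that ptrace and
 * /proc/<pid>/mem can read the MMIO range. MY_DEV_PFN is a hypothetical
 * placeholder for the device's physical page frame.
 */
static const struct vm_operations_struct my_mmio_vm_ops = {
	.access = generic_access_phys,
};

static int my_mmio_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_ops = &my_mmio_vm_ops;
	return io_remap_pfn_range(vma, vma->vm_start, MY_DEV_PFN,
				  vma->vm_end - vma->vm_start,
				  vma->vm_page_prot);
}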
6915 return buf - old_buf; in __access_remote_vm()
6927 if (vma->vm_ops && vma->vm_ops->access) in __access_remote_vm()
6928 bytes = vma->vm_ops->access(vma, addr, buf, in __access_remote_vm()
6935 offset = addr & (PAGE_SIZE-1); in __access_remote_vm()
6936 if (bytes > PAGE_SIZE-offset) in __access_remote_vm()
6937 bytes = PAGE_SIZE-offset; in __access_remote_vm()
6950 len -= bytes; in __access_remote_vm()
6956 return buf - old_buf; in __access_remote_vm()
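/*
 * Editor's illustrative sketch (userspace, not kernel code): reads of
 * /proc/<pid>/mem are serviced by access_remote_vm()/__access_remote_vm()
 * above. Error handling is trimmed and the caller needs ptrace access to
 * the target; "peek_remote" is hypothetical.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static ssize_t peek_remote(pid_t pid, uint64_t addr, void *buf, size_t len)
{
	char path[64];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	n = pread(fd, buf, len, (off_t)addr);
	close(fd);
	return n;
}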
6960 * access_remote_vm - access another process' address space
7003 * If there is any error return -EFAULT.
7014 return -EFAULT; in __copy_remote_vm_str()
7020 err = -EFAULT; in __copy_remote_vm_str()
7038 err = -EFAULT; in __copy_remote_vm_str()
7043 offset = addr & (PAGE_SIZE - 1); in __copy_remote_vm_str()
7044 if (bytes > PAGE_SIZE - offset) in __copy_remote_vm_str()
7045 bytes = PAGE_SIZE - offset; in __copy_remote_vm_str()
7056 buf += bytes - 1; in __copy_remote_vm_str()
7063 addr += bytes - 1; in __copy_remote_vm_str()
7064 copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); in __copy_remote_vm_str()
7068 len -= bytes; in __copy_remote_vm_str()
7077 return buf - old_buf; in __copy_remote_vm_str()
7081 * copy_remote_vm_str - copy a string from another process's address space.
7091 * not including the trailing NUL. Always guaranteed to leave NUL-terminated
7092 * buffer. On any error, return -EFAULT.
7106 return -EFAULT; in copy_remote_vm_str()
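/*
 * Editor's illustrative sketch (userspace): the same page-at-a-time
 * clamping that __copy_remote_vm_str() does above, using process_vm_readv()
 * to pull a NUL-terminated string out of another process. Stops at the
 * first NUL or after len-1 bytes; error handling is trimmed and a 4 KiB
 * target page size is assumed.
 */
#define _GNU_SOURCE
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#define REMOTE_PAGE 4096UL	/* assumption: target uses 4 KiB pages */

static size_t read_remote_str(pid_t pid, unsigned long addr, char *buf, size_t len)
{
	size_t copied = 0;

	while (copied + 1 < len) {
		size_t off = (addr + copied) & (REMOTE_PAGE - 1);
		size_t chunk = REMOTE_PAGE - off;
		struct iovec lio, rio;
		ssize_t n;
		char *nul;

		if (chunk > len - 1 - copied)
			chunk = len - 1 - copied;
		lio.iov_base = buf + copied;
		rio.iov_base = (void *)(addr + copied);
		lio.iov_len = rio.iov_len = chunk;

		n = process_vm_readv(pid, &lio, 1, &rio, 1, 0);
		if (n <= 0)
			break;

		nul = memchr(buf + copied, '\0', n);
		if (nul) {
			copied = nul - buf;	/* stop at the NUL */
			break;
		}
		copied += n;
	}
	buf[copied] = '\0';
	return copied;
}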
7123 struct mm_struct *mm = current->mm; in print_vma_addr()
7133 if (vma && vma->vm_file) { in print_vma_addr()
7134 struct file *f = vma->vm_file; in print_vma_addr()
7135 ip -= vma->vm_start; in print_vma_addr()
7136 ip += vma->vm_pgoff << PAGE_SHIFT; in print_vma_addr()
7138 vma->vm_start, in print_vma_addr()
7139 vma->vm_end - vma->vm_start); in print_vma_addr()
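/*
 * Editor's illustrative sketch (userspace): the translation print_vma_addr()
 * performs above, redone from /proc/self/maps. The printed file offset is
 * ip - vm_start + (vm_pgoff << PAGE_SHIFT), and the maps file already
 * exposes vm_pgoff << PAGE_SHIFT as its third column.
 */
#include <inttypes.h>
#include <stdio.h>

static void print_addr_in_file(uintptr_t ip)
{
	FILE *f = fopen("/proc/self/maps", "r");
	uintptr_t start, end;
	uint64_t file_off;
	char path[256], line[512];

	if (!f)
		return;
	while (fgets(line, sizeof(line), f)) {
		path[0] = '\0';
		if (sscanf(line, "%" SCNxPTR "-%" SCNxPTR " %*4s %" SCNx64 " %*s %*s %255s",
			   &start, &end, &file_off, path) < 3)
			continue;
		if (ip >= start && ip < end) {
			printf("%#" PRIxPTR " is %s+%#" PRIx64 "\n", ip,
			       path[0] ? path : "[anon]",
			       (uint64_t)(ip - start) + file_off);
			break;
		}
	}
	fclose(f);
}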
7150 if (current->mm) in __might_fault()
7151 might_lock_read(&current->mm->mmap_lock); in __might_fault()
7169 ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1); in process_huge_page()
7173 n = (addr_hint - addr) / PAGE_SIZE; in process_huge_page()
7179 for (i = nr_pages - 1; i >= 2 * n; i--) { in process_huge_page()
7187 base = nr_pages - 2 * (nr_pages - n); in process_huge_page()
7188 l = nr_pages - n; in process_huge_page()
7198 * Process remaining subpages in left-right-left-right pattern in process_huge_page()
7203 int right_idx = base + 2 * l - 1 - i; in process_huge_page()
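/*
 * Editor's illustrative sketch (userspace): the visiting order used by
 * process_huge_page() above, reproduced for a toy folio so the
 * "left-right-left-right towards the target" pattern is visible. The
 * subpage that actually faulted (index n) is processed last to keep its
 * cache lines hot.
 */
#include <stdio.h>

static void show_order(int nr_pages, int n)	/* n = target subpage index */
{
	int base, l, i;

	if (2 * n <= nr_pages) {
		base = 0;
		l = n;
		for (i = nr_pages - 1; i >= 2 * n; i--)
			printf("%d ", i);	/* tail subpages first */
	} else {
		base = nr_pages - 2 * (nr_pages - n);
		l = nr_pages - n;
		for (i = 0; i < base; i++)
			printf("%d ", i);	/* head subpages first */
	}
	/* zig-zag inwards so the target's neighbours stay cache-hot */
	for (i = 0; i < l; i++)
		printf("%d %d ", base + i, base + 2 * l - 1 - i);
	printf("\n");
}

int main(void)
{
	show_order(16, 3);	/* e.g. a 16-subpage folio, fault at index 3 */
	return 0;
}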
7239 * folio_zero_user - Zero a folio which will be mapped to userspace.
7270 return -EHWPOISON; in copy_user_gigantic_page()
7284 struct page *dst = folio_page(copy_arg->dst, idx); in copy_subpage()
7285 struct page *src = folio_page(copy_arg->src, idx); in copy_subpage()
7287 if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) in copy_subpage()
7288 return -EHWPOISON; in copy_subpage()
7328 ret_val -= (PAGE_SIZE - rc); in copy_folio_from_user()
7346 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, in ptlock_cache_init()
7357 ptdesc->ptl = ptl; in ptlock_alloc()
7363 if (ptdesc->ptl) in ptlock_free()
7364 kmem_cache_free(page_ptl_cachep, ptdesc->ptl); in ptlock_free()