1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4 #include <linux/mm.h>
5 #include <linux/sched.h>
6 #include <linux/sched/mm.h>
7 #include <linux/mmu_notifier.h>
8 #include <linux/rmap.h>
9 #include <linux/swap.h>
10 #include <linux/mm_inline.h>
11 #include <linux/kthread.h>
12 #include <linux/khugepaged.h>
13 #include <linux/freezer.h>
14 #include <linux/mman.h>
15 #include <linux/hashtable.h>
16 #include <linux/userfaultfd_k.h>
17 #include <linux/page_idle.h>
18 #include <linux/page_table_check.h>
19 #include <linux/rcupdate_wait.h>
20 #include <linux/swapops.h>
21 #include <linux/shmem_fs.h>
22 #include <linux/dax.h>
23 #include <linux/ksm.h>
24
25 #include <asm/tlb.h>
26 #include <asm/pgalloc.h>
27 #include "internal.h"
28 #include "mm_slot.h"
29
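/*
 * Possible outcomes of scanning or collapsing one PMD-sized range; the
 * result is reported via the huge_memory trace events and returned by
 * the collapse paths.
 */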
30 enum scan_result {
31 SCAN_FAIL,
32 SCAN_SUCCEED,
33 SCAN_PMD_NULL,
34 SCAN_PMD_NONE,
35 SCAN_PMD_MAPPED,
36 SCAN_EXCEED_NONE_PTE,
37 SCAN_EXCEED_SWAP_PTE,
38 SCAN_EXCEED_SHARED_PTE,
39 SCAN_PTE_NON_PRESENT,
40 SCAN_PTE_UFFD_WP,
41 SCAN_PTE_MAPPED_HUGEPAGE,
42 SCAN_PAGE_RO,
43 SCAN_LACK_REFERENCED_PAGE,
44 SCAN_PAGE_NULL,
45 SCAN_SCAN_ABORT,
46 SCAN_PAGE_COUNT,
47 SCAN_PAGE_LRU,
48 SCAN_PAGE_LOCK,
49 SCAN_PAGE_ANON,
50 SCAN_PAGE_COMPOUND,
51 SCAN_ANY_PROCESS,
52 SCAN_VMA_NULL,
53 SCAN_VMA_CHECK,
54 SCAN_ADDRESS_RANGE,
55 SCAN_DEL_PAGE_LRU,
56 SCAN_ALLOC_HUGE_PAGE_FAIL,
57 SCAN_CGROUP_CHARGE_FAIL,
58 SCAN_TRUNCATED,
59 SCAN_PAGE_HAS_PRIVATE,
60 SCAN_STORE_FAILED,
61 SCAN_COPY_MC,
62 SCAN_PAGE_FILLED,
63 };
64
65 #define CREATE_TRACE_POINTS
66 #include <trace/events/huge_memory.h>
67
68 static struct task_struct *khugepaged_thread __read_mostly;
69 static DEFINE_MUTEX(khugepaged_mutex);
70
71 /* default scan 8*512 ptes (or vmas) every 30 seconds */
72 static unsigned int khugepaged_pages_to_scan __read_mostly;
73 static unsigned int khugepaged_pages_collapsed;
74 static unsigned int khugepaged_full_scans;
75 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
76 /* during fragmentation poll the hugepage allocator once every minute */
77 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
78 static unsigned long khugepaged_sleep_expire;
79 static DEFINE_SPINLOCK(khugepaged_mm_lock);
80 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
81 /*
82 * By default, collapse into a hugepage if at least one pte is mapped,
83 * just as would have happened if the vma had been large enough at
84 * page-fault time.
85 *
86 * Note that these limits are only respected if collapse was initiated by khugepaged.
87 */
88 unsigned int khugepaged_max_ptes_none __read_mostly;
89 static unsigned int khugepaged_max_ptes_swap __read_mostly;
90 static unsigned int khugepaged_max_ptes_shared __read_mostly;
91
92 #define MM_SLOTS_HASH_BITS 10
93 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
94
95 static struct kmem_cache *mm_slot_cache __ro_after_init;
96
97 struct collapse_control {
98 bool is_khugepaged;
99
100 /* Num pages scanned per node */
101 u32 node_load[MAX_NUMNODES];
102
103 /* nodemask for allocation fallback */
104 nodemask_t alloc_nmask;
105 };
106
107 /**
108 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
109 * @slot: hash lookup from mm to mm_slot
110 */
111 struct khugepaged_mm_slot {
112 struct mm_slot slot;
113 };
114
115 /**
116 * struct khugepaged_scan - cursor for scanning
117 * @mm_head: the head of the mm list to scan
118 * @mm_slot: the current mm_slot we are scanning
119 * @address: the next address inside that to be scanned
120 *
121 * There is only the one khugepaged_scan instance of this cursor structure.
122 */
123 struct khugepaged_scan {
124 struct list_head mm_head;
125 struct khugepaged_mm_slot *mm_slot;
126 unsigned long address;
127 };
128
129 static struct khugepaged_scan khugepaged_scan = {
130 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
131 };
132
133 #ifdef CONFIG_SYSFS
134 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
135 struct kobj_attribute *attr,
136 char *buf)
137 {
138 return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
139 }
140
141 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
142 struct kobj_attribute *attr,
143 const char *buf, size_t count)
144 {
145 unsigned int msecs;
146 int err;
147
148 err = kstrtouint(buf, 10, &msecs);
149 if (err)
150 return -EINVAL;
151
152 khugepaged_scan_sleep_millisecs = msecs;
153 khugepaged_sleep_expire = 0;
154 wake_up_interruptible(&khugepaged_wait);
155
156 return count;
157 }
158 static struct kobj_attribute scan_sleep_millisecs_attr =
159 __ATTR_RW(scan_sleep_millisecs);
160
161 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
162 struct kobj_attribute *attr,
163 char *buf)
164 {
165 return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
166 }
167
168 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 const char *buf, size_t count)
171 {
172 unsigned int msecs;
173 int err;
174
175 err = kstrtouint(buf, 10, &msecs);
176 if (err)
177 return -EINVAL;
178
179 khugepaged_alloc_sleep_millisecs = msecs;
180 khugepaged_sleep_expire = 0;
181 wake_up_interruptible(&khugepaged_wait);
182
183 return count;
184 }
185 static struct kobj_attribute alloc_sleep_millisecs_attr =
186 __ATTR_RW(alloc_sleep_millisecs);
187
188 static ssize_t pages_to_scan_show(struct kobject *kobj,
189 struct kobj_attribute *attr,
190 char *buf)
191 {
192 return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
193 }
194 static ssize_t pages_to_scan_store(struct kobject *kobj,
195 struct kobj_attribute *attr,
196 const char *buf, size_t count)
197 {
198 unsigned int pages;
199 int err;
200
201 err = kstrtouint(buf, 10, &pages);
202 if (err || !pages)
203 return -EINVAL;
204
205 khugepaged_pages_to_scan = pages;
206
207 return count;
208 }
209 static struct kobj_attribute pages_to_scan_attr =
210 __ATTR_RW(pages_to_scan);
211
212 static ssize_t pages_collapsed_show(struct kobject *kobj,
213 struct kobj_attribute *attr,
214 char *buf)
215 {
216 return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
217 }
218 static struct kobj_attribute pages_collapsed_attr =
219 __ATTR_RO(pages_collapsed);
220
221 static ssize_t full_scans_show(struct kobject *kobj,
222 struct kobj_attribute *attr,
223 char *buf)
224 {
225 return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
226 }
227 static struct kobj_attribute full_scans_attr =
228 __ATTR_RO(full_scans);
229
230 static ssize_t defrag_show(struct kobject *kobj,
231 struct kobj_attribute *attr, char *buf)
232 {
233 return single_hugepage_flag_show(kobj, attr, buf,
234 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
235 }
236 static ssize_t defrag_store(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 const char *buf, size_t count)
239 {
240 return single_hugepage_flag_store(kobj, attr, buf, count,
241 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
242 }
243 static struct kobj_attribute khugepaged_defrag_attr =
244 __ATTR_RW(defrag);
245
246 /*
247 * max_ptes_none controls if khugepaged should collapse hugepages over
248 * any unmapped ptes in turn potentially increasing the memory
249 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
250 * reduce the available free memory in the system as it
251 * runs. Increasing max_ptes_none will instead potentially reduce the
252 * free memory in the system during the khugepaged scan.
253 */
254 static ssize_t max_ptes_none_show(struct kobject *kobj,
255 struct kobj_attribute *attr,
256 char *buf)
257 {
258 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
259 }
260 static ssize_t max_ptes_none_store(struct kobject *kobj,
261 struct kobj_attribute *attr,
262 const char *buf, size_t count)
263 {
264 int err;
265 unsigned long max_ptes_none;
266
267 err = kstrtoul(buf, 10, &max_ptes_none);
268 if (err || max_ptes_none > HPAGE_PMD_NR - 1)
269 return -EINVAL;
270
271 khugepaged_max_ptes_none = max_ptes_none;
272
273 return count;
274 }
275 static struct kobj_attribute khugepaged_max_ptes_none_attr =
276 __ATTR_RW(max_ptes_none);
277
278 static ssize_t max_ptes_swap_show(struct kobject *kobj,
279 struct kobj_attribute *attr,
280 char *buf)
281 {
282 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
283 }
284
285 static ssize_t max_ptes_swap_store(struct kobject *kobj,
286 struct kobj_attribute *attr,
287 const char *buf, size_t count)
288 {
289 int err;
290 unsigned long max_ptes_swap;
291
292 err = kstrtoul(buf, 10, &max_ptes_swap);
293 if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
294 return -EINVAL;
295
296 khugepaged_max_ptes_swap = max_ptes_swap;
297
298 return count;
299 }
300
301 static struct kobj_attribute khugepaged_max_ptes_swap_attr =
302 __ATTR_RW(max_ptes_swap);
303
304 static ssize_t max_ptes_shared_show(struct kobject *kobj,
305 struct kobj_attribute *attr,
306 char *buf)
307 {
308 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
309 }
310
311 static ssize_t max_ptes_shared_store(struct kobject *kobj,
312 struct kobj_attribute *attr,
313 const char *buf, size_t count)
314 {
315 int err;
316 unsigned long max_ptes_shared;
317
318 err = kstrtoul(buf, 10, &max_ptes_shared);
319 if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
320 return -EINVAL;
321
322 khugepaged_max_ptes_shared = max_ptes_shared;
323
324 return count;
325 }
326
327 static struct kobj_attribute khugepaged_max_ptes_shared_attr =
328 __ATTR_RW(max_ptes_shared);
329
330 static struct attribute *khugepaged_attr[] = {
331 &khugepaged_defrag_attr.attr,
332 &khugepaged_max_ptes_none_attr.attr,
333 &khugepaged_max_ptes_swap_attr.attr,
334 &khugepaged_max_ptes_shared_attr.attr,
335 &pages_to_scan_attr.attr,
336 &pages_collapsed_attr.attr,
337 &full_scans_attr.attr,
338 &scan_sleep_millisecs_attr.attr,
339 &alloc_sleep_millisecs_attr.attr,
340 NULL,
341 };
342
343 struct attribute_group khugepaged_attr_group = {
344 .attrs = khugepaged_attr,
345 .name = "khugepaged",
346 };
347 #endif /* CONFIG_SYSFS */
348
349 int hugepage_madvise(struct vm_area_struct *vma,
350 vm_flags_t *vm_flags, int advice)
351 {
352 switch (advice) {
353 case MADV_HUGEPAGE:
354 #ifdef CONFIG_S390
355 /*
356 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
357 * can't handle this properly after s390_enable_sie, so we simply
358 * ignore the madvise to prevent qemu from causing a SIGSEGV.
359 */
360 if (mm_has_pgste(vma->vm_mm))
361 return 0;
362 #endif
363 *vm_flags &= ~VM_NOHUGEPAGE;
364 *vm_flags |= VM_HUGEPAGE;
365 /*
366 * If the vma becomes eligible for khugepaged to scan,
367 * register it here without waiting for a page fault that
368 * may not happen any time soon.
369 */
370 khugepaged_enter_vma(vma, *vm_flags);
371 break;
372 case MADV_NOHUGEPAGE:
373 *vm_flags &= ~VM_HUGEPAGE;
374 *vm_flags |= VM_NOHUGEPAGE;
375 /*
376 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
377 * this vma even if the mm stays registered in khugepaged (it
378 * may have been registered before VM_NOHUGEPAGE was set).
379 */
380 break;
381 }
382
383 return 0;
384 }
385
386 int __init khugepaged_init(void)
387 {
388 mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
389 if (!mm_slot_cache)
390 return -ENOMEM;
391
392 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
393 khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
394 khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
395 khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
396
397 return 0;
398 }
399
400 void __init khugepaged_destroy(void)
401 {
402 kmem_cache_destroy(mm_slot_cache);
403 }
404
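/*
 * True once all users of the mm have gone away (mm_users == 0), i.e. the
 * process is exiting; khugepaged must stop working on this mm.
 */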
405 static inline int hpage_collapse_test_exit(struct mm_struct *mm)
406 {
407 return atomic_read(&mm->mm_users) == 0;
408 }
409
410 static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
411 {
412 return hpage_collapse_test_exit(mm) ||
413 test_bit(MMF_DISABLE_THP, &mm->flags);
414 }
415
416 static bool hugepage_pmd_enabled(void)
417 {
418 /*
419 * We cover the anon, shmem and the file-backed case here; file-backed
420 * hugepages, when configured in, are determined by the global control.
421 * Anon pmd-sized hugepages are determined by the pmd-size control.
422 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
423 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
424 */
425 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
426 hugepage_global_enabled())
427 return true;
428 if (test_bit(PMD_ORDER, &huge_anon_orders_always))
429 return true;
430 if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
431 return true;
432 if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
433 hugepage_global_enabled())
434 return true;
435 if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
436 return true;
437 return false;
438 }
439
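/*
 * Register @mm for scanning: pin it with mmgrab(), hash an mm_slot for it,
 * add the slot to the tail of the scan list, and wake khugepaged if the
 * list was previously empty.
 */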
440 void __khugepaged_enter(struct mm_struct *mm)
441 {
442 struct khugepaged_mm_slot *mm_slot;
443 struct mm_slot *slot;
444 int wakeup;
445
446 /* __khugepaged_exit() must not run from under us */
447 VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
448 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
449 return;
450
451 mm_slot = mm_slot_alloc(mm_slot_cache);
452 if (!mm_slot)
453 return;
454
455 slot = &mm_slot->slot;
456
457 spin_lock(&khugepaged_mm_lock);
458 mm_slot_insert(mm_slots_hash, mm, slot);
459 /*
460 * Insert just behind the scanning cursor, to let the area settle
461 * down a little.
462 */
463 wakeup = list_empty(&khugepaged_scan.mm_head);
464 list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
465 spin_unlock(&khugepaged_mm_lock);
466
467 mmgrab(mm);
468 if (wakeup)
469 wake_up_interruptible(&khugepaged_wait);
470 }
471
472 void khugepaged_enter_vma(struct vm_area_struct *vma,
473 vm_flags_t vm_flags)
474 {
475 if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
476 hugepage_pmd_enabled()) {
477 if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
478 PMD_ORDER))
479 __khugepaged_enter(vma->vm_mm);
480 }
481 }
482
483 void __khugepaged_exit(struct mm_struct *mm)
484 {
485 struct khugepaged_mm_slot *mm_slot;
486 struct mm_slot *slot;
487 int free = 0;
488
489 spin_lock(&khugepaged_mm_lock);
490 slot = mm_slot_lookup(mm_slots_hash, mm);
491 mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
492 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
493 hash_del(&slot->hash);
494 list_del(&slot->mm_node);
495 free = 1;
496 }
497 spin_unlock(&khugepaged_mm_lock);
498
499 if (free) {
500 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
501 mm_slot_free(mm_slot_cache, mm_slot);
502 mmdrop(mm);
503 } else if (mm_slot) {
504 /*
505 * This is required to serialize against
506 * hpage_collapse_test_exit() (which is guaranteed to run
507 * under mmap_lock read mode). Wait here until khugepaged has
508 * finished working on the pagetables under the mmap_lock;
509 * once we return, all pagetables will be destroyed.
510 */
511 mmap_write_lock(mm);
512 mmap_write_unlock(mm);
513 }
514 }
515
516 static void release_pte_folio(struct folio *folio)
517 {
518 node_stat_mod_folio(folio,
519 NR_ISOLATED_ANON + folio_is_file_lru(folio),
520 -folio_nr_pages(folio));
521 folio_unlock(folio);
522 folio_putback_lru(folio);
523 }
524
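/*
 * Undo a partial __collapse_huge_page_isolate(): unlock and put back every
 * small folio mapped by the PTEs in [pte, _pte), plus any large folios
 * queued on @compound_pagelist.
 */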
525 static void release_pte_pages(pte_t *pte, pte_t *_pte,
526 struct list_head *compound_pagelist)
527 {
528 struct folio *folio, *tmp;
529
530 while (--_pte >= pte) {
531 pte_t pteval = ptep_get(_pte);
532 unsigned long pfn;
533
534 if (pte_none(pteval))
535 continue;
536 pfn = pte_pfn(pteval);
537 if (is_zero_pfn(pfn))
538 continue;
539 folio = pfn_folio(pfn);
540 if (folio_test_large(folio))
541 continue;
542 release_pte_folio(folio);
543 }
544
545 list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
546 list_del(&folio->lru);
547 release_pte_folio(folio);
548 }
549 }
550
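/*
 * Walk the HPAGE_PMD_NR PTEs at @address, enforcing the none/zero and
 * shared limits, and lock + LRU-isolate every mapped folio so it cannot
 * be freed or split while its contents are copied. On failure, folios
 * that were already isolated are released again.
 */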
551 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
552 unsigned long address,
553 pte_t *pte,
554 struct collapse_control *cc,
555 struct list_head *compound_pagelist)
556 {
557 struct page *page = NULL;
558 struct folio *folio = NULL;
559 pte_t *_pte;
560 int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
561 bool writable = false;
562
563 for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
564 _pte++, address += PAGE_SIZE) {
565 pte_t pteval = ptep_get(_pte);
566 if (pte_none(pteval) || (pte_present(pteval) &&
567 is_zero_pfn(pte_pfn(pteval)))) {
568 ++none_or_zero;
569 if (!userfaultfd_armed(vma) &&
570 (!cc->is_khugepaged ||
571 none_or_zero <= khugepaged_max_ptes_none)) {
572 continue;
573 } else {
574 result = SCAN_EXCEED_NONE_PTE;
575 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
576 goto out;
577 }
578 }
579 if (!pte_present(pteval)) {
580 result = SCAN_PTE_NON_PRESENT;
581 goto out;
582 }
583 if (pte_uffd_wp(pteval)) {
584 result = SCAN_PTE_UFFD_WP;
585 goto out;
586 }
587 page = vm_normal_page(vma, address, pteval);
588 if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
589 result = SCAN_PAGE_NULL;
590 goto out;
591 }
592
593 folio = page_folio(page);
594 VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
595
596 /* See hpage_collapse_scan_pmd(). */
597 if (folio_maybe_mapped_shared(folio)) {
598 ++shared;
599 if (cc->is_khugepaged &&
600 shared > khugepaged_max_ptes_shared) {
601 result = SCAN_EXCEED_SHARED_PTE;
602 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
603 goto out;
604 }
605 }
606
607 if (folio_test_large(folio)) {
608 struct folio *f;
609
610 /*
611 * Check if we have dealt with the compound page
612 * already
613 */
614 list_for_each_entry(f, compound_pagelist, lru) {
615 if (folio == f)
616 goto next;
617 }
618 }
619
620 /*
621 * We can do it before folio_isolate_lru because the
622 * folio can't be freed from under us. NOTE: PG_lock
623 * is needed to serialize against split_huge_page
624 * when invoked from the VM.
625 */
626 if (!folio_trylock(folio)) {
627 result = SCAN_PAGE_LOCK;
628 goto out;
629 }
630
631 /*
632 * Check if the page has any GUP (or other external) pins.
633 *
634 * The page table that maps the page has been already unlinked
635 * from the page table tree and this process cannot get
636 * an additional pin on the page.
637 *
638 * New pins can come later if the page is shared across fork,
639 * but not from this process. The other process cannot write to
640 * the page, only trigger CoW.
641 */
642 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
643 folio_unlock(folio);
644 result = SCAN_PAGE_COUNT;
645 goto out;
646 }
647
648 /*
649 * Isolate the folio to avoid collapsing a hugepage
650 * currently in use by the VM.
651 */
652 if (!folio_isolate_lru(folio)) {
653 folio_unlock(folio);
654 result = SCAN_DEL_PAGE_LRU;
655 goto out;
656 }
657 node_stat_mod_folio(folio,
658 NR_ISOLATED_ANON + folio_is_file_lru(folio),
659 folio_nr_pages(folio));
660 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
661 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
662
663 if (folio_test_large(folio))
664 list_add_tail(&folio->lru, compound_pagelist);
665 next:
666 /*
667 * If collapse was initiated by khugepaged, check that there are
668 * enough young ptes to justify collapsing the page.
669 */
670 if (cc->is_khugepaged &&
671 (pte_young(pteval) || folio_test_young(folio) ||
672 folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
673 address)))
674 referenced++;
675
676 if (pte_write(pteval))
677 writable = true;
678 }
679
680 if (unlikely(!writable)) {
681 result = SCAN_PAGE_RO;
682 } else if (unlikely(cc->is_khugepaged && !referenced)) {
683 result = SCAN_LACK_REFERENCED_PAGE;
684 } else {
685 result = SCAN_SUCCEED;
686 trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
687 referenced, writable, result);
688 return result;
689 }
690 out:
691 release_pte_pages(pte, _pte, compound_pagelist);
692 trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
693 referenced, writable, result);
694 return result;
695 }
696
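/*
 * The copy into the hugepage succeeded: clear the old PTEs, drop the rmap
 * and the references on the source folios, and put the isolated folios
 * back on the LRU.
 */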
697 static void __collapse_huge_page_copy_succeeded(pte_t *pte,
698 struct vm_area_struct *vma,
699 unsigned long address,
700 spinlock_t *ptl,
701 struct list_head *compound_pagelist)
702 {
703 unsigned long end = address + HPAGE_PMD_SIZE;
704 struct folio *src, *tmp;
705 pte_t pteval;
706 pte_t *_pte;
707 unsigned int nr_ptes;
708
709 for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
710 address += nr_ptes * PAGE_SIZE) {
711 nr_ptes = 1;
712 pteval = ptep_get(_pte);
713 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
714 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
715 if (is_zero_pfn(pte_pfn(pteval))) {
716 /*
717 * ptl mostly unnecessary.
718 */
719 spin_lock(ptl);
720 ptep_clear(vma->vm_mm, address, _pte);
721 spin_unlock(ptl);
722 ksm_might_unmap_zero_page(vma->vm_mm, pteval);
723 }
724 } else {
725 struct page *src_page = pte_page(pteval);
726
727 src = page_folio(src_page);
728
729 if (folio_test_large(src)) {
730 unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
731
732 nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
733 } else {
734 release_pte_folio(src);
735 }
736
737 /*
738 * ptl mostly unnecessary, but preemption has to
739 * be disabled to update the per-cpu stats
740 * inside folio_remove_rmap_ptes().
741 */
742 spin_lock(ptl);
743 clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
744 folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
745 spin_unlock(ptl);
746 free_swap_cache(src);
747 folio_put_refs(src, nr_ptes);
748 }
749 }
750
751 list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
752 list_del(&src->lru);
753 node_stat_sub_folio(src, NR_ISOLATED_ANON +
754 folio_is_file_lru(src));
755 folio_unlock(src);
756 free_swap_cache(src);
757 folio_putback_lru(src);
758 }
759 }
760
761 static void __collapse_huge_page_copy_failed(pte_t *pte,
762 pmd_t *pmd,
763 pmd_t orig_pmd,
764 struct vm_area_struct *vma,
765 struct list_head *compound_pagelist)
766 {
767 spinlock_t *pmd_ptl;
768
769 /*
770 * Re-establish the PMD to point to the original page table
771 * entry. Restoring PMD needs to be done prior to releasing
772 * pages. Since pages are still isolated and locked here,
773 * acquiring anon_vma_lock_write is unnecessary.
774 */
775 pmd_ptl = pmd_lock(vma->vm_mm, pmd);
776 pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
777 spin_unlock(pmd_ptl);
778 /*
779 * Release both raw and compound pages isolated
780 * in __collapse_huge_page_isolate.
781 */
782 release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
783 }
784
785 /*
786 * __collapse_huge_page_copy - attempts to copy memory contents from raw
787 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
788 * otherwise restores the original page table and releases isolated raw pages.
789 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
790 *
791 * @pte: starting of the PTEs to copy from
792 * @folio: the new hugepage to copy contents to
793 * @pmd: pointer to the new hugepage's PMD
794 * @orig_pmd: the original raw pages' PMD
795 * @vma: the original raw pages' virtual memory area
796 * @address: starting address to copy
797 * @ptl: lock on raw pages' PTEs
798 * @compound_pagelist: list that stores compound pages
799 */
800 static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
801 pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
802 unsigned long address, spinlock_t *ptl,
803 struct list_head *compound_pagelist)
804 {
805 unsigned int i;
806 int result = SCAN_SUCCEED;
807
808 /*
809 * Copying the pages' contents may hit a memory-poisoned source page at any iteration.
810 */
811 for (i = 0; i < HPAGE_PMD_NR; i++) {
812 pte_t pteval = ptep_get(pte + i);
813 struct page *page = folio_page(folio, i);
814 unsigned long src_addr = address + i * PAGE_SIZE;
815 struct page *src_page;
816
817 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
818 clear_user_highpage(page, src_addr);
819 continue;
820 }
821 src_page = pte_page(pteval);
822 if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) {
823 result = SCAN_COPY_MC;
824 break;
825 }
826 }
827
828 if (likely(result == SCAN_SUCCEED))
829 __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
830 compound_pagelist);
831 else
832 __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
833 compound_pagelist);
834
835 return result;
836 }
837
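/* Freezable sleep for alloc_sleep_millisecs after a failed hugepage allocation. */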
838 static void khugepaged_alloc_sleep(void)
839 {
840 DEFINE_WAIT(wait);
841
842 add_wait_queue(&khugepaged_wait, &wait);
843 __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
844 schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
845 remove_wait_queue(&khugepaged_wait, &wait);
846 }
847
848 struct collapse_control khugepaged_collapse_control = {
849 .is_khugepaged = true,
850 };
851
852 static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
853 {
854 int i;
855
856 /*
857 * If node_reclaim_mode is disabled, then no extra effort is made to
858 * allocate memory locally.
859 */
860 if (!node_reclaim_enabled())
861 return false;
862
863 /* If there is a count for this node already, it must be acceptable */
864 if (cc->node_load[nid])
865 return false;
866
867 for (i = 0; i < MAX_NUMNODES; i++) {
868 if (!cc->node_load[i])
869 continue;
870 if (node_distance(nid, i) > node_reclaim_distance)
871 return true;
872 }
873 return false;
874 }
875
876 #define khugepaged_defrag() \
877 (transparent_hugepage_flags & \
878 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
879
880 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
881 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
882 {
883 return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
884 }
885
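/*
 * Pick the node that saw the most scanned base pages as the allocation
 * target, and build the fallback nodemask from all equally loaded nodes.
 */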
886 #ifdef CONFIG_NUMA
887 static int hpage_collapse_find_target_node(struct collapse_control *cc)
888 {
889 int nid, target_node = 0, max_value = 0;
890
891 /* find first node with max normal pages hit */
892 for (nid = 0; nid < MAX_NUMNODES; nid++)
893 if (cc->node_load[nid] > max_value) {
894 max_value = cc->node_load[nid];
895 target_node = nid;
896 }
897
898 for_each_online_node(nid) {
899 if (max_value == cc->node_load[nid])
900 node_set(nid, cc->alloc_nmask);
901 }
902
903 return target_node;
904 }
905 #else
906 static int hpage_collapse_find_target_node(struct collapse_control *cc)
907 {
908 return 0;
909 }
910 #endif
911
912 /*
913 * If the mmap_lock was temporarily dropped, revalidate the vma
914 * after re-taking the mmap_lock.
915 * Returns an enum scan_result value.
916 */
917
918 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
919 bool expect_anon,
920 struct vm_area_struct **vmap,
921 struct collapse_control *cc)
922 {
923 struct vm_area_struct *vma;
924 unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
925
926 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
927 return SCAN_ANY_PROCESS;
928
929 *vmap = vma = find_vma(mm, address);
930 if (!vma)
931 return SCAN_VMA_NULL;
932
933 if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
934 return SCAN_ADDRESS_RANGE;
935 if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
936 return SCAN_VMA_CHECK;
937 /*
938 * An anon VMA is expected, but the address may have been unmapped and
939 * then remapped to a file after khugepaged re-acquired the mmap_lock.
940 *
941 * thp_vma_allowable_order may return true for qualified file
942 * vmas.
943 */
944 if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
945 return SCAN_PAGE_ANON;
946 return SCAN_SUCCEED;
947 }
948
949 static inline int check_pmd_state(pmd_t *pmd)
950 {
951 pmd_t pmde = pmdp_get_lockless(pmd);
952
953 if (pmd_none(pmde))
954 return SCAN_PMD_NONE;
955
956 /*
957 * The folio may be under migration when khugepaged is trying to
958 * collapse it. Migration success or failure will eventually end
959 * up with a present PMD mapping a folio again.
960 */
961 if (is_pmd_migration_entry(pmde))
962 return SCAN_PMD_MAPPED;
963 if (!pmd_present(pmde))
964 return SCAN_PMD_NULL;
965 if (pmd_trans_huge(pmde))
966 return SCAN_PMD_MAPPED;
967 if (pmd_bad(pmde))
968 return SCAN_PMD_NULL;
969 return SCAN_SUCCEED;
970 }
971
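/*
 * Look up the PMD covering @address and classify it: missing, already
 * mapping a huge page, or a valid page table suitable for collapse.
 */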
972 static int find_pmd_or_thp_or_none(struct mm_struct *mm,
973 unsigned long address,
974 pmd_t **pmd)
975 {
976 *pmd = mm_find_pmd(mm, address);
977 if (!*pmd)
978 return SCAN_PMD_NULL;
979
980 return check_pmd_state(*pmd);
981 }
982
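/*
 * After the mmap_lock has been re-taken, verify that @address still maps
 * the same, still-valid PMD that was found earlier.
 */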
983 static int check_pmd_still_valid(struct mm_struct *mm,
984 unsigned long address,
985 pmd_t *pmd)
986 {
987 pmd_t *new_pmd;
988 int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
989
990 if (result != SCAN_SUCCEED)
991 return result;
992 if (new_pmd != pmd)
993 return SCAN_FAIL;
994 return SCAN_SUCCEED;
995 }
996
997 /*
998 * Bring missing pages in from swap, to complete THP collapse.
999 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
1000 *
1001 * Called and returns without pte mapped or spinlocks held.
1002 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
1003 */
1004 static int __collapse_huge_page_swapin(struct mm_struct *mm,
1005 struct vm_area_struct *vma,
1006 unsigned long haddr, pmd_t *pmd,
1007 int referenced)
1008 {
1009 int swapped_in = 0;
1010 vm_fault_t ret = 0;
1011 unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
1012 int result;
1013 pte_t *pte = NULL;
1014 spinlock_t *ptl;
1015
1016 for (address = haddr; address < end; address += PAGE_SIZE) {
1017 struct vm_fault vmf = {
1018 .vma = vma,
1019 .address = address,
1020 .pgoff = linear_page_index(vma, address),
1021 .flags = FAULT_FLAG_ALLOW_RETRY,
1022 .pmd = pmd,
1023 };
1024
1025 if (!pte++) {
1026 /*
1027 * Here the ptl is only used to check pte_same() in
1028 * do_swap_page(), so readonly version is enough.
1029 */
1030 pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
1031 if (!pte) {
1032 mmap_read_unlock(mm);
1033 result = SCAN_PMD_NULL;
1034 goto out;
1035 }
1036 }
1037
1038 vmf.orig_pte = ptep_get_lockless(pte);
1039 if (!is_swap_pte(vmf.orig_pte))
1040 continue;
1041
1042 vmf.pte = pte;
1043 vmf.ptl = ptl;
1044 ret = do_swap_page(&vmf);
1045 /* Which unmaps pte (after perhaps re-checking the entry) */
1046 pte = NULL;
1047
1048 /*
1049 * do_swap_page returns VM_FAULT_RETRY with the mmap_lock released.
1050 * Note that we treat VM_FAULT_RETRY like VM_FAULT_ERROR here because
1051 * we do not retry: the swap entry will remain in the pagetable,
1052 * resulting in a later failure.
1053 */
1054 if (ret & VM_FAULT_RETRY) {
1055 /* Likely, but not guaranteed, that page lock failed */
1056 result = SCAN_PAGE_LOCK;
1057 goto out;
1058 }
1059 if (ret & VM_FAULT_ERROR) {
1060 mmap_read_unlock(mm);
1061 result = SCAN_FAIL;
1062 goto out;
1063 }
1064 swapped_in++;
1065 }
1066
1067 if (pte)
1068 pte_unmap(pte);
1069
1070 /* Drain LRU cache to remove extra pin on the swapped in pages */
1071 if (swapped_in)
1072 lru_add_drain();
1073
1074 result = SCAN_SUCCEED;
1075 out:
1076 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
1077 return result;
1078 }
1079
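/*
 * Allocate a PMD-order folio on the preferred node (with the fallback
 * nodemask built during the scan) and charge it to @mm's memcg.
 */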
1080 static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
1081 struct collapse_control *cc)
1082 {
1083 gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
1084 GFP_TRANSHUGE);
1085 int node = hpage_collapse_find_target_node(cc);
1086 struct folio *folio;
1087
1088 folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
1089 if (!folio) {
1090 *foliop = NULL;
1091 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1092 return SCAN_ALLOC_HUGE_PAGE_FAIL;
1093 }
1094
1095 count_vm_event(THP_COLLAPSE_ALLOC);
1096 if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
1097 folio_put(folio);
1098 *foliop = NULL;
1099 return SCAN_CGROUP_CHARGE_FAIL;
1100 }
1101
1102 count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
1103
1104 *foliop = folio;
1105 return SCAN_SUCCEED;
1106 }
1107
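/*
 * Collapse the PMD-sized range at @address in @mm into a freshly allocated
 * hugepage. Entered with the mmap_lock held for read; returns with the
 * mmap_lock released.
 */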
1108 static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
1109 int referenced, int unmapped,
1110 struct collapse_control *cc)
1111 {
1112 LIST_HEAD(compound_pagelist);
1113 pmd_t *pmd, _pmd;
1114 pte_t *pte;
1115 pgtable_t pgtable;
1116 struct folio *folio;
1117 spinlock_t *pmd_ptl, *pte_ptl;
1118 int result = SCAN_FAIL;
1119 struct vm_area_struct *vma;
1120 struct mmu_notifier_range range;
1121
1122 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1123
1124 /*
1125 * Before allocating the hugepage, release the mmap_lock read lock.
1126 * The allocation can take potentially a long time if it involves
1127 * sync compaction, and we do not need to hold the mmap_lock during
1128 * that. We will recheck the vma after taking it again in write mode.
1129 */
1130 mmap_read_unlock(mm);
1131
1132 result = alloc_charge_folio(&folio, mm, cc);
1133 if (result != SCAN_SUCCEED)
1134 goto out_nolock;
1135
1136 mmap_read_lock(mm);
1137 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
1138 if (result != SCAN_SUCCEED) {
1139 mmap_read_unlock(mm);
1140 goto out_nolock;
1141 }
1142
1143 result = find_pmd_or_thp_or_none(mm, address, &pmd);
1144 if (result != SCAN_SUCCEED) {
1145 mmap_read_unlock(mm);
1146 goto out_nolock;
1147 }
1148
1149 if (unmapped) {
1150 /*
1151 * __collapse_huge_page_swapin will return with mmap_lock
1152 * released when it fails. So we jump out_nolock directly in
1153 * that case. Continuing to collapse causes inconsistency.
1154 */
1155 result = __collapse_huge_page_swapin(mm, vma, address, pmd,
1156 referenced);
1157 if (result != SCAN_SUCCEED)
1158 goto out_nolock;
1159 }
1160
1161 mmap_read_unlock(mm);
1162 /*
1163 * Prevent all access to the pagetables, with the exception of
1164 * GUP-fast (handled later by the PMD collapse flush) and the VM
1165 * (handled by the anon_vma lock + PG_lock).
1166 *
1167 * UFFDIO_MOVE is prevented from racing as well, thanks to the
1168 * mmap_lock.
1169 */
1170 mmap_write_lock(mm);
1171 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
1172 if (result != SCAN_SUCCEED)
1173 goto out_up_write;
1174 /* check if the pmd is still valid */
1175 result = check_pmd_still_valid(mm, address, pmd);
1176 if (result != SCAN_SUCCEED)
1177 goto out_up_write;
1178
1179 vma_start_write(vma);
1180 anon_vma_lock_write(vma->anon_vma);
1181
1182 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
1183 address + HPAGE_PMD_SIZE);
1184 mmu_notifier_invalidate_range_start(&range);
1185
1186 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1187 /*
1188 * This removes any huge TLB entry from the CPU so we won't allow
1189 * huge and small TLB entries for the same virtual address to
1190 * avoid the risk of CPU bugs in that area.
1191 *
1192 * Parallel GUP-fast is fine since GUP-fast will back off when
1193 * it detects PMD is changed.
1194 */
1195 _pmd = pmdp_collapse_flush(vma, address, pmd);
1196 spin_unlock(pmd_ptl);
1197 mmu_notifier_invalidate_range_end(&range);
1198 tlb_remove_table_sync_one();
1199
1200 pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
1201 if (pte) {
1202 result = __collapse_huge_page_isolate(vma, address, pte, cc,
1203 &compound_pagelist);
1204 spin_unlock(pte_ptl);
1205 } else {
1206 result = SCAN_PMD_NULL;
1207 }
1208
1209 if (unlikely(result != SCAN_SUCCEED)) {
1210 if (pte)
1211 pte_unmap(pte);
1212 spin_lock(pmd_ptl);
1213 BUG_ON(!pmd_none(*pmd));
1214 /*
1215 * We can only use set_pmd_at when establishing
1216 * hugepmds and never for establishing regular pmds that
1217 * point to regular pagetables. Use pmd_populate for that.
1218 */
1219 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1220 spin_unlock(pmd_ptl);
1221 anon_vma_unlock_write(vma->anon_vma);
1222 goto out_up_write;
1223 }
1224
1225 /*
1226 * All pages are isolated and locked so anon_vma rmap
1227 * can't run anymore.
1228 */
1229 anon_vma_unlock_write(vma->anon_vma);
1230
1231 result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
1232 vma, address, pte_ptl,
1233 &compound_pagelist);
1234 pte_unmap(pte);
1235 if (unlikely(result != SCAN_SUCCEED))
1236 goto out_up_write;
1237
1238 /*
1239 * The smp_wmb() inside __folio_mark_uptodate() ensures the
1240 * copy_huge_page writes become visible before the set_pmd_at()
1241 * write.
1242 */
1243 __folio_mark_uptodate(folio);
1244 pgtable = pmd_pgtable(_pmd);
1245
1246 _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
1247 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1248
1249 spin_lock(pmd_ptl);
1250 BUG_ON(!pmd_none(*pmd));
1251 folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
1252 folio_add_lru_vma(folio, vma);
1253 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1254 set_pmd_at(mm, address, pmd, _pmd);
1255 update_mmu_cache_pmd(vma, address, pmd);
1256 deferred_split_folio(folio, false);
1257 spin_unlock(pmd_ptl);
1258
1259 folio = NULL;
1260
1261 result = SCAN_SUCCEED;
1262 out_up_write:
1263 mmap_write_unlock(mm);
1264 out_nolock:
1265 if (folio)
1266 folio_put(folio);
1267 trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
1268 return result;
1269 }
1270
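/*
 * Scan one PMD-sized range and decide whether it is worth collapsing;
 * if so, hand it over to collapse_huge_page(), which drops the mmap_lock.
 */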
1271 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
1272 struct vm_area_struct *vma,
1273 unsigned long address, bool *mmap_locked,
1274 struct collapse_control *cc)
1275 {
1276 pmd_t *pmd;
1277 pte_t *pte, *_pte;
1278 int result = SCAN_FAIL, referenced = 0;
1279 int none_or_zero = 0, shared = 0;
1280 struct page *page = NULL;
1281 struct folio *folio = NULL;
1282 unsigned long _address;
1283 spinlock_t *ptl;
1284 int node = NUMA_NO_NODE, unmapped = 0;
1285 bool writable = false;
1286
1287 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1288
1289 result = find_pmd_or_thp_or_none(mm, address, &pmd);
1290 if (result != SCAN_SUCCEED)
1291 goto out;
1292
1293 memset(cc->node_load, 0, sizeof(cc->node_load));
1294 nodes_clear(cc->alloc_nmask);
1295 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1296 if (!pte) {
1297 result = SCAN_PMD_NULL;
1298 goto out;
1299 }
1300
1301 for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
1302 _pte++, _address += PAGE_SIZE) {
1303 pte_t pteval = ptep_get(_pte);
1304 if (is_swap_pte(pteval)) {
1305 ++unmapped;
1306 if (!cc->is_khugepaged ||
1307 unmapped <= khugepaged_max_ptes_swap) {
1308 /*
1309 * Always be strict with uffd-wp
1310 * enabled swap entries. Please see
1311 * comment below for pte_uffd_wp().
1312 */
1313 if (pte_swp_uffd_wp_any(pteval)) {
1314 result = SCAN_PTE_UFFD_WP;
1315 goto out_unmap;
1316 }
1317 continue;
1318 } else {
1319 result = SCAN_EXCEED_SWAP_PTE;
1320 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
1321 goto out_unmap;
1322 }
1323 }
1324 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
1325 ++none_or_zero;
1326 if (!userfaultfd_armed(vma) &&
1327 (!cc->is_khugepaged ||
1328 none_or_zero <= khugepaged_max_ptes_none)) {
1329 continue;
1330 } else {
1331 result = SCAN_EXCEED_NONE_PTE;
1332 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
1333 goto out_unmap;
1334 }
1335 }
1336 if (pte_uffd_wp(pteval)) {
1337 /*
1338 * Don't collapse the page if any of the small
1339 * PTEs are armed with uffd write protection.
1340 * We could instead mark the new huge pmd as
1341 * write protected if any of the small ones is
1342 * marked, but that could deliver unexpected
1343 * userfault messages that fall outside of
1344 * the registered range. So, just keep it simple.
1345 */
1346 result = SCAN_PTE_UFFD_WP;
1347 goto out_unmap;
1348 }
1349 if (pte_write(pteval))
1350 writable = true;
1351
1352 page = vm_normal_page(vma, _address, pteval);
1353 if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
1354 result = SCAN_PAGE_NULL;
1355 goto out_unmap;
1356 }
1357 folio = page_folio(page);
1358
1359 if (!folio_test_anon(folio)) {
1360 result = SCAN_PAGE_ANON;
1361 goto out_unmap;
1362 }
1363
1364 /*
1365 * We treat a single page as shared if any part of the THP
1366 * is shared.
1367 */
1368 if (folio_maybe_mapped_shared(folio)) {
1369 ++shared;
1370 if (cc->is_khugepaged &&
1371 shared > khugepaged_max_ptes_shared) {
1372 result = SCAN_EXCEED_SHARED_PTE;
1373 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
1374 goto out_unmap;
1375 }
1376 }
1377
1378 /*
1379 * Record which node the original page is from and save this
1380 * information to cc->node_load[].
1381 * Khugepaged will allocate the hugepage from the node that has
1382 * the highest hit count.
1383 */
1384 node = folio_nid(folio);
1385 if (hpage_collapse_scan_abort(node, cc)) {
1386 result = SCAN_SCAN_ABORT;
1387 goto out_unmap;
1388 }
1389 cc->node_load[node]++;
1390 if (!folio_test_lru(folio)) {
1391 result = SCAN_PAGE_LRU;
1392 goto out_unmap;
1393 }
1394 if (folio_test_locked(folio)) {
1395 result = SCAN_PAGE_LOCK;
1396 goto out_unmap;
1397 }
1398
1399 /*
1400 * Check if the page has any GUP (or other external) pins.
1401 *
1402 * Here the check may be racy:
1403 * it may see folio_mapcount() > folio_ref_count().
1404 * But such a case is ephemeral, so we could always retry the
1405 * collapse later. It may also report a false positive if the
1406 * page has excessive GUP pins (i.e. 512), but the same check
1407 * will be done again later, so the risk seems low.
1408 */
1409 if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
1410 result = SCAN_PAGE_COUNT;
1411 goto out_unmap;
1412 }
1413
1414 /*
1415 * If collapse was initiated by khugepaged, check that there are
1416 * enough young ptes to justify collapsing the page.
1417 */
1418 if (cc->is_khugepaged &&
1419 (pte_young(pteval) || folio_test_young(folio) ||
1420 folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
1421 address)))
1422 referenced++;
1423 }
1424 if (!writable) {
1425 result = SCAN_PAGE_RO;
1426 } else if (cc->is_khugepaged &&
1427 (!referenced ||
1428 (unmapped && referenced < HPAGE_PMD_NR / 2))) {
1429 result = SCAN_LACK_REFERENCED_PAGE;
1430 } else {
1431 result = SCAN_SUCCEED;
1432 }
1433 out_unmap:
1434 pte_unmap_unlock(pte, ptl);
1435 if (result == SCAN_SUCCEED) {
1436 result = collapse_huge_page(mm, address, referenced,
1437 unmapped, cc);
1438 /* collapse_huge_page will return with the mmap_lock released */
1439 *mmap_locked = false;
1440 }
1441 out:
1442 trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
1443 none_or_zero, result, unmapped);
1444 return result;
1445 }
1446
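/*
 * Called with khugepaged_mm_lock held: if the owning mm has exited, drop
 * its mm_slot and the mm reference that khugepaged was holding.
 */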
1447 static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
1448 {
1449 struct mm_slot *slot = &mm_slot->slot;
1450 struct mm_struct *mm = slot->mm;
1451
1452 lockdep_assert_held(&khugepaged_mm_lock);
1453
1454 if (hpage_collapse_test_exit(mm)) {
1455 /* free mm_slot */
1456 hash_del(&slot->hash);
1457 list_del(&slot->mm_node);
1458
1459 /*
1460 * Not strictly needed because the mm exited already.
1461 *
1462 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1463 */
1464
1465 /* khugepaged_mm_lock actually not necessary for the below */
1466 mm_slot_free(mm_slot_cache, mm_slot);
1467 mmdrop(mm);
1468 }
1469 }
1470
1471 /* folio must be locked, and mmap_lock must be held */
1472 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
1473 pmd_t *pmdp, struct folio *folio, struct page *page)
1474 {
1475 struct vm_fault vmf = {
1476 .vma = vma,
1477 .address = addr,
1478 .flags = 0,
1479 .pmd = pmdp,
1480 };
1481
1482 mmap_assert_locked(vma->vm_mm);
1483
1484 if (do_set_pmd(&vmf, folio, page))
1485 return SCAN_FAIL;
1486
1487 folio_get(folio);
1488 return SCAN_SUCCEED;
1489 }
1490
1491 /**
1492 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
1493 * address haddr.
1494 *
1495 * @mm: process address space where collapse happens
1496 * @addr: THP collapse address
1497 * @install_pmd: If a huge PMD should be installed
1498 *
1499 * This function checks whether all the PTEs in the PMD are pointing to the
1500 * right THP. If so, retract the page table so the THP can refault in
1501 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
1502 */
1503 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
1504 bool install_pmd)
1505 {
1506 int nr_mapped_ptes = 0, result = SCAN_FAIL;
1507 unsigned int nr_batch_ptes;
1508 struct mmu_notifier_range range;
1509 bool notified = false;
1510 unsigned long haddr = addr & HPAGE_PMD_MASK;
1511 unsigned long end = haddr + HPAGE_PMD_SIZE;
1512 struct vm_area_struct *vma = vma_lookup(mm, haddr);
1513 struct folio *folio;
1514 pte_t *start_pte, *pte;
1515 pmd_t *pmd, pgt_pmd;
1516 spinlock_t *pml = NULL, *ptl;
1517 int i;
1518
1519 mmap_assert_locked(mm);
1520
1521 /* First check VMA found, in case page tables are being torn down */
1522 if (!vma || !vma->vm_file ||
1523 !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
1524 return SCAN_VMA_CHECK;
1525
1526 /* Fast check before locking page if already PMD-mapped */
1527 result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
1528 if (result == SCAN_PMD_MAPPED)
1529 return result;
1530
1531 /*
1532 * If we are here, we've succeeded in replacing all the native pages
1533 * in the page cache with a single hugepage. If a mm were to fault-in
1534 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
1535 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
1536 * analogously elide sysfs THP settings here.
1537 */
1538 if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
1539 return SCAN_VMA_CHECK;
1540
1541 /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
1542 if (userfaultfd_wp(vma))
1543 return SCAN_PTE_UFFD_WP;
1544
1545 folio = filemap_lock_folio(vma->vm_file->f_mapping,
1546 linear_page_index(vma, haddr));
1547 if (IS_ERR(folio))
1548 return SCAN_PAGE_NULL;
1549
1550 if (folio_order(folio) != HPAGE_PMD_ORDER) {
1551 result = SCAN_PAGE_COMPOUND;
1552 goto drop_folio;
1553 }
1554
1555 result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
1556 switch (result) {
1557 case SCAN_SUCCEED:
1558 break;
1559 case SCAN_PMD_NONE:
1560 /*
1561 * All pte entries have been removed and pmd cleared.
1562 * Skip all the pte checks and just update the pmd mapping.
1563 */
1564 goto maybe_install_pmd;
1565 default:
1566 goto drop_folio;
1567 }
1568
1569 result = SCAN_FAIL;
1570 start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
1571 if (!start_pte) /* mmap_lock + page lock should prevent this */
1572 goto drop_folio;
1573
1574 /* step 1: check all mapped PTEs are to the right huge page */
1575 for (i = 0, addr = haddr, pte = start_pte;
1576 i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1577 struct page *page;
1578 pte_t ptent = ptep_get(pte);
1579
1580 /* empty pte, skip */
1581 if (pte_none(ptent))
1582 continue;
1583
1584 /* page swapped out, abort */
1585 if (!pte_present(ptent)) {
1586 result = SCAN_PTE_NON_PRESENT;
1587 goto abort;
1588 }
1589
1590 page = vm_normal_page(vma, addr, ptent);
1591 if (WARN_ON_ONCE(page && is_zone_device_page(page)))
1592 page = NULL;
1593 /*
1594 * Note that uprobe, debugger, or MAP_PRIVATE may change the
1595 * page table, but the new page will not be a subpage of hpage.
1596 */
1597 if (folio_page(folio, i) != page)
1598 goto abort;
1599 }
1600
1601 pte_unmap_unlock(start_pte, ptl);
1602 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1603 haddr, haddr + HPAGE_PMD_SIZE);
1604 mmu_notifier_invalidate_range_start(&range);
1605 notified = true;
1606
1607 /*
1608 * pmd_lock covers a wider range than ptl, and (if split from mm's
1609 * page_table_lock) ptl nests inside pml. The less time we hold pml,
1610 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
1611 * inserts a valid as-if-COWed PTE without even looking up page cache.
1612 * So page lock of folio does not protect from it, so we must not drop
1613 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
1614 */
1615 if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
1616 pml = pmd_lock(mm, pmd);
1617
1618 start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
1619 if (!start_pte) /* mmap_lock + page lock should prevent this */
1620 goto abort;
1621 if (!pml)
1622 spin_lock(ptl);
1623 else if (ptl != pml)
1624 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1625
1626 if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
1627 goto abort;
1628
1629 /* step 2: clear page table and adjust rmap */
1630 for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
1631 i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
1632 pte += nr_batch_ptes) {
1633 unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
1634 struct page *page;
1635 pte_t ptent = ptep_get(pte);
1636
1637 nr_batch_ptes = 1;
1638
1639 if (pte_none(ptent))
1640 continue;
1641 /*
1642 * We dropped ptl after the first scan, to do the mmu_notifier:
1643 * page lock stops more PTEs of the folio being faulted in, but
1644 * does not stop write faults COWing anon copies from existing
1645 * PTEs; and does not stop those being swapped out or migrated.
1646 */
1647 if (!pte_present(ptent)) {
1648 result = SCAN_PTE_NON_PRESENT;
1649 goto abort;
1650 }
1651 page = vm_normal_page(vma, addr, ptent);
1652
1653 if (folio_page(folio, i) != page)
1654 goto abort;
1655
1656 nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
1657
1658 /*
1659 * Must clear entry, or a racing truncate may re-remove it.
1660 * TLB flush can be left until pmdp_collapse_flush() does it.
1661 * PTE dirty? Shmem page is already dirty; file is read-only.
1662 */
1663 clear_ptes(mm, addr, pte, nr_batch_ptes);
1664 folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
1665 nr_mapped_ptes += nr_batch_ptes;
1666 }
1667
1668 if (!pml)
1669 spin_unlock(ptl);
1670
1671 /* step 3: set proper refcount and mm_counters. */
1672 if (nr_mapped_ptes) {
1673 folio_ref_sub(folio, nr_mapped_ptes);
1674 add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
1675 }
1676
1677 /* step 4: remove empty page table */
1678 if (!pml) {
1679 pml = pmd_lock(mm, pmd);
1680 if (ptl != pml) {
1681 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1682 if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
1683 flush_tlb_mm(mm);
1684 goto unlock;
1685 }
1686 }
1687 }
1688 pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
1689 pmdp_get_lockless_sync();
1690 pte_unmap_unlock(start_pte, ptl);
1691 if (ptl != pml)
1692 spin_unlock(pml);
1693
1694 mmu_notifier_invalidate_range_end(&range);
1695
1696 mm_dec_nr_ptes(mm);
1697 page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
1698 pte_free_defer(mm, pmd_pgtable(pgt_pmd));
1699
1700 maybe_install_pmd:
1701 /* step 5: install pmd entry */
1702 result = install_pmd
1703 ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
1704 : SCAN_SUCCEED;
1705 goto drop_folio;
1706 abort:
1707 if (nr_mapped_ptes) {
1708 flush_tlb_mm(mm);
1709 folio_ref_sub(folio, nr_mapped_ptes);
1710 add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
1711 }
1712 unlock:
1713 if (start_pte)
1714 pte_unmap_unlock(start_pte, ptl);
1715 if (pml && pml != ptl)
1716 spin_unlock(pml);
1717 if (notified)
1718 mmu_notifier_invalidate_range_end(&range);
1719 drop_folio:
1720 folio_unlock(folio);
1721 folio_put(folio);
1722 return result;
1723 }
1724
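/*
 * After a file/shmem range has been collapsed, walk every VMA that maps
 * @pgoff of @mapping and try to free the now-empty PTE page table, so that
 * a later fault can map the huge folio with a PMD.
 */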
1725 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1726 {
1727 struct vm_area_struct *vma;
1728
1729 i_mmap_lock_read(mapping);
1730 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1731 struct mmu_notifier_range range;
1732 struct mm_struct *mm;
1733 unsigned long addr;
1734 pmd_t *pmd, pgt_pmd;
1735 spinlock_t *pml;
1736 spinlock_t *ptl;
1737 bool success = false;
1738
1739 /*
1740 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1741 * got written to. These VMAs are likely not worth removing
1742 * page tables from, as PMD-mapping is likely to be split later.
1743 */
1744 if (READ_ONCE(vma->anon_vma))
1745 continue;
1746
1747 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1748 if (addr & ~HPAGE_PMD_MASK ||
1749 vma->vm_end < addr + HPAGE_PMD_SIZE)
1750 continue;
1751
1752 mm = vma->vm_mm;
1753 if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
1754 continue;
1755
1756 if (hpage_collapse_test_exit(mm))
1757 continue;
1758 /*
1759 * When a vma is registered with uffd-wp, we cannot recycle
1760 * the page table because there may be pte markers installed.
1761 * Other vmas can still have the same file mapped hugely, but
1762 * skip this one: it will always be mapped in small page size
1763 * for uffd-wp registered ranges.
1764 */
1765 if (userfaultfd_wp(vma))
1766 continue;
1767
1768 /* PTEs were notified when unmapped; but now for the PMD? */
1769 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1770 addr, addr + HPAGE_PMD_SIZE);
1771 mmu_notifier_invalidate_range_start(&range);
1772
1773 pml = pmd_lock(mm, pmd);
1774 /*
1775 * The lock of new_folio is still held, we will be blocked in
1776 * the page fault path, which prevents the pte entries from
1777 * being set again. So even though the old empty PTE page may be
1778 * concurrently freed and a new PTE page is filled into the pmd
1779 * entry, it is still empty and can be removed.
1780 *
1781 * So here we only need to recheck if the state of pmd entry
1782 * still meets our requirements, rather than checking pmd_same()
1783 * like elsewhere.
1784 */
1785 if (check_pmd_state(pmd) != SCAN_SUCCEED)
1786 goto drop_pml;
1787 ptl = pte_lockptr(mm, pmd);
1788 if (ptl != pml)
1789 spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1790
1791 /*
1792 * Huge page lock is still held, so normally the page table
1793 * must remain empty; and we have already skipped anon_vma
1794 * and userfaultfd_wp() vmas. But since the mmap_lock is not
1795 * held, it is still possible for a racing userfaultfd_ioctl()
1796 * to have inserted ptes or markers. Now that we hold ptlock,
1797 * repeating the anon_vma check protects from one category,
1798 * and repeating the userfaultfd_wp() check from another.
1799 */
1800 if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
1801 pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
1802 pmdp_get_lockless_sync();
1803 success = true;
1804 }
1805
1806 if (ptl != pml)
1807 spin_unlock(ptl);
1808 drop_pml:
1809 spin_unlock(pml);
1810
1811 mmu_notifier_invalidate_range_end(&range);
1812
1813 if (success) {
1814 mm_dec_nr_ptes(mm);
1815 page_table_check_pte_clear_range(mm, addr, pgt_pmd);
1816 pte_free_defer(mm, pmd_pgtable(pgt_pmd));
1817 }
1818 }
1819 i_mmap_unlock_read(mapping);
1820 }
1821
1822 /**
1823  * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one.
1824 *
1825 * @mm: process address space where collapse happens
1826 * @addr: virtual collapse start address
1827  * @file: file that the collapse operates on
1828  * @start: page index in @file at which the collapse starts
1829 * @cc: collapse context and scratchpad
1830 *
1831 * Basic scheme is simple, details are more complex:
1832 * - allocate and lock a new huge page;
1833 * - scan page cache, locking old pages
1834 * + swap/gup in pages if necessary;
1835 * - copy data to new page
1836 * - handle shmem holes
1837 * + re-validate that holes weren't filled by someone else
1838 * + check for userfaultfd
1839 * - finalize updates to the page cache;
1840 * - if replacing succeeds:
1841 * + unlock huge page;
1842 * + free old pages;
1843  *    - if replacing fails:
1844 * + unlock old pages
1845 * + unlock and free huge page;
1846 */
1847 static int collapse_file(struct mm_struct *mm, unsigned long addr,
1848 struct file *file, pgoff_t start,
1849 struct collapse_control *cc)
1850 {
1851 struct address_space *mapping = file->f_mapping;
1852 struct page *dst;
1853 struct folio *folio, *tmp, *new_folio;
1854 pgoff_t index = 0, end = start + HPAGE_PMD_NR;
1855 LIST_HEAD(pagelist);
1856 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1857 int nr_none = 0, result = SCAN_SUCCEED;
1858 bool is_shmem = shmem_file(file);
1859
1860 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
1861 VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1862
1863 result = alloc_charge_folio(&new_folio, mm, cc);
1864 if (result != SCAN_SUCCEED)
1865 goto out;
1866
1867 mapping_set_update(&xas, mapping);
1868
1869 __folio_set_locked(new_folio);
1870 if (is_shmem)
1871 __folio_set_swapbacked(new_folio);
1872 new_folio->index = start;
1873 new_folio->mapping = mapping;
1874
1875 /*
1876 * Ensure we have slots for all the pages in the range. This is
1877 	 * almost certainly a no-op because most of the pages must be present.
1878 */
1879 do {
1880 xas_lock_irq(&xas);
1881 xas_create_range(&xas);
1882 if (!xas_error(&xas))
1883 break;
1884 xas_unlock_irq(&xas);
1885 if (!xas_nomem(&xas, GFP_KERNEL)) {
1886 result = SCAN_FAIL;
1887 goto rollback;
1888 }
1889 } while (1);
1890
1891 for (index = start; index < end;) {
1892 xas_set(&xas, index);
1893 folio = xas_load(&xas);
1894
1895 VM_BUG_ON(index != xas.xa_index);
1896 if (is_shmem) {
1897 if (!folio) {
1898 /*
1899 * Stop if extent has been truncated or
1900 * hole-punched, and is now completely
1901 * empty.
1902 */
1903 if (index == start) {
1904 if (!xas_next_entry(&xas, end - 1)) {
1905 result = SCAN_TRUNCATED;
1906 goto xa_locked;
1907 }
1908 }
1909 nr_none++;
1910 index++;
1911 continue;
1912 }
1913
1914 if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
1915 xas_unlock_irq(&xas);
1916 /* swap in or instantiate fallocated page */
1917 if (shmem_get_folio(mapping->host, index, 0,
1918 &folio, SGP_NOALLOC)) {
1919 result = SCAN_FAIL;
1920 goto xa_unlocked;
1921 }
1922 /* drain lru cache to help folio_isolate_lru() */
1923 lru_add_drain();
1924 } else if (folio_trylock(folio)) {
1925 folio_get(folio);
1926 xas_unlock_irq(&xas);
1927 } else {
1928 result = SCAN_PAGE_LOCK;
1929 goto xa_locked;
1930 }
1931 } else { /* !is_shmem */
1932 if (!folio || xa_is_value(folio)) {
1933 xas_unlock_irq(&xas);
1934 page_cache_sync_readahead(mapping, &file->f_ra,
1935 file, index,
1936 end - index);
1937 /* drain lru cache to help folio_isolate_lru() */
1938 lru_add_drain();
1939 folio = filemap_lock_folio(mapping, index);
1940 if (IS_ERR(folio)) {
1941 result = SCAN_FAIL;
1942 goto xa_unlocked;
1943 }
1944 } else if (folio_test_dirty(folio)) {
1945 /*
1946 * khugepaged only works on read-only fd,
1947 * so this page is dirty because it hasn't
1948 * been flushed since first write. There
1949 * won't be new dirty pages.
1950 *
1951 * Trigger async flush here and hope the
1952 * writeback is done when khugepaged
1953 * revisits this page.
1954 *
1955 * This is a one-off situation. We are not
1956 				 * forcing writeback in a loop.
1957 */
1958 xas_unlock_irq(&xas);
1959 filemap_flush(mapping);
1960 result = SCAN_FAIL;
1961 goto xa_unlocked;
1962 } else if (folio_test_writeback(folio)) {
1963 xas_unlock_irq(&xas);
1964 result = SCAN_FAIL;
1965 goto xa_unlocked;
1966 } else if (folio_trylock(folio)) {
1967 folio_get(folio);
1968 xas_unlock_irq(&xas);
1969 } else {
1970 result = SCAN_PAGE_LOCK;
1971 goto xa_locked;
1972 }
1973 }
1974
1975 /*
1976 * The folio must be locked, so we can drop the i_pages lock
1977 * without racing with truncate.
1978 */
1979 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1980
1981 /* make sure the folio is up to date */
1982 if (unlikely(!folio_test_uptodate(folio))) {
1983 result = SCAN_FAIL;
1984 goto out_unlock;
1985 }
1986
1987 /*
1988 * If file was truncated then extended, or hole-punched, before
1989 * we locked the first folio, then a THP might be there already.
1990 * This will be discovered on the first iteration.
1991 */
1992 if (folio_order(folio) == HPAGE_PMD_ORDER &&
1993 folio->index == start) {
1994 /* Maybe PMD-mapped */
1995 result = SCAN_PTE_MAPPED_HUGEPAGE;
1996 goto out_unlock;
1997 }
1998
1999 if (folio_mapping(folio) != mapping) {
2000 result = SCAN_TRUNCATED;
2001 goto out_unlock;
2002 }
2003
2004 if (!is_shmem && (folio_test_dirty(folio) ||
2005 folio_test_writeback(folio))) {
2006 /*
2007 * khugepaged only works on read-only fd, so this
2008 * folio is dirty because it hasn't been flushed
2009 * since first write.
2010 */
2011 result = SCAN_FAIL;
2012 goto out_unlock;
2013 }
2014
2015 if (!folio_isolate_lru(folio)) {
2016 result = SCAN_DEL_PAGE_LRU;
2017 goto out_unlock;
2018 }
2019
2020 if (!filemap_release_folio(folio, GFP_KERNEL)) {
2021 result = SCAN_PAGE_HAS_PRIVATE;
2022 folio_putback_lru(folio);
2023 goto out_unlock;
2024 }
2025
2026 if (folio_mapped(folio))
2027 try_to_unmap(folio,
2028 TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
2029
2030 xas_lock_irq(&xas);
2031
2032 VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);
2033
2034 /*
2035 * We control 2 + nr_pages references to the folio:
2036 * - we hold a pin on it;
2037 		 *  - nr_pages references from the page cache;
2038 		 *  - one from folio_isolate_lru();
2039 * If those are the only references, then any new usage
2040 * of the folio will have to fetch it from the page
2041 * cache. That requires locking the folio to handle
2042 * truncate, so any new usage will be blocked until we
2043 * unlock folio after collapse/during rollback.
2044 */
2045 if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
2046 result = SCAN_PAGE_COUNT;
2047 xas_unlock_irq(&xas);
2048 folio_putback_lru(folio);
2049 goto out_unlock;
2050 }
2051
2052 /*
2053 * Accumulate the folios that are being collapsed.
2054 */
2055 list_add_tail(&folio->lru, &pagelist);
2056 index += folio_nr_pages(folio);
2057 continue;
2058 out_unlock:
2059 folio_unlock(folio);
2060 folio_put(folio);
2061 goto xa_unlocked;
2062 }
2063
2064 if (!is_shmem) {
2065 filemap_nr_thps_inc(mapping);
2066 /*
2067 * Paired with the fence in do_dentry_open() -> get_write_access()
2068 * to ensure i_writecount is up to date and the update to nr_thps
2069 * is visible. Ensures the page cache will be truncated if the
2070 * file is opened writable.
2071 */
2072 smp_mb();
2073 if (inode_is_open_for_write(mapping->host)) {
2074 result = SCAN_FAIL;
2075 filemap_nr_thps_dec(mapping);
2076 }
2077 }
2078
2079 xa_locked:
2080 xas_unlock_irq(&xas);
2081 xa_unlocked:
2082
2083 /*
2084 * If collapse is successful, flush must be done now before copying.
2085 * If collapse is unsuccessful, does flush actually need to be done?
2086 * Do it anyway, to clear the state.
2087 */
2088 try_to_unmap_flush();
2089
2090 if (result == SCAN_SUCCEED && nr_none &&
2091 !shmem_charge(mapping->host, nr_none))
2092 result = SCAN_FAIL;
2093 if (result != SCAN_SUCCEED) {
2094 nr_none = 0;
2095 goto rollback;
2096 }
2097
2098 /*
2099 * The old folios are locked, so they won't change anymore.
2100 */
2101 index = start;
2102 dst = folio_page(new_folio, 0);
2103 list_for_each_entry(folio, &pagelist, lru) {
2104 int i, nr_pages = folio_nr_pages(folio);
2105
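		/* Zero-fill any holes (nr_none entries) that precede this folio. */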
2106 while (index < folio->index) {
2107 clear_highpage(dst);
2108 index++;
2109 dst++;
2110 }
2111
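		/*
		 * A non-zero return from copy_mc_highpage() means the copy was
		 * cut short by a hardware memory error (machine check), so
		 * give up and roll back.
		 */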
2112 for (i = 0; i < nr_pages; i++) {
2113 if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
2114 result = SCAN_COPY_MC;
2115 goto rollback;
2116 }
2117 index++;
2118 dst++;
2119 }
2120 }
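	/* Zero-fill any holes left after the last folio in the range. */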
2121 while (index < end) {
2122 clear_highpage(dst);
2123 index++;
2124 dst++;
2125 }
2126
2127 if (nr_none) {
2128 struct vm_area_struct *vma;
2129 int nr_none_check = 0;
2130
2131 i_mmap_lock_read(mapping);
2132 xas_lock_irq(&xas);
2133
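		/*
		 * Re-count the holes, temporarily plugging each with
		 * XA_RETRY_ENTRY.  A mismatch with nr_none means a hole was
		 * filled behind our back; the retry entries also prevent any
		 * newly registered userfaultfd from observing a missing page
		 * (see the comment below).
		 */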
2134 xas_set(&xas, start);
2135 for (index = start; index < end; index++) {
2136 if (!xas_next(&xas)) {
2137 xas_store(&xas, XA_RETRY_ENTRY);
2138 if (xas_error(&xas)) {
2139 result = SCAN_STORE_FAILED;
2140 goto immap_locked;
2141 }
2142 nr_none_check++;
2143 }
2144 }
2145
2146 if (nr_none != nr_none_check) {
2147 result = SCAN_PAGE_FILLED;
2148 goto immap_locked;
2149 }
2150
2151 /*
2152 * If userspace observed a missing page in a VMA with
2153 * a MODE_MISSING userfaultfd, then it might expect a
2154 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
2155 		 * roll back to avoid suppressing such an event. The wp/minor
2156 		 * userfaultfd modes don't give userspace any guarantee that
2157 		 * the kernel won't fill a missing page with a zero page, so
2158 		 * they don't matter here.
2159 *
2160 * Any userfaultfds registered after this point will
2161 * not be able to observe any missing pages due to the
2162 * previously inserted retry entries.
2163 */
2164 vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
2165 if (userfaultfd_missing(vma)) {
2166 result = SCAN_EXCEED_NONE_PTE;
2167 goto immap_locked;
2168 }
2169 }
2170
2171 immap_locked:
2172 i_mmap_unlock_read(mapping);
2173 if (result != SCAN_SUCCEED) {
2174 xas_set(&xas, start);
2175 for (index = start; index < end; index++) {
2176 if (xas_next(&xas) == XA_RETRY_ENTRY)
2177 xas_store(&xas, NULL);
2178 }
2179
2180 xas_unlock_irq(&xas);
2181 goto rollback;
2182 }
2183 } else {
2184 xas_lock_irq(&xas);
2185 }
2186
2187 if (is_shmem)
2188 __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
2189 else
2190 __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
2191
2192 if (nr_none) {
2193 __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
2194 /* nr_none is always 0 for non-shmem. */
2195 __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
2196 }
2197
2198 /*
2199 * Mark new_folio as uptodate before inserting it into the
2200 	 * page cache so that it isn't mistaken for a fallocated but
2201 * unwritten page.
2202 */
2203 folio_mark_uptodate(new_folio);
2204 folio_ref_add(new_folio, HPAGE_PMD_NR - 1);
2205
2206 if (is_shmem)
2207 folio_mark_dirty(new_folio);
2208 folio_add_lru(new_folio);
2209
2210 /* Join all the small entries into a single multi-index entry. */
2211 xas_set_order(&xas, start, HPAGE_PMD_ORDER);
2212 xas_store(&xas, new_folio);
2213 WARN_ON_ONCE(xas_error(&xas));
2214 xas_unlock_irq(&xas);
2215
2216 /*
2217 * Remove pte page tables, so we can re-fault the page as huge.
2218 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
2219 */
2220 retract_page_tables(mapping, start);
2221 if (cc && !cc->is_khugepaged)
2222 result = SCAN_PTE_MAPPED_HUGEPAGE;
2223 folio_unlock(new_folio);
2224
2225 /*
2226 * The collapse has succeeded, so free the old folios.
2227 */
2228 list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
2229 list_del(&folio->lru);
2230 folio->mapping = NULL;
2231 folio_clear_active(folio);
2232 folio_clear_unevictable(folio);
2233 folio_unlock(folio);
2234 folio_put_refs(folio, 2 + folio_nr_pages(folio));
2235 }
2236
2237 goto out;
2238
2239 rollback:
2240 /* Something went wrong: roll back page cache changes */
2241 if (nr_none) {
2242 xas_lock_irq(&xas);
2243 mapping->nrpages -= nr_none;
2244 xas_unlock_irq(&xas);
2245 shmem_uncharge(mapping->host, nr_none);
2246 }
2247
2248 list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
2249 list_del(&folio->lru);
2250 folio_unlock(folio);
2251 folio_putback_lru(folio);
2252 folio_put(folio);
2253 }
2254 /*
2255 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
2256 * file only. This undo is not needed unless failure is
2257 * due to SCAN_COPY_MC.
2258 */
2259 if (!is_shmem && result == SCAN_COPY_MC) {
2260 filemap_nr_thps_dec(mapping);
2261 /*
2262 * Paired with the fence in do_dentry_open() -> get_write_access()
2263 * to ensure the update to nr_thps is visible.
2264 */
2265 smp_mb();
2266 }
2267
2268 new_folio->mapping = NULL;
2269
2270 folio_unlock(new_folio);
2271 folio_put(new_folio);
2272 out:
2273 VM_BUG_ON(!list_empty(&pagelist));
2274 trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
2275 return result;
2276 }
2277
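/*
 * hpage_collapse_scan_file - scan one PMD-sized extent of @file's page cache.
 *
 * Walk the HPAGE_PMD_NR slots starting at @start under RCU, counting present
 * pages and swap entries.  Bail out early if the extent is already backed by
 * a PMD-order folio, if a folio is off the LRU or has an unexpected refcount,
 * if the node choice looks bad, or (for khugepaged) if too many slots are
 * swapped out.  If the scan succeeds (and, for khugepaged, enough of the
 * range is populated), hand the extent to collapse_file().
 */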
2278 static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
2279 struct file *file, pgoff_t start,
2280 struct collapse_control *cc)
2281 {
2282 struct folio *folio = NULL;
2283 struct address_space *mapping = file->f_mapping;
2284 XA_STATE(xas, &mapping->i_pages, start);
2285 int present, swap;
2286 int node = NUMA_NO_NODE;
2287 int result = SCAN_SUCCEED;
2288
2289 present = 0;
2290 swap = 0;
2291 memset(cc->node_load, 0, sizeof(cc->node_load));
2292 nodes_clear(cc->alloc_nmask);
2293 rcu_read_lock();
2294 xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
2295 if (xas_retry(&xas, folio))
2296 continue;
2297
2298 if (xa_is_value(folio)) {
2299 swap += 1 << xas_get_order(&xas);
2300 if (cc->is_khugepaged &&
2301 swap > khugepaged_max_ptes_swap) {
2302 result = SCAN_EXCEED_SWAP_PTE;
2303 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
2304 break;
2305 }
2306 continue;
2307 }
2308
2309 if (!folio_try_get(folio)) {
2310 xas_reset(&xas);
2311 continue;
2312 }
2313
2314 if (unlikely(folio != xas_reload(&xas))) {
2315 folio_put(folio);
2316 xas_reset(&xas);
2317 continue;
2318 }
2319
2320 if (folio_order(folio) == HPAGE_PMD_ORDER &&
2321 folio->index == start) {
2322 /* Maybe PMD-mapped */
2323 result = SCAN_PTE_MAPPED_HUGEPAGE;
2324 /*
2325 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
2326 * by the caller won't touch the page cache, and so
2327 * it's safe to skip LRU and refcount checks before
2328 * returning.
2329 */
2330 folio_put(folio);
2331 break;
2332 }
2333
2334 node = folio_nid(folio);
2335 if (hpage_collapse_scan_abort(node, cc)) {
2336 result = SCAN_SCAN_ABORT;
2337 folio_put(folio);
2338 break;
2339 }
2340 cc->node_load[node]++;
2341
2342 if (!folio_test_lru(folio)) {
2343 result = SCAN_PAGE_LRU;
2344 folio_put(folio);
2345 break;
2346 }
2347
2348 if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
2349 result = SCAN_PAGE_COUNT;
2350 folio_put(folio);
2351 break;
2352 }
2353
2354 /*
2355 * We probably should check if the folio is referenced
2356 * here, but nobody would transfer pte_young() to
2357 * folio_test_referenced() for us. And rmap walk here
2358 * is just too costly...
2359 */
2360
2361 present += folio_nr_pages(folio);
2362 folio_put(folio);
2363
2364 if (need_resched()) {
2365 xas_pause(&xas);
2366 cond_resched_rcu();
2367 }
2368 }
2369 rcu_read_unlock();
2370
2371 if (result == SCAN_SUCCEED) {
2372 if (cc->is_khugepaged &&
2373 present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
2374 result = SCAN_EXCEED_NONE_PTE;
2375 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
2376 } else {
2377 result = collapse_file(mm, addr, file, start, cc);
2378 }
2379 }
2380
2381 trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
2382 return result;
2383 }
2384
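/*
 * khugepaged_scan_mm_slot - scan up to @pages worth of PTEs for collapse.
 *
 * Resumes from the cursor saved in khugepaged_scan, walking the VMAs of the
 * current mm and advancing in HPAGE_PMD_SIZE steps; the last scan status is
 * reported through @result.  khugepaged_mm_lock is dropped while scanning
 * and re-taken before returning.  The mm_slot is released once its mm has
 * been fully scanned or is about to exit.  Returns the progress made.
 */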
2385 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
2386 struct collapse_control *cc)
2387 __releases(&khugepaged_mm_lock)
2388 __acquires(&khugepaged_mm_lock)
2389 {
2390 struct vma_iterator vmi;
2391 struct khugepaged_mm_slot *mm_slot;
2392 struct mm_slot *slot;
2393 struct mm_struct *mm;
2394 struct vm_area_struct *vma;
2395 int progress = 0;
2396
2397 VM_BUG_ON(!pages);
2398 lockdep_assert_held(&khugepaged_mm_lock);
2399 *result = SCAN_FAIL;
2400
2401 if (khugepaged_scan.mm_slot) {
2402 mm_slot = khugepaged_scan.mm_slot;
2403 slot = &mm_slot->slot;
2404 } else {
2405 slot = list_entry(khugepaged_scan.mm_head.next,
2406 struct mm_slot, mm_node);
2407 mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
2408 khugepaged_scan.address = 0;
2409 khugepaged_scan.mm_slot = mm_slot;
2410 }
2411 spin_unlock(&khugepaged_mm_lock);
2412
2413 mm = slot->mm;
2414 /*
2415 * Don't wait for semaphore (to avoid long wait times). Just move to
2416 * the next mm on the list.
2417 */
2418 vma = NULL;
2419 if (unlikely(!mmap_read_trylock(mm)))
2420 goto breakouterloop_mmap_lock;
2421
2422 progress++;
2423 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2424 goto breakouterloop;
2425
2426 vma_iter_init(&vmi, mm, khugepaged_scan.address);
2427 for_each_vma(vmi, vma) {
2428 unsigned long hstart, hend;
2429
2430 cond_resched();
2431 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
2432 progress++;
2433 break;
2434 }
2435 if (!thp_vma_allowable_order(vma, vma->vm_flags,
2436 TVA_ENFORCE_SYSFS, PMD_ORDER)) {
2437 skip:
2438 progress++;
2439 continue;
2440 }
2441 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2442 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2443 if (khugepaged_scan.address > hend)
2444 goto skip;
2445 if (khugepaged_scan.address < hstart)
2446 khugepaged_scan.address = hstart;
2447 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2448
2449 while (khugepaged_scan.address < hend) {
2450 bool mmap_locked = true;
2451
2452 cond_resched();
2453 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2454 goto breakouterloop;
2455
2456 VM_BUG_ON(khugepaged_scan.address < hstart ||
2457 khugepaged_scan.address + HPAGE_PMD_SIZE >
2458 hend);
2459 if (!vma_is_anonymous(vma)) {
2460 struct file *file = get_file(vma->vm_file);
2461 pgoff_t pgoff = linear_page_index(vma,
2462 khugepaged_scan.address);
2463
2464 mmap_read_unlock(mm);
2465 mmap_locked = false;
2466 *result = hpage_collapse_scan_file(mm,
2467 khugepaged_scan.address, file, pgoff, cc);
2468 fput(file);
2469 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
2470 mmap_read_lock(mm);
2471 if (hpage_collapse_test_exit_or_disable(mm))
2472 goto breakouterloop;
2473 *result = collapse_pte_mapped_thp(mm,
2474 khugepaged_scan.address, false);
2475 if (*result == SCAN_PMD_MAPPED)
2476 *result = SCAN_SUCCEED;
2477 mmap_read_unlock(mm);
2478 }
2479 } else {
2480 *result = hpage_collapse_scan_pmd(mm, vma,
2481 khugepaged_scan.address, &mmap_locked, cc);
2482 }
2483
2484 if (*result == SCAN_SUCCEED)
2485 ++khugepaged_pages_collapsed;
2486
2487 /* move to next address */
2488 khugepaged_scan.address += HPAGE_PMD_SIZE;
2489 progress += HPAGE_PMD_NR;
2490 if (!mmap_locked)
2491 /*
2492 * We released mmap_lock so break loop. Note
2493 * that we drop mmap_lock before all hugepage
2494 * allocations, so if allocation fails, we are
2495 * guaranteed to break here and report the
2496 * correct result back to caller.
2497 */
2498 goto breakouterloop_mmap_lock;
2499 if (progress >= pages)
2500 goto breakouterloop;
2501 }
2502 }
2503 breakouterloop:
2504 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2505 breakouterloop_mmap_lock:
2506
2507 spin_lock(&khugepaged_mm_lock);
2508 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2509 /*
2510 * Release the current mm_slot if this mm is about to die, or
2511 * if we scanned all vmas of this mm.
2512 */
2513 if (hpage_collapse_test_exit(mm) || !vma) {
2514 /*
2515 * Make sure that if mm_users is reaching zero while
2516 * khugepaged runs here, khugepaged_exit will find
2517 * mm_slot not pointing to the exiting mm.
2518 */
2519 if (slot->mm_node.next != &khugepaged_scan.mm_head) {
2520 slot = list_entry(slot->mm_node.next,
2521 struct mm_slot, mm_node);
2522 khugepaged_scan.mm_slot =
2523 mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
2524 khugepaged_scan.address = 0;
2525 } else {
2526 khugepaged_scan.mm_slot = NULL;
2527 khugepaged_full_scans++;
2528 }
2529
2530 collect_mm_slot(mm_slot);
2531 }
2532
2533 return progress;
2534 }
2535
2536 static int khugepaged_has_work(void)
2537 {
2538 return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
2539 }
2540
2541 static int khugepaged_wait_event(void)
2542 {
2543 return !list_empty(&khugepaged_scan.mm_head) ||
2544 kthread_should_stop();
2545 }
2546
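/*
 * One scan pass: drain the LRU caches, then keep calling
 * khugepaged_scan_mm_slot() until khugepaged_pages_to_scan pages have been
 * scanned, the scan has started over from the head of the mm list twice, or
 * the thread is asked to stop.  On the first hugepage allocation failure,
 * back off with khugepaged_alloc_sleep(); on the second, abandon the pass.
 */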
2547 static void khugepaged_do_scan(struct collapse_control *cc)
2548 {
2549 unsigned int progress = 0, pass_through_head = 0;
2550 unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
2551 bool wait = true;
2552 int result = SCAN_SUCCEED;
2553
2554 lru_add_drain_all();
2555
2556 while (true) {
2557 cond_resched();
2558
2559 if (unlikely(kthread_should_stop()))
2560 break;
2561
2562 spin_lock(&khugepaged_mm_lock);
2563 if (!khugepaged_scan.mm_slot)
2564 pass_through_head++;
2565 if (khugepaged_has_work() &&
2566 pass_through_head < 2)
2567 progress += khugepaged_scan_mm_slot(pages - progress,
2568 &result, cc);
2569 else
2570 progress = pages;
2571 spin_unlock(&khugepaged_mm_lock);
2572
2573 if (progress >= pages)
2574 break;
2575
2576 if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
2577 /*
2578 			 * If we fail to allocate the first time, sleep for a
2579 			 * while. If allocation fails again, cancel the scan.
2580 */
2581 if (!wait)
2582 break;
2583 wait = false;
2584 khugepaged_alloc_sleep();
2585 }
2586 }
2587 }
2588
2589 static bool khugepaged_should_wakeup(void)
2590 {
2591 return kthread_should_stop() ||
2592 time_after_eq(jiffies, khugepaged_sleep_expire);
2593 }
2594
2595 static void khugepaged_wait_work(void)
2596 {
2597 if (khugepaged_has_work()) {
2598 const unsigned long scan_sleep_jiffies =
2599 msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
2600
2601 if (!scan_sleep_jiffies)
2602 return;
2603
2604 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
2605 wait_event_freezable_timeout(khugepaged_wait,
2606 khugepaged_should_wakeup(),
2607 scan_sleep_jiffies);
2608 return;
2609 }
2610
2611 if (hugepage_pmd_enabled())
2612 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2613 }
2614
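/*
 * Main loop of the khugepaged kernel thread: run scan passes, sleeping
 * (freezable) between them, until asked to stop; then release any mm_slot
 * the scan cursor still points at.
 */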
2615 static int khugepaged(void *none)
2616 {
2617 struct khugepaged_mm_slot *mm_slot;
2618
2619 set_freezable();
2620 set_user_nice(current, MAX_NICE);
2621
2622 while (!kthread_should_stop()) {
2623 khugepaged_do_scan(&khugepaged_collapse_control);
2624 khugepaged_wait_work();
2625 }
2626
2627 spin_lock(&khugepaged_mm_lock);
2628 mm_slot = khugepaged_scan.mm_slot;
2629 khugepaged_scan.mm_slot = NULL;
2630 if (mm_slot)
2631 collect_mm_slot(mm_slot);
2632 spin_unlock(&khugepaged_mm_lock);
2633 return 0;
2634 }
2635
2636 static void set_recommended_min_free_kbytes(void)
2637 {
2638 struct zone *zone;
2639 int nr_zones = 0;
2640 unsigned long recommended_min;
2641
2642 if (!hugepage_pmd_enabled()) {
2643 calculate_min_free_kbytes();
2644 goto update_wmarks;
2645 }
2646
2647 for_each_populated_zone(zone) {
2648 /*
2649 * We don't need to worry about fragmentation of
2650 * ZONE_MOVABLE since it only has movable pages.
2651 */
2652 if (zone_idx(zone) > gfp_zone(GFP_USER))
2653 continue;
2654
2655 nr_zones++;
2656 }
2657
2658 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
2659 recommended_min = pageblock_nr_pages * nr_zones * 2;
2660
2661 /*
2662 * Make sure that on average at least two pageblocks are almost free
2663 * of another type, one for a migratetype to fall back to and a
2664 	 * second to avoid subsequent fallbacks of other types. There are 3
2665 * MIGRATE_TYPES we care about.
2666 */
2667 recommended_min += pageblock_nr_pages * nr_zones *
2668 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
2669
2670 	/* don't ever allow reserving more than 5% of the lowmem */
2671 recommended_min = min(recommended_min,
2672 (unsigned long) nr_free_buffer_pages() / 20);
2673 recommended_min <<= (PAGE_SHIFT-10);
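	/*
	 * Worked example with assumed values (not derived from any particular
	 * system): 4 KiB pages, 2 MiB pageblocks (pageblock_nr_pages == 512)
	 * and two eligible zones give 512 * 2 * 2 = 2048 pages of base
	 * reserve plus 512 * 2 * 3 * 3 = 9216 pages for migratetype
	 * fallbacks, i.e. 11264 pages == 45056 kB after the shift above,
	 * provided the 5%-of-lowmem clamp did not apply.
	 */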
2674
2675 if (recommended_min > min_free_kbytes) {
2676 if (user_min_free_kbytes >= 0)
2677 pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
2678 min_free_kbytes, recommended_min);
2679
2680 min_free_kbytes = recommended_min;
2681 }
2682
2683 update_wmarks:
2684 setup_per_zone_wmarks();
2685 }
2686
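/*
 * Start the khugepaged thread when PMD-sized THP is enabled, or stop it when
 * it is not, and refresh min_free_kbytes to match.  Serialized by
 * khugepaged_mutex.
 */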
2687 int start_stop_khugepaged(void)
2688 {
2689 int err = 0;
2690
2691 mutex_lock(&khugepaged_mutex);
2692 if (hugepage_pmd_enabled()) {
2693 if (!khugepaged_thread)
2694 khugepaged_thread = kthread_run(khugepaged, NULL,
2695 "khugepaged");
2696 if (IS_ERR(khugepaged_thread)) {
2697 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
2698 err = PTR_ERR(khugepaged_thread);
2699 khugepaged_thread = NULL;
2700 goto fail;
2701 }
2702
2703 if (!list_empty(&khugepaged_scan.mm_head))
2704 wake_up_interruptible(&khugepaged_wait);
2705 } else if (khugepaged_thread) {
2706 kthread_stop(khugepaged_thread);
2707 khugepaged_thread = NULL;
2708 }
2709 set_recommended_min_free_kbytes();
2710 fail:
2711 mutex_unlock(&khugepaged_mutex);
2712 return err;
2713 }
2714
2715 void khugepaged_min_free_kbytes_update(void)
2716 {
2717 mutex_lock(&khugepaged_mutex);
2718 if (hugepage_pmd_enabled() && khugepaged_thread)
2719 set_recommended_min_free_kbytes();
2720 mutex_unlock(&khugepaged_mutex);
2721 }
2722
2723 bool current_is_khugepaged(void)
2724 {
2725 return kthread_func(current) == khugepaged;
2726 }
2727
2728 static int madvise_collapse_errno(enum scan_result r)
2729 {
2730 /*
2731 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
2732 * actionable feedback to caller, so they may take an appropriate
2733 * fallback measure depending on the nature of the failure.
2734 */
2735 switch (r) {
2736 case SCAN_ALLOC_HUGE_PAGE_FAIL:
2737 return -ENOMEM;
2738 case SCAN_CGROUP_CHARGE_FAIL:
2739 case SCAN_EXCEED_NONE_PTE:
2740 return -EBUSY;
2741 	/* Resource temporarily unavailable - trying again might succeed */
2742 case SCAN_PAGE_COUNT:
2743 case SCAN_PAGE_LOCK:
2744 case SCAN_PAGE_LRU:
2745 case SCAN_DEL_PAGE_LRU:
2746 case SCAN_PAGE_FILLED:
2747 return -EAGAIN;
2748 /*
2749 	 * Other: trying again is unlikely to succeed, or the error is intrinsic
2750 	 * to the specified memory range. khugepaged likely won't be able to
2751 	 * collapse it either.
2752 */
2753 default:
2754 return -EINVAL;
2755 }
2756 }
2757
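/*
 * For reference, a minimal userspace sketch of how this path is reached via
 * madvise(2) with MADV_COLLAPSE (illustrative only; headers are omitted and
 * the 2 MiB size is an assumption, not something defined in this file):
 *
 *	void *buf = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	memset(buf, 1, 2UL << 20);
 *	if (madvise(buf, 2UL << 20, MADV_COLLAPSE))
 *		perror("madvise(MADV_COLLAPSE)");
 */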
2758 int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
2759 unsigned long end, bool *lock_dropped)
2760 {
2761 struct collapse_control *cc;
2762 struct mm_struct *mm = vma->vm_mm;
2763 unsigned long hstart, hend, addr;
2764 int thps = 0, last_fail = SCAN_FAIL;
2765 bool mmap_locked = true;
2766
2767 BUG_ON(vma->vm_start > start);
2768 BUG_ON(vma->vm_end < end);
2769
2770 if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
2771 return -EINVAL;
2772
2773 cc = kmalloc(sizeof(*cc), GFP_KERNEL);
2774 if (!cc)
2775 return -ENOMEM;
2776 cc->is_khugepaged = false;
2777
2778 mmgrab(mm);
2779 lru_add_drain_all();
2780
2781 hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2782 hend = end & HPAGE_PMD_MASK;
2783
2784 for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
2785 int result = SCAN_FAIL;
2786
2787 if (!mmap_locked) {
2788 cond_resched();
2789 mmap_read_lock(mm);
2790 mmap_locked = true;
2791 result = hugepage_vma_revalidate(mm, addr, false, &vma,
2792 cc);
2793 if (result != SCAN_SUCCEED) {
2794 last_fail = result;
2795 goto out_nolock;
2796 }
2797
2798 hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
2799 }
2800 mmap_assert_locked(mm);
2801 memset(cc->node_load, 0, sizeof(cc->node_load));
2802 nodes_clear(cc->alloc_nmask);
2803 if (!vma_is_anonymous(vma)) {
2804 struct file *file = get_file(vma->vm_file);
2805 pgoff_t pgoff = linear_page_index(vma, addr);
2806
2807 mmap_read_unlock(mm);
2808 mmap_locked = false;
2809 result = hpage_collapse_scan_file(mm, addr, file, pgoff,
2810 cc);
2811 fput(file);
2812 } else {
2813 result = hpage_collapse_scan_pmd(mm, vma, addr,
2814 &mmap_locked, cc);
2815 }
2816 if (!mmap_locked)
2817 *lock_dropped = true;
2818
2819 handle_result:
2820 switch (result) {
2821 case SCAN_SUCCEED:
2822 case SCAN_PMD_MAPPED:
2823 ++thps;
2824 break;
2825 case SCAN_PTE_MAPPED_HUGEPAGE:
2826 BUG_ON(mmap_locked);
2827 mmap_read_lock(mm);
2828 result = collapse_pte_mapped_thp(mm, addr, true);
2829 mmap_read_unlock(mm);
2830 goto handle_result;
2831 /* Whitelisted set of results where continuing OK */
2832 case SCAN_PMD_NULL:
2833 case SCAN_PTE_NON_PRESENT:
2834 case SCAN_PTE_UFFD_WP:
2835 case SCAN_PAGE_RO:
2836 case SCAN_LACK_REFERENCED_PAGE:
2837 case SCAN_PAGE_NULL:
2838 case SCAN_PAGE_COUNT:
2839 case SCAN_PAGE_LOCK:
2840 case SCAN_PAGE_COMPOUND:
2841 case SCAN_PAGE_LRU:
2842 case SCAN_DEL_PAGE_LRU:
2843 last_fail = result;
2844 break;
2845 default:
2846 last_fail = result;
2847 /* Other error, exit */
2848 goto out_maybelock;
2849 }
2850 }
2851
2852 out_maybelock:
2853 /* Caller expects us to hold mmap_lock on return */
2854 if (!mmap_locked)
2855 mmap_read_lock(mm);
2856 out_nolock:
2857 mmap_assert_locked(mm);
2858 mmdrop(mm);
2859 kfree(cc);
2860
2861 return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
2862 : madvise_collapse_errno(last_fail);
2863 }
2864