Lines Matching +full:ext +full:- +full:gen

1 // SPDX-License-Identifier: GPL-2.0
31 #include <linux/backing-dev.h>
45 #include <linux/memory-tiers.h>
189 if ((_folio)->lru.prev != _base) { \
192 prev = lru_to_folio(&(_folio->lru)); \
193 prefetchw(&prev->_field); \
210 return sc->target_mem_cgroup; in cgroup_reclaim()
219 return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); in root_reclaim()
223 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
248 if (sc->proactive && sc->proactive_swappiness) in sc_swappiness()
249 return *sc->proactive_swappiness; in sc_swappiness()
274 /* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
286 for ((idx) = 0, (zone) = (pgdat)->node_zones; \
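
A minimal, self-contained sketch of the iteration pattern this helper macro implies (walk node_zones[0..highest] and skip zones with no managed pages). The struct layouts, the managed_pages test and all names below are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

#define MAX_NR_ZONES 4

struct zone { unsigned long managed_pages; };
struct pgdat { struct zone node_zones[MAX_NR_ZONES]; };

#define for_each_managed_zone_pgdat(zone, pgdat, idx, highest)		\
	for ((idx) = 0, (zone) = (pgdat)->node_zones;			\
	     (idx) <= (highest);					\
	     (idx)++, (zone)++)						\
		if (!(zone)->managed_pages) {} else

int main(void)
{
	struct pgdat node = { .node_zones = { { 10 }, { 0 }, { 5 }, { 7 } } };
	struct zone *z;
	int i;

	/* visits zones 0, 2 and 3; zone 1 has nothing managed and is skipped */
	for_each_managed_zone_pgdat(z, &node, i, MAX_NR_ZONES - 1)
		printf("zone %d: %lu managed pages\n", i, z->managed_pages);
	return 0;
}
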
297 WARN_ON_ONCE(rs && task->reclaim_state); in set_task_reclaim_state()
299 /* Check for the nulling of an already-nulled member */ in set_task_reclaim_state()
300 WARN_ON_ONCE(!rs && !task->reclaim_state); in set_task_reclaim_state()
302 task->reclaim_state = rs; in set_task_reclaim_state()
306 * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
307 * scan_control->nr_reclaimed.
312 * Currently, reclaim_state->reclaimed includes three types of pages in flush_reclaim_state()
319 * single memcg. For example, a memcg-aware shrinker can free one object in flush_reclaim_state()
322 * overestimating the reclaimed amount (potentially under-reclaiming). in flush_reclaim_state()
324 * Only count such pages for global reclaim to prevent under-reclaiming in flush_reclaim_state()
339 if (current->reclaim_state && root_reclaim(sc)) { in flush_reclaim_state()
340 sc->nr_reclaimed += current->reclaim_state->reclaimed; in flush_reclaim_state()
341 current->reclaim_state->reclaimed = 0; in flush_reclaim_state()
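
A small userspace sketch of the accounting rule the comment above describes: pages freed outside LRU-based reclaim are credited to the scan totals only for global (non-memcg-targeted) reclaim, then consumed so they are not counted twice. All names here are illustrative, not kernel types.

#include <stdbool.h>
#include <stdio.h>

struct scan_totals { unsigned long nr_reclaimed; };
struct reclaim_side { unsigned long reclaimed; };	/* e.g. pages freed by shrinkers */

static void flush_side_reclaim(struct scan_totals *sc, struct reclaim_side *rs,
			       bool global_reclaim)
{
	if (global_reclaim) {		/* only credit reclaim that is not memcg-targeted */
		sc->nr_reclaimed += rs->reclaimed;
		rs->reclaimed = 0;	/* consume so it is not counted twice */
	}
}

int main(void)
{
	struct scan_totals sc = { .nr_reclaimed = 100 };
	struct reclaim_side rs = { .reclaimed = 7 };

	flush_side_reclaim(&sc, &rs, true);
	printf("%lu\n", sc.nr_reclaimed);	/* 107 */
	return 0;
}
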
349 if (sc && sc->no_demotion) in can_demote()
363 * For non-memcg reclaim, is there in can_reclaim_anon_pages()
397 * If there are no reclaimable file-backed or anonymous pages, in zone_reclaimable_pages()
408 * lruvec_lru_size - Returns the number of pages on the given LRU list.
411 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
461 BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
462 PGDEMOTE_##type - PGDEMOTE_KSWAPD); \
463 BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
464 PGSCAN_##type - PGSCAN_KSWAPD); \
476 return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; in reclaimer_offset()
477 if (sc->proactive) in reclaimer_offset()
478 return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; in reclaimer_offset()
479 return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; in reclaimer_offset()
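
The BUILD_BUG_ON/offset lines above rely on parallel enums: the PGSTEAL, PGSCAN and PGDEMOTE counter families are laid out in the same reclaimer order, so one offset computed from the PGSTEAL family indexes all of them. A standalone sketch of that trick, with made-up enum names and _Static_assert standing in for BUILD_BUG_ON:

#include <stdio.h>

enum { STEAL_KSWAPD, STEAL_DIRECT, STEAL_KHUGEPAGED, STEAL_PROACTIVE, NR_STEAL };
enum { SCAN_KSWAPD = 100, SCAN_DIRECT, SCAN_KHUGEPAGED, SCAN_PROACTIVE };

_Static_assert(STEAL_DIRECT - STEAL_KSWAPD == SCAN_DIRECT - SCAN_KSWAPD,
	       "counter families must stay in the same reclaimer order");

static unsigned long events[200];

int main(void)
{
	int offset = STEAL_KHUGEPAGED - STEAL_KSWAPD;	/* which reclaimer is running */

	events[STEAL_KSWAPD + offset]++;	/* bumps STEAL_KHUGEPAGED */
	events[SCAN_KSWAPD + offset]++;		/* bumps SCAN_KHUGEPAGED */
	printf("%lu %lu\n", events[STEAL_KHUGEPAGED], events[SCAN_KHUGEPAGED]);
	return 0;
}
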
487 * private data at folio->private. in is_page_cache_freeable()
489 return folio_ref_count(folio) - folio_test_private(folio) == in is_page_cache_freeable()
495 * -ENOSPC. We need to propagate that into the address_space for a subsequent
523 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) in skip_throttle_noprogress()
531 for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) { in skip_throttle_noprogress()
544 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; in reclaim_throttle()
554 current->flags & (PF_USER_WORKER|PF_KTHREAD)) { in reclaim_throttle()
562 * parallel reclaimers which is a short-lived event so the timeout is in reclaim_throttle()
564 * potentially long-lived events so use a longer timeout. This is shaky in reclaim_throttle()
573 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { in reclaim_throttle()
574 WRITE_ONCE(pgdat->nr_reclaim_start, in reclaim_throttle()
604 atomic_dec(&pgdat->nr_writeback_throttled); in reclaim_throttle()
606 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), in reclaim_throttle()
607 jiffies_to_usecs(timeout - ret), in reclaim_throttle()
624 * This is an inaccurate read as the per-cpu deltas may not in __acct_reclaim_writeback()
630 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - in __acct_reclaim_writeback()
631 READ_ONCE(pgdat->nr_reclaim_start); in __acct_reclaim_writeback()
634 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); in __acct_reclaim_writeback()
651 * Calls ->writepage().
658 * will be non-blocking. To prevent this allocation from being in pageout()
677 * folio->mapping == NULL while being dirty with clean buffers. in pageout()
688 if (mapping->a_ops->writepage == NULL) in pageout()
711 res = mapping->a_ops->writepage(&folio->page, &wbc); in pageout()
745 spin_lock(&mapping->host->i_lock); in __remove_mapping()
746 xa_lock_irq(&mapping->i_pages); in __remove_mapping()
766 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags in __remove_mapping()
767 * load is not satisfied before that of folio->_refcount. in __remove_mapping()
782 swp_entry_t swap = folio->swap; in __remove_mapping()
788 xa_unlock_irq(&mapping->i_pages); in __remove_mapping()
793 free_folio = mapping->a_ops->free_folio; in __remove_mapping()
814 xa_unlock_irq(&mapping->i_pages); in __remove_mapping()
816 inode_add_lru(mapping->host); in __remove_mapping()
817 spin_unlock(&mapping->host->i_lock); in __remove_mapping()
826 xa_unlock_irq(&mapping->i_pages); in __remove_mapping()
828 spin_unlock(&mapping->host->i_lock); in __remove_mapping()
833 * remove_mapping() - Attempt to remove a folio from its mapping.
859 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
891 set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); in lru_gen_set_refs()
895 set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); in lru_gen_set_refs()
911 referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, in folio_check_references()
924 * 2) Skip the non-shared swapbacked folio mapped solely by in folio_check_references()
925 * the exiting or OOM-reaped process. in folio_check_references()
927 if (referenced_ptes == -1) in folio_check_references()
960 * Activate file-backed executable folios after first usage. in folio_check_references()
1004 if (mapping && mapping->a_ops->is_dirty_writeback) in folio_check_dirty_writeback()
1005 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); in folio_check_dirty_writeback()
1016 allowed_mask = mtc->nmask; in alloc_migrate_folio()
1026 mtc->nmask = NULL; in alloc_migrate_folio()
1027 mtc->gfp_mask |= __GFP_THISNODE; in alloc_migrate_folio()
1032 mtc->gfp_mask &= ~__GFP_THISNODE; in alloc_migrate_folio()
1033 mtc->nmask = allowed_mask; in alloc_migrate_folio()
1045 int target_nid = next_demotion_node(pgdat->node_id); in demote_folio_list()
1085 * We can "enter_fs" for swap-cache with only __GFP_IO in may_enter_fs()
1087 * ->flags can be updated non-atomically (scan_swap_map_slots), in may_enter_fs()
1112 do_demote_pass = can_demote(pgdat->node_id, sc); in shrink_folio_list()
1125 list_del(&folio->lru); in shrink_folio_list()
1142 sc->nr_scanned += nr_pages; in shrink_folio_list()
1147 if (!sc->may_unmap && folio_mapped(folio)) in shrink_folio_list()
1157 stat->nr_dirty += nr_pages; in shrink_folio_list()
1160 stat->nr_unqueued_dirty += nr_pages; in shrink_folio_list()
1169 stat->nr_congested += nr_pages; in shrink_folio_list()
1219 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { in shrink_folio_list()
1220 stat->nr_immediate += nr_pages; in shrink_folio_list()
1226 !may_enter_fs(folio, sc->gfp_mask)) { in shrink_folio_list()
1228 * This is slightly racy - in shrink_folio_list()
1232 * interpreted as the readahead flag - but in shrink_folio_list()
1242 stat->nr_writeback += nr_pages; in shrink_folio_list()
1250 list_add_tail(&folio->lru, folio_list); in shrink_folio_list()
1262 stat->nr_ref_keep += nr_pages; in shrink_folio_list()
1275 list_add(&folio->lru, &demote_folios); in shrink_folio_list()
1287 if (!(sc->gfp_mask & __GFP_IO)) in shrink_folio_list()
1299 if (data_race(!list_empty(&folio->_deferred_list) && in shrink_folio_list()
1344 sc->nr_scanned -= (nr_pages - 1); in shrink_folio_list()
1375 stat->nr_unmap_fail += nr_pages; in shrink_folio_list()
1378 stat->nr_lazyfree_fail += nr_pages; in shrink_folio_list()
1398 * injecting inefficient single-folio I/O into in shrink_folio_list()
1409 !test_bit(PGDAT_DIRTY, &pgdat->flags))) { in shrink_folio_list()
1425 if (!may_enter_fs(folio, sc->gfp_mask)) in shrink_folio_list()
1427 if (!sc->may_writepage) in shrink_folio_list()
1446 sc->nr_scanned -= (nr_pages - 1); in shrink_folio_list()
1452 sc->nr_scanned -= (nr_pages - 1); in shrink_folio_list()
1455 stat->nr_pageout += nr_pages; in shrink_folio_list()
1463 * A synchronous write - probably a ramdisk. Go in shrink_folio_list()
1491 * and mark the folio clean - it can be freed. in shrink_folio_list()
1493 * Rarely, folios can have buffers and no ->mapping. in shrink_folio_list()
1502 if (!filemap_release_folio(folio, sc->gfp_mask)) in shrink_folio_list()
1537 sc->target_mem_cgroup)) in shrink_folio_list()
1562 sc->nr_scanned -= (nr_pages - 1); in shrink_folio_list()
1574 stat->nr_activate[type] += nr_pages; in shrink_folio_list()
1580 list_add(&folio->lru, &ret_folios); in shrink_folio_list()
1589 stat->nr_demoted += nr_demoted; in shrink_folio_list()
1611 if (!sc->proactive) { in shrink_folio_list()
1617 pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; in shrink_folio_list()
1649 list_move(&folio->lru, &clean_folios); in reclaim_clean_pages_from_list()
1660 nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, in reclaim_clean_pages_from_list()
1665 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, in reclaim_clean_pages_from_list()
1666 -(long)nr_reclaimed); in reclaim_clean_pages_from_list()
1673 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, in reclaim_clean_pages_from_list()
1675 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, in reclaim_clean_pages_from_list()
1676 -(long)stat.nr_lazyfree_fail); in reclaim_clean_pages_from_list()
1693 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); in update_lru_sizes()
1701 * lruvec->lru_lock is heavily contended. Some of the functions that
1724 struct list_head *src = &lruvec->lists[lru]; in isolate_lru_folios()
1747 (folio_zonenum(folio) > sc->reclaim_idx)) { in isolate_lru_folios()
1765 if (!sc->may_unmap && folio_mapped(folio)) in isolate_lru_folios()
1770 * sure the folio is not being freed elsewhere -- the in isolate_lru_folios()
1786 list_move(&folio->lru, move_to); in isolate_lru_folios()
1809 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, in isolate_lru_folios()
1816 * folio_isolate_lru() - Try to isolate a folio from its LRU list.
1886 * won't get blocked by normal direct-reclaimers, forming a circular in too_many_isolated()
1889 if (gfp_has_io_fs(sc->gfp_mask)) in too_many_isolated()
1917 list_del(&folio->lru); in move_folios_to_lru()
1919 spin_unlock_irq(&lruvec->lru_lock); in move_folios_to_lru()
1921 spin_lock_irq(&lruvec->lru_lock); in move_folios_to_lru()
1933 * list_add(&folio->lru,) in move_folios_to_lru()
1934 * list_add(&folio->lru,) in move_folios_to_lru()
1943 spin_unlock_irq(&lruvec->lru_lock); in move_folios_to_lru()
1946 spin_lock_irq(&lruvec->lru_lock); in move_folios_to_lru()
1965 spin_unlock_irq(&lruvec->lru_lock); in move_folios_to_lru()
1968 spin_lock_irq(&lruvec->lru_lock); in move_folios_to_lru()
1975 * If a kernel thread (such as nfsd for loop-back mounts) services a backing
1981 return !(current->flags & PF_LOCAL_THROTTLE); in current_may_throttle()
2017 spin_lock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2029 spin_unlock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2036 spin_lock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2041 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); in shrink_inactive_list()
2047 spin_unlock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2049 lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); in shrink_inactive_list()
2077 sc->nr.dirty += stat.nr_dirty; in shrink_inactive_list()
2078 sc->nr.congested += stat.nr_congested; in shrink_inactive_list()
2079 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; in shrink_inactive_list()
2080 sc->nr.writeback += stat.nr_writeback; in shrink_inactive_list()
2081 sc->nr.immediate += stat.nr_immediate; in shrink_inactive_list()
2082 sc->nr.taken += nr_taken; in shrink_inactive_list()
2084 sc->nr.file_taken += nr_taken; in shrink_inactive_list()
2086 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, in shrink_inactive_list()
2087 nr_scanned, nr_reclaimed, &stat, sc->priority, file); in shrink_inactive_list()
2102 * It is safe to rely on the active flag against the non-LRU folios in here
2103 * because nobody will play with that bit on a non-LRU folio.
2105 * The downside is that we have to touch folio->_refcount against each folio.
2106 * But we had to alter folio->flags anyway.
2126 spin_lock_irq(&lruvec->lru_lock); in shrink_active_list()
2137 spin_unlock_irq(&lruvec->lru_lock); in shrink_active_list()
2144 list_del(&folio->lru); in shrink_active_list()
2160 if (folio_referenced(folio, 0, sc->target_mem_cgroup, in shrink_active_list()
2163 * Identify referenced, file-backed active folios and in shrink_active_list()
2167 * are not likely to be evicted by use-once streaming in shrink_active_list()
2173 list_add(&folio->lru, &l_active); in shrink_active_list()
2178 folio_clear_active(folio); /* we are de-activating */ in shrink_active_list()
2180 list_add(&folio->lru, &l_inactive); in shrink_active_list()
2186 spin_lock_irq(&lruvec->lru_lock); in shrink_active_list()
2194 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); in shrink_active_list()
2195 spin_unlock_irq(&lruvec->lru_lock); in shrink_active_list()
2199 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, in shrink_active_list()
2200 nr_deactivate, nr_rotated, sc->priority, file); in shrink_active_list()
2220 list_del(&folio->lru); in reclaim_folio_list()
2223 trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat); in reclaim_folio_list()
2246 list_move(&folio->lru, &node_folio_list); in reclaim_pages()
2265 if (sc->may_deactivate & (1 << is_file_lru(lru))) in shrink_list()
2268 sc->skipped_deactivate = 1; in shrink_list()
2280 * to the established workingset on the scan-resistant active list,
2294 * -------------------------------------
2313 gb = (inactive + active) >> (30 - PAGE_SHIFT); in inactive_is_low()
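
A worked example of the shift above: with the usual 4 KiB pages (PAGE_SHIFT == 12), right-shifting a page count by 30 - 12 = 18 divides by 2^18 pages per GiB, i.e. converts pages to whole gibibytes.

#include <stdio.h>

int main(void)
{
	unsigned long pages = 3UL << 18;		/* 3 GiB worth of 4 KiB pages */

	printf("%lu GiB\n", pages >> (30 - 12));	/* prints 3 */
	return 0;
}
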
2337 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); in prepare_scan_control()
2340 * Flush the memory cgroup stats in a rate-limited way as we don't need in prepare_scan_control()
2344 mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup); in prepare_scan_control()
2349 spin_lock_irq(&target_lruvec->lru_lock); in prepare_scan_control()
2350 sc->anon_cost = target_lruvec->anon_cost; in prepare_scan_control()
2351 sc->file_cost = target_lruvec->file_cost; in prepare_scan_control()
2352 spin_unlock_irq(&target_lruvec->lru_lock); in prepare_scan_control()
2358 if (!sc->force_deactivate) { in prepare_scan_control()
2368 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || in prepare_scan_control()
2370 sc->may_deactivate |= DEACTIVATE_ANON; in prepare_scan_control()
2372 sc->may_deactivate &= ~DEACTIVATE_ANON; in prepare_scan_control()
2376 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || in prepare_scan_control()
2378 sc->may_deactivate |= DEACTIVATE_FILE; in prepare_scan_control()
2380 sc->may_deactivate &= ~DEACTIVATE_FILE; in prepare_scan_control()
2382 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; in prepare_scan_control()
2390 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) && in prepare_scan_control()
2391 !sc->no_cache_trim_mode) in prepare_scan_control()
2392 sc->cache_trim_mode = 1; in prepare_scan_control()
2394 sc->cache_trim_mode = 0; in prepare_scan_control()
2411 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); in prepare_scan_control()
2415 for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) { in prepare_scan_control()
2426 sc->file_is_tiny = in prepare_scan_control()
2428 !(sc->may_deactivate & DEACTIVATE_ANON) && in prepare_scan_control()
2429 anon >> sc->priority; in prepare_scan_control()
2454 total_cost = sc->anon_cost + sc->file_cost; in calculate_pressure_balance()
2455 anon_cost = total_cost + sc->anon_cost; in calculate_pressure_balance()
2456 file_cost = total_cost + sc->file_cost; in calculate_pressure_balance()
2462 fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); in calculate_pressure_balance()
2489 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { in get_scan_count()
2511 if (!sc->priority && swappiness) { in get_scan_count()
2517 * If the system is almost out of file pages, force-scan anon. in get_scan_count()
2519 if (sc->file_is_tiny) { in get_scan_count()
2528 if (sc->cache_trim_mode) { in get_scan_count()
2543 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); in get_scan_count()
2544 mem_cgroup_protection(sc->target_mem_cgroup, memcg, in get_scan_count()
2554 * becomes extremely binary -- from nothing as we in get_scan_count()
2569 * the best-effort low protection. However, we still in get_scan_count()
2570 * ideally want to honor how well-behaved groups are in in get_scan_count()
2581 if (!sc->memcg_low_reclaim && low > min) { in get_scan_count()
2583 sc->memcg_low_skipped = 1; in get_scan_count()
2591 scan = lruvec_size - lruvec_size * protection / in get_scan_count()
2597 * sc->priority further than desirable. in get_scan_count()
2604 scan >>= sc->priority; in get_scan_count()
2623 * round-off error. in get_scan_count()
2657 return can_demote(pgdat->node_id, sc); in can_age_anon_pages()
2685 unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
2689 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
2690 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
2696 #define for_each_gen_type_zone(gen, type, zone) \ argument
2697 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
2713 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; in get_lruvec()
2716 if (!lruvec->pgdat) in get_lruvec()
2717 lruvec->pgdat = pgdat; in get_lruvec()
2724 return &pgdat->__lruvec; in get_lruvec()
2732 if (!sc->may_swap) in get_swappiness()
2735 if (!can_demote(pgdat->node_id, sc) && in get_swappiness()
2744 return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; in get_nr_gens()
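
The DEFINE_MAX_SEQ/for_each_gen_type_zone macros and get_nr_gens() above work on monotonically growing sequence numbers that wrap onto a small ring of generations. A minimal sketch of that seq-to-gen mapping and the triple (gen, type, zone) loop; the constants are illustrative, not the kernel's.

#include <stdio.h>

#define MAX_NR_GENS	4
#define ANON_AND_FILE	2
#define MAX_NR_ZONES	5

static int gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;	/* generations form a ring; seq only grows */
}

int main(void)
{
	int gen, type, zone, lists = 0;

	for (gen = 0; gen < MAX_NR_GENS; gen++)
		for (type = 0; type < ANON_AND_FILE; type++)
			for (zone = 0; zone < MAX_NR_ZONES; zone++)
				lists++;	/* one folio list per (gen, type, zone) */

	printf("%d lists, seq 7 maps to gen %d\n", lists, gen_from_seq(7));
	return 0;
}
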
2772 * To get rid of non-leaf entries that no longer have enough leaf entries, the
2773 * aging uses the double-buffering technique to flip to the other filter each
2774 * time it produces a new generation. For non-leaf entries that have enough
2800 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); in get_item_key()
2809 int gen = filter_gen_from_seq(seq); in test_bloom_filter() local
2811 filter = READ_ONCE(mm_state->filters[gen]); in test_bloom_filter()
2825 int gen = filter_gen_from_seq(seq); in update_bloom_filter() local
2827 filter = READ_ONCE(mm_state->filters[gen]); in update_bloom_filter()
2842 int gen = filter_gen_from_seq(seq); in reset_bloom_filter() local
2844 filter = mm_state->filters[gen]; in reset_bloom_filter()
2852 WRITE_ONCE(mm_state->filters[gen], filter); in reset_bloom_filter()
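
A hypothetical userspace sketch of the double-buffered Bloom filter scheme described above: two bitmaps alternate by generation parity, each item sets and tests two hash-derived bits, and the buffer becoming current is cleared when the generation flips. False positives are possible, false negatives are not. The sizes, the hash and every name below are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOOM_SHIFT	15
#define BLOOM_BITS	(1UL << BLOOM_SHIFT)
#define BITS_PER_WORD	(8 * sizeof(unsigned long))

static unsigned long filters[2][BLOOM_BITS / BITS_PER_WORD];

static void get_keys(const void *item, unsigned int *k0, unsigned int *k1)
{
	unsigned long long h = (uintptr_t)item * 0x9E3779B97F4A7C15ULL;	/* cheap hash */

	*k0 = h & (BLOOM_BITS - 1);
	*k1 = (h >> BLOOM_SHIFT) & (BLOOM_BITS - 1);
}

static void filter_add(unsigned long seq, const void *item)
{
	unsigned int k0, k1;
	unsigned long *filter = filters[seq & 1];	/* pick buffer by generation parity */

	get_keys(item, &k0, &k1);
	filter[k0 / BITS_PER_WORD] |= 1UL << (k0 % BITS_PER_WORD);
	filter[k1 / BITS_PER_WORD] |= 1UL << (k1 % BITS_PER_WORD);
}

static bool filter_test(unsigned long seq, const void *item)
{
	unsigned int k0, k1;
	const unsigned long *filter = filters[seq & 1];

	get_keys(item, &k0, &k1);
	return (filter[k0 / BITS_PER_WORD] >> (k0 % BITS_PER_WORD) & 1) &&
	       (filter[k1 / BITS_PER_WORD] >> (k1 % BITS_PER_WORD) & 1);
}

int main(void)
{
	int x;

	filter_add(3, &x);
	printf("%d %d\n", filter_test(3, &x), filter_test(4, &x));	/* 1 0 */
	memset(filters[(3 + 1) & 1], 0, sizeof(filters[0]));		/* clear on flip */
	return 0;
}
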
2870 return &memcg->mm_list; in get_mm_list()
2879 return &lruvec->mm_state; in get_mm_state()
2886 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in get_next_mm()
2887 struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); in get_next_mm()
2889 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); in get_next_mm()
2890 key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); in get_next_mm()
2892 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) in get_next_mm()
2895 clear_bit(key, &mm->lru_gen.bitmap); in get_next_mm()
2906 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); in lru_gen_add_mm()
2908 VM_WARN_ON_ONCE(mm->lru_gen.memcg); in lru_gen_add_mm()
2909 mm->lru_gen.memcg = memcg; in lru_gen_add_mm()
2911 spin_lock(&mm_list->lock); in lru_gen_add_mm()
2918 if (mm_state->tail == &mm_list->fifo) in lru_gen_add_mm()
2919 mm_state->tail = &mm->lru_gen.list; in lru_gen_add_mm()
2922 list_add_tail(&mm->lru_gen.list, &mm_list->fifo); in lru_gen_add_mm()
2924 spin_unlock(&mm_list->lock); in lru_gen_add_mm()
2933 if (list_empty(&mm->lru_gen.list)) in lru_gen_del_mm()
2937 memcg = mm->lru_gen.memcg; in lru_gen_del_mm()
2941 spin_lock(&mm_list->lock); in lru_gen_del_mm()
2948 if (mm_state->head == &mm->lru_gen.list) in lru_gen_del_mm()
2949 mm_state->head = mm_state->head->prev; in lru_gen_del_mm()
2952 if (mm_state->tail == &mm->lru_gen.list) in lru_gen_del_mm()
2953 mm_state->tail = mm_state->tail->next; in lru_gen_del_mm()
2956 list_del_init(&mm->lru_gen.list); in lru_gen_del_mm()
2958 spin_unlock(&mm_list->lock); in lru_gen_del_mm()
2961 mem_cgroup_put(mm->lru_gen.memcg); in lru_gen_del_mm()
2962 mm->lru_gen.memcg = NULL; in lru_gen_del_mm()
2970 struct task_struct *task = rcu_dereference_protected(mm->owner, true); in lru_gen_migrate_mm()
2972 VM_WARN_ON_ONCE(task->mm != mm); in lru_gen_migrate_mm()
2973 lockdep_assert_held(&task->alloc_lock); in lru_gen_migrate_mm()
2980 if (!mm->lru_gen.memcg) in lru_gen_migrate_mm()
2986 if (memcg == mm->lru_gen.memcg) in lru_gen_migrate_mm()
2989 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); in lru_gen_migrate_mm()
3019 struct lruvec *lruvec = walk->lruvec; in reset_mm_stats()
3022 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); in reset_mm_stats()
3024 hist = lru_hist_from_seq(walk->seq); in reset_mm_stats()
3027 WRITE_ONCE(mm_state->stats[hist][i], in reset_mm_stats()
3028 mm_state->stats[hist][i] + walk->mm_stats[i]); in reset_mm_stats()
3029 walk->mm_stats[i] = 0; in reset_mm_stats()
3033 hist = lru_hist_from_seq(walk->seq + 1); in reset_mm_stats()
3036 WRITE_ONCE(mm_state->stats[hist][i], 0); in reset_mm_stats()
3045 struct lruvec *lruvec = walk->lruvec; in iterate_mm_list()
3051 * mm_state->seq is incremented after each iteration of mm_list. There in iterate_mm_list()
3060 spin_lock(&mm_list->lock); in iterate_mm_list()
3062 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); in iterate_mm_list()
3064 if (walk->seq <= mm_state->seq) in iterate_mm_list()
3067 if (!mm_state->head) in iterate_mm_list()
3068 mm_state->head = &mm_list->fifo; in iterate_mm_list()
3070 if (mm_state->head == &mm_list->fifo) in iterate_mm_list()
3074 mm_state->head = mm_state->head->next; in iterate_mm_list()
3075 if (mm_state->head == &mm_list->fifo) { in iterate_mm_list()
3076 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); in iterate_mm_list()
3082 if (!mm_state->tail || mm_state->tail == mm_state->head) { in iterate_mm_list()
3083 mm_state->tail = mm_state->head->next; in iterate_mm_list()
3084 walk->force_scan = true; in iterate_mm_list()
3091 spin_unlock(&mm_list->lock); in iterate_mm_list()
3094 reset_bloom_filter(mm_state, walk->seq + 1); in iterate_mm_list()
3111 spin_lock(&mm_list->lock); in iterate_mm_list_nowalk()
3113 VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); in iterate_mm_list_nowalk()
3115 if (seq > mm_state->seq) { in iterate_mm_list_nowalk()
3116 mm_state->head = NULL; in iterate_mm_list_nowalk()
3117 mm_state->tail = NULL; in iterate_mm_list_nowalk()
3118 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); in iterate_mm_list_nowalk()
3122 spin_unlock(&mm_list->lock); in iterate_mm_list_nowalk()
3132 * A feedback loop based on a Proportional-Integral-Derivative (PID) controller.
3147 * 1. The D term may discount the other two terms over time so that long-lived
3160 struct lru_gen_folio *lrugen = &lruvec->lrugen; in read_ctrl_pos()
3161 int hist = lru_hist_from_seq(lrugen->min_seq[type]); in read_ctrl_pos()
3163 pos->gain = gain; in read_ctrl_pos()
3164 pos->refaulted = pos->total = 0; in read_ctrl_pos()
3166 for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { in read_ctrl_pos()
3167 pos->refaulted += lrugen->avg_refaulted[type][i] + in read_ctrl_pos()
3168 atomic_long_read(&lrugen->refaulted[hist][type][i]); in read_ctrl_pos()
3169 pos->total += lrugen->avg_total[type][i] + in read_ctrl_pos()
3170 lrugen->protected[hist][type][i] + in read_ctrl_pos()
3171 atomic_long_read(&lrugen->evicted[hist][type][i]); in read_ctrl_pos()
3178 struct lru_gen_folio *lrugen = &lruvec->lrugen; in reset_ctrl_pos()
3180 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; in reset_ctrl_pos()
3182 lockdep_assert_held(&lruvec->lru_lock); in reset_ctrl_pos()
3193 sum = lrugen->avg_refaulted[type][tier] + in reset_ctrl_pos()
3194 atomic_long_read(&lrugen->refaulted[hist][type][tier]); in reset_ctrl_pos()
3195 WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); in reset_ctrl_pos()
3197 sum = lrugen->avg_total[type][tier] + in reset_ctrl_pos()
3198 lrugen->protected[hist][type][tier] + in reset_ctrl_pos()
3199 atomic_long_read(&lrugen->evicted[hist][type][tier]); in reset_ctrl_pos()
3200 WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); in reset_ctrl_pos()
3204 atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); in reset_ctrl_pos()
3205 atomic_long_set(&lrugen->evicted[hist][type][tier], 0); in reset_ctrl_pos()
3206 WRITE_ONCE(lrugen->protected[hist][type][tier], 0); in reset_ctrl_pos()
3217 return pv->refaulted < MIN_LRU_BATCH || in positive_ctrl_err()
3218 pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= in positive_ctrl_err()
3219 (sp->refaulted + 1) * pv->total * pv->gain; in positive_ctrl_err()
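
The comparison in positive_ctrl_err() above decides whether one position's refault ratio is no higher than the other's by cross-multiplying instead of dividing, with a batch-size floor to damp noise from tiny samples. A standalone sketch of that division-free check; the values and the min_batch parameter are made up, and the tuning is not the kernel's.

#include <stdbool.h>
#include <stdio.h>

struct ctrl_pos { unsigned long refaulted, total, gain; };

static bool ratio_not_higher(const struct ctrl_pos *sp, const struct ctrl_pos *pv,
			     unsigned long min_batch)
{
	/* roughly: pv->refaulted / pv->total <= sp->refaulted / sp->total */
	return pv->refaulted < min_batch ||
	       pv->refaulted * (sp->total + min_batch) * sp->gain <=
	       (sp->refaulted + 1) * pv->total * pv->gain;
}

int main(void)
{
	struct ctrl_pos sp = { .refaulted = 50, .total = 1000, .gain = 1 };
	struct ctrl_pos pv = { .refaulted = 90, .total = 1000, .gain = 1 };

	printf("%d\n", ratio_not_higher(&sp, &pv, 64));	/* 0: pv refaults at a higher rate */
	return 0;
}
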
3227 static int folio_update_gen(struct folio *folio, int gen) in folio_update_gen() argument
3229 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); in folio_update_gen()
3231 VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); in folio_update_gen()
3235 set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); in folio_update_gen()
3236 return -1; in folio_update_gen()
3242 return -1; in folio_update_gen()
3245 new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); in folio_update_gen()
3246 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); in folio_update_gen()
3248 return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; in folio_update_gen()
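
folio_update_gen() above stores a small generation field (offset by one, so zero means "not set") inside the atomic flags word with a compare-exchange retry loop and returns the previous value. A minimal sketch of that pattern under an invented bit layout; GEN_PGOFF, GEN_MASK and update_gen() are illustrative, not the kernel's LRU_GEN_* definitions.

#include <stdatomic.h>
#include <stdio.h>

#define GEN_PGOFF	8
#define GEN_MASK	(0xFUL << GEN_PGOFF)

static _Atomic unsigned long flags;

static int update_gen(int gen)
{
	unsigned long new_flags, old_flags = atomic_load(&flags);

	do {
		new_flags = (old_flags & ~GEN_MASK) | ((gen + 1UL) << GEN_PGOFF);
	} while (!atomic_compare_exchange_weak(&flags, &old_flags, new_flags));

	return (int)((old_flags & GEN_MASK) >> GEN_PGOFF) - 1;	/* previous gen, or -1 */
}

int main(void)
{
	int prev0 = update_gen(2);	/* nothing stored yet: -1 */
	int prev1 = update_gen(3);	/* previous generation was 2 */

	printf("%d %d\n", prev0, prev1);
	return 0;
}
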
3255 struct lru_gen_folio *lrugen = &lruvec->lrugen; in folio_inc_gen()
3256 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); in folio_inc_gen()
3257 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); in folio_inc_gen()
3262 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; in folio_inc_gen()
3274 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); in folio_inc_gen()
3291 walk->batched++; in update_batch_size()
3293 walk->nr_pages[old_gen][type][zone] -= delta; in update_batch_size()
3294 walk->nr_pages[new_gen][type][zone] += delta; in update_batch_size()
3299 int gen, type, zone; in reset_batch_size() local
3300 struct lruvec *lruvec = walk->lruvec; in reset_batch_size()
3301 struct lru_gen_folio *lrugen = &lruvec->lrugen; in reset_batch_size()
3303 walk->batched = 0; in reset_batch_size()
3305 for_each_gen_type_zone(gen, type, zone) { in reset_batch_size()
3307 int delta = walk->nr_pages[gen][type][zone]; in reset_batch_size()
3312 walk->nr_pages[gen][type][zone] = 0; in reset_batch_size()
3313 WRITE_ONCE(lrugen->nr_pages[gen][type][zone], in reset_batch_size()
3314 lrugen->nr_pages[gen][type][zone] + delta); in reset_batch_size()
3316 if (lru_gen_is_active(lruvec, gen)) in reset_batch_size()
3325 struct vm_area_struct *vma = args->vma; in should_skip_vma()
3326 struct lru_gen_mm_walk *walk = args->private; in should_skip_vma()
3337 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) in should_skip_vma()
3340 if (vma == get_gate_vma(vma->vm_mm)) in should_skip_vma()
3344 return !walk->swappiness; in should_skip_vma()
3346 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) in should_skip_vma()
3349 mapping = vma->vm_file->f_mapping; in should_skip_vma()
3354 return !walk->swappiness; in should_skip_vma()
3356 if (walk->swappiness > MAX_SWAPPINESS) in should_skip_vma()
3360 return !mapping->a_ops->read_folio; in should_skip_vma()
3364 * Some userspace memory allocators map many single-page VMAs. Instead of
3373 VMA_ITERATOR(vmi, args->mm, start); in get_next_vma()
3378 for_each_vma(vmi, args->vma) { in get_next_vma()
3379 if (end && end <= args->vma->vm_start) in get_next_vma()
3382 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) in get_next_vma()
3385 *vm_start = max(start, args->vma->vm_start); in get_next_vma()
3386 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; in get_next_vma()
3399 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); in get_pte_pfn()
3402 return -1; in get_pte_pfn()
3405 return -1; in get_pte_pfn()
3407 if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) in get_pte_pfn()
3408 return -1; in get_pte_pfn()
3411 return -1; in get_pte_pfn()
3413 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) in get_pte_pfn()
3414 return -1; in get_pte_pfn()
3424 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); in get_pmd_pfn()
3427 return -1; in get_pmd_pfn()
3430 return -1; in get_pmd_pfn()
3432 if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) in get_pmd_pfn()
3433 return -1; in get_pmd_pfn()
3436 return -1; in get_pmd_pfn()
3438 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) in get_pmd_pfn()
3439 return -1; in get_pmd_pfn()
3452 if (folio_nid(folio) != pgdat->node_id) in get_pfn_folio()
3504 struct lru_gen_mm_walk *walk = args->private; in walk_pte_range()
3505 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); in walk_pte_range()
3506 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in walk_pte_range()
3507 DEFINE_MAX_SEQ(walk->lruvec); in walk_pte_range()
3508 int gen = lru_gen_from_seq(max_seq); in walk_pte_range() local
3511 pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); in walk_pte_range()
3533 walk->mm_stats[MM_LEAF_TOTAL]++; in walk_pte_range()
3535 pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); in walk_pte_range()
3536 if (pfn == -1) in walk_pte_range()
3543 if (!ptep_clear_young_notify(args->vma, addr, pte + i)) in walk_pte_range()
3547 walk_update_folio(walk, last, gen, dirty); in walk_pte_range()
3557 walk->mm_stats[MM_LEAF_YOUNG]++; in walk_pte_range()
3560 walk_update_folio(walk, last, gen, dirty); in walk_pte_range()
3580 struct lru_gen_mm_walk *walk = args->private; in walk_pmd_range_locked()
3581 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); in walk_pmd_range_locked()
3582 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in walk_pmd_range_locked()
3583 DEFINE_MAX_SEQ(walk->lruvec); in walk_pmd_range_locked()
3584 int gen = lru_gen_from_seq(max_seq); in walk_pmd_range_locked() local
3589 if (*first == -1) { in walk_pmd_range_locked()
3595 i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); in walk_pmd_range_locked()
3597 __set_bit(i - 1, bitmap); in walk_pmd_range_locked()
3603 ptl = pmd_lockptr(args->mm, pmd); in walk_pmd_range_locked()
3620 if (!walk->force_scan && should_clear_pmd_young() && in walk_pmd_range_locked()
3621 !mm_has_notifiers(args->mm)) in walk_pmd_range_locked()
3627 if (pfn == -1) in walk_pmd_range_locked()
3638 walk_update_folio(walk, last, gen, dirty); in walk_pmd_range_locked()
3647 walk->mm_stats[MM_LEAF_YOUNG]++; in walk_pmd_range_locked()
3652 walk_update_folio(walk, last, gen, dirty); in walk_pmd_range_locked()
3657 *first = -1; in walk_pmd_range_locked()
3669 unsigned long first = -1; in walk_pmd_range()
3670 struct lru_gen_mm_walk *walk = args->private; in walk_pmd_range()
3671 struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); in walk_pmd_range()
3683 vma = args->vma; in walk_pmd_range()
3690 walk->mm_stats[MM_LEAF_TOTAL]++; in walk_pmd_range()
3695 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in walk_pmd_range()
3698 walk->mm_stats[MM_LEAF_TOTAL]++; in walk_pmd_range()
3700 if (pfn != -1) in walk_pmd_range()
3705 if (!walk->force_scan && should_clear_pmd_young() && in walk_pmd_range()
3706 !mm_has_notifiers(args->mm)) { in walk_pmd_range()
3713 if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) in walk_pmd_range()
3716 walk->mm_stats[MM_NONLEAF_FOUND]++; in walk_pmd_range()
3721 walk->mm_stats[MM_NONLEAF_ADDED]++; in walk_pmd_range()
3724 update_bloom_filter(mm_state, walk->seq + 1, pmd + i); in walk_pmd_range()
3727 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); in walk_pmd_range()
3740 struct lru_gen_mm_walk *walk = args->private; in walk_pud_range()
3756 if (need_resched() || walk->batched >= MAX_LRU_BATCH) { in walk_pud_range()
3767 if (!end || !args->vma) in walk_pud_range()
3770 walk->next_addr = max(end, args->vma->vm_start); in walk_pud_range()
3772 return -EAGAIN; in walk_pud_range()
3783 struct lruvec *lruvec = walk->lruvec; in walk_mm()
3785 walk->next_addr = FIRST_USER_ADDRESS; in walk_mm()
3790 err = -EBUSY; in walk_mm()
3793 if (walk->seq != max_seq) in walk_mm()
3798 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); in walk_mm()
3803 if (walk->batched) { in walk_mm()
3804 spin_lock_irq(&lruvec->lru_lock); in walk_mm()
3806 spin_unlock_irq(&lruvec->lru_lock); in walk_mm()
3810 } while (err == -EAGAIN); in walk_mm()
3815 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; in set_mm_walk()
3820 walk = &pgdat->mm_walk; in set_mm_walk()
3827 current->reclaim_state->mm_walk = walk; in set_mm_walk()
3834 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; in clear_mm_walk()
3836 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); in clear_mm_walk()
3837 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); in clear_mm_walk()
3839 current->reclaim_state->mm_walk = NULL; in clear_mm_walk()
3849 struct lru_gen_folio *lrugen = &lruvec->lrugen; in inc_min_seq()
3850 int hist = lru_hist_from_seq(lrugen->min_seq[type]); in inc_min_seq()
3851 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); in inc_min_seq()
3858 struct list_head *head = &lrugen->folios[old_gen][type][zone]; in inc_min_seq()
3871 list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); in inc_min_seq()
3878 WRITE_ONCE(lrugen->protected[hist][type][tier], in inc_min_seq()
3879 lrugen->protected[hist][type][tier] + delta); in inc_min_seq()
3882 if (!--remaining) in inc_min_seq()
3888 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); in inc_min_seq()
3895 int gen, type, zone; in try_to_inc_min_seq() local
3897 struct lru_gen_folio *lrugen = &lruvec->lrugen; in try_to_inc_min_seq()
3904 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { in try_to_inc_min_seq()
3905 gen = lru_gen_from_seq(min_seq[type]); in try_to_inc_min_seq()
3908 if (!list_empty(&lrugen->folios[gen][type][zone])) in try_to_inc_min_seq()
3920 unsigned long seq = lrugen->max_seq - MIN_NR_GENS; in try_to_inc_min_seq()
3929 if (min_seq[type] <= lrugen->min_seq[type]) in try_to_inc_min_seq()
3933 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); in try_to_inc_min_seq()
3945 struct lru_gen_folio *lrugen = &lruvec->lrugen; in inc_max_seq()
3947 if (seq < READ_ONCE(lrugen->max_seq)) in inc_max_seq()
3950 spin_lock_irq(&lruvec->lru_lock); in inc_max_seq()
3954 success = seq == lrugen->max_seq; in inc_max_seq()
3965 spin_unlock_irq(&lruvec->lru_lock); in inc_max_seq()
3976 prev = lru_gen_from_seq(lrugen->max_seq - 1); in inc_max_seq()
3977 next = lru_gen_from_seq(lrugen->max_seq + 1); in inc_max_seq()
3982 long delta = lrugen->nr_pages[prev][type][zone] - in inc_max_seq()
3983 lrugen->nr_pages[next][type][zone]; in inc_max_seq()
3989 __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); in inc_max_seq()
3996 WRITE_ONCE(lrugen->timestamps[next], jiffies); in inc_max_seq()
3998 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); in inc_max_seq()
4000 spin_unlock_irq(&lruvec->lru_lock); in inc_max_seq()
4011 struct lru_gen_folio *lrugen = &lruvec->lrugen; in try_to_inc_max_seq()
4014 VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); in try_to_inc_max_seq()
4020 if (seq <= READ_ONCE(mm_state->seq)) in try_to_inc_max_seq()
4040 walk->lruvec = lruvec; in try_to_inc_max_seq()
4041 walk->seq = seq; in try_to_inc_max_seq()
4042 walk->swappiness = swappiness; in try_to_inc_max_seq()
4043 walk->force_scan = force_scan; in try_to_inc_max_seq()
4068 if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) in set_initial_priority()
4076 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) in set_initial_priority()
4079 /* round down reclaimable and round up sc->nr_to_reclaim */ in set_initial_priority()
4080 priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); in set_initial_priority()
4086 sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); in set_initial_priority()
4091 int gen, type, zone; in lruvec_is_sizable() local
4094 struct lru_gen_folio *lrugen = &lruvec->lrugen; in lruvec_is_sizable()
4103 gen = lru_gen_from_seq(seq); in lruvec_is_sizable()
4106 total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); in lruvec_is_sizable()
4111 return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; in lruvec_is_sizable()
4117 int gen; in lruvec_is_reclaimable() local
4129 gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); in lruvec_is_reclaimable()
4130 birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); in lruvec_is_reclaimable()
4165 .gfp_mask = sc->gfp_mask, in lru_gen_age_node()
4194 pte_t *pte = pvmw->pte; in lru_gen_look_around()
4195 unsigned long addr = pvmw->address; in lru_gen_look_around()
4196 struct vm_area_struct *vma = pvmw->vma; in lru_gen_look_around()
4197 struct folio *folio = pfn_folio(pvmw->pfn); in lru_gen_look_around()
4203 int gen = lru_gen_from_seq(max_seq); in lru_gen_look_around() local
4205 lockdep_assert_held(pvmw->ptl); in lru_gen_look_around()
4211 if (spin_is_contended(pvmw->ptl)) in lru_gen_look_around()
4215 if (vma->vm_flags & VM_SPECIAL) in lru_gen_look_around()
4219 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; in lru_gen_look_around()
4221 start = max(addr & PMD_MASK, vma->vm_start); in lru_gen_look_around()
4222 end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; in lru_gen_look_around()
4224 if (end - start == PAGE_SIZE) in lru_gen_look_around()
4227 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { in lru_gen_look_around()
4228 if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) in lru_gen_look_around()
4230 else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) in lru_gen_look_around()
4231 start = end - MIN_LRU_BATCH * PAGE_SIZE; in lru_gen_look_around()
4233 start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; in lru_gen_look_around()
4240 pte -= (addr - start) / PAGE_SIZE; in lru_gen_look_around()
4247 if (pfn == -1) in lru_gen_look_around()
4258 walk_update_folio(walk, last, gen, dirty); in lru_gen_look_around()
4270 walk_update_folio(walk, last, gen, dirty); in lru_gen_look_around()
4276 update_bloom_filter(mm_state, max_seq, pvmw->pmd); in lru_gen_look_around()
4302 spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); in lru_gen_rotate_memcg()
4304 VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); in lru_gen_rotate_memcg()
4307 new = old = lruvec->lrugen.gen; in lru_gen_rotate_memcg()
4315 new = get_memcg_gen(pgdat->memcg_lru.seq); in lru_gen_rotate_memcg()
4317 new = get_memcg_gen(pgdat->memcg_lru.seq + 1); in lru_gen_rotate_memcg()
4321 WRITE_ONCE(lruvec->lrugen.seg, seg); in lru_gen_rotate_memcg()
4322 WRITE_ONCE(lruvec->lrugen.gen, new); in lru_gen_rotate_memcg()
4324 hlist_nulls_del_rcu(&lruvec->lrugen.list); in lru_gen_rotate_memcg()
4327 hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); in lru_gen_rotate_memcg()
4329 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); in lru_gen_rotate_memcg()
4331 pgdat->memcg_lru.nr_memcgs[old]--; in lru_gen_rotate_memcg()
4332 pgdat->memcg_lru.nr_memcgs[new]++; in lru_gen_rotate_memcg()
4334 if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) in lru_gen_rotate_memcg()
4335 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); in lru_gen_rotate_memcg()
4337 spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); in lru_gen_rotate_memcg()
4344 int gen; in lru_gen_online_memcg() local
4352 spin_lock_irq(&pgdat->memcg_lru.lock); in lru_gen_online_memcg()
4354 VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); in lru_gen_online_memcg()
4356 gen = get_memcg_gen(pgdat->memcg_lru.seq); in lru_gen_online_memcg()
4358 lruvec->lrugen.gen = gen; in lru_gen_online_memcg()
4360 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); in lru_gen_online_memcg()
4361 pgdat->memcg_lru.nr_memcgs[gen]++; in lru_gen_online_memcg()
4363 spin_unlock_irq(&pgdat->memcg_lru.lock); in lru_gen_online_memcg()
4380 int gen; in lru_gen_release_memcg() local
4387 spin_lock_irq(&pgdat->memcg_lru.lock); in lru_gen_release_memcg()
4389 if (hlist_nulls_unhashed(&lruvec->lrugen.list)) in lru_gen_release_memcg()
4392 gen = lruvec->lrugen.gen; in lru_gen_release_memcg()
4394 hlist_nulls_del_init_rcu(&lruvec->lrugen.list); in lru_gen_release_memcg()
4395 pgdat->memcg_lru.nr_memcgs[gen]--; in lru_gen_release_memcg()
4397 if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) in lru_gen_release_memcg()
4398 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); in lru_gen_release_memcg()
4400 spin_unlock_irq(&pgdat->memcg_lru.lock); in lru_gen_release_memcg()
4409 if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD) in lru_gen_soft_reclaim()
4424 int gen = folio_lru_gen(folio); in sort_folio() local
4431 struct lru_gen_folio *lrugen = &lruvec->lrugen; in sort_folio()
4433 VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); in sort_folio()
4446 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { in sort_folio()
4447 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); in sort_folio()
4453 gen = folio_inc_gen(lruvec, folio, false); in sort_folio()
4454 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); in sort_folio()
4458 int hist = lru_hist_from_seq(lrugen->min_seq[type]); in sort_folio()
4460 WRITE_ONCE(lrugen->protected[hist][type][tier], in sort_folio()
4461 lrugen->protected[hist][type][tier] + delta); in sort_folio()
4467 if (!folio_test_lru(folio) || zone > sc->reclaim_idx) { in sort_folio()
4468 gen = folio_inc_gen(lruvec, folio, false); in sort_folio()
4469 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); in sort_folio()
4476 sc->nr.file_taken += delta; in sort_folio()
4478 sc->nr.unqueued_dirty += delta; in sort_folio()
4483 gen = folio_inc_gen(lruvec, folio, true); in sort_folio()
4484 list_move(&folio->lru, &lrugen->folios[gen][type][zone]); in sort_folio()
4496 if (!(sc->gfp_mask & __GFP_IO) && in isolate_folio()
4513 set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); in isolate_folio()
4528 int gen; in scan_folios() local
4535 struct lru_gen_folio *lrugen = &lruvec->lrugen; in scan_folios()
4543 gen = lru_gen_from_seq(lrugen->min_seq[type]); in scan_folios()
4545 for (i = MAX_NR_ZONES; i > 0; i--) { in scan_folios()
4548 int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; in scan_folios()
4549 struct list_head *head = &lrugen->folios[gen][type][zone]; in scan_folios()
4565 list_add(&folio->lru, list); in scan_folios()
4568 list_move(&folio->lru, &moved); in scan_folios()
4572 if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) in scan_folios()
4594 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, in scan_folios()
4598 sc->nr.file_taken += isolated; in scan_folios()
4623 return tier - 1; in get_tier_idx()
4640 read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); in get_type_to_scan()
4680 struct lru_gen_folio *lrugen = &lruvec->lrugen; in evict_folios()
4684 spin_lock_irq(&lruvec->lru_lock); in evict_folios()
4690 if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) in evict_folios()
4693 spin_unlock_irq(&lruvec->lru_lock); in evict_folios()
4699 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; in evict_folios()
4700 sc->nr_reclaimed += reclaimed; in evict_folios()
4701 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, in evict_folios()
4702 scanned, reclaimed, &stat, sc->priority, in evict_folios()
4709 list_del(&folio->lru); in evict_folios()
4717 list_move(&folio->lru, &clean); in evict_folios()
4723 set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); in evict_folios()
4726 spin_lock_irq(&lruvec->lru_lock); in evict_folios()
4730 walk = current->reclaim_state->mm_walk; in evict_folios()
4731 if (walk && walk->batched) { in evict_folios()
4732 walk->lruvec = lruvec; in evict_folios()
4745 spin_unlock_irq(&lruvec->lru_lock); in evict_folios()
4760 int gen, type, zone; in should_run_aging() local
4762 struct lru_gen_folio *lrugen = &lruvec->lrugen; in should_run_aging()
4774 gen = lru_gen_from_seq(seq); in should_run_aging()
4777 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); in should_run_aging()
4798 if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) in get_nr_to_scan()
4799 return -1; in get_nr_to_scan()
4808 if (!success || sc->priority == DEF_PRIORITY) in get_nr_to_scan()
4809 return nr_to_scan >> sc->priority; in get_nr_to_scan()
4812 return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; in get_nr_to_scan()
4824 if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) in should_abort_scan()
4827 /* check the order to exclude compaction-induced reclaim */ in should_abort_scan()
4828 if (!current_is_kswapd() || sc->order) in should_abort_scan()
4834 for (i = 0; i <= sc->reclaim_idx; i++) { in should_abort_scan()
4835 struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; in should_abort_scan()
4838 if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) in should_abort_scan()
4877 if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) in try_to_shrink_lruvec()
4887 unsigned long scanned = sc->nr_scanned; in shrink_one()
4888 unsigned long reclaimed = sc->nr_reclaimed; in shrink_one()
4898 if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) in shrink_one()
4906 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); in shrink_one()
4908 if (!sc->proactive) in shrink_one()
4909 vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, in shrink_one()
4910 sc->nr_reclaimed - reclaimed); in shrink_one()
4921 return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ? in shrink_one()
4928 int gen; in shrink_many() local
4936 gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); in shrink_many()
4944 hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { in shrink_many()
4953 if (gen != READ_ONCE(lrugen->gen)) in shrink_many()
4986 if (gen != get_nulls_value(pos)) in shrink_many()
5000 VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); in lru_gen_shrink_lruvec()
5006 set_mm_walk(NULL, sc->proactive); in lru_gen_shrink_lruvec()
5019 unsigned long reclaimed = sc->nr_reclaimed; in lru_gen_shrink_node()
5028 if (!sc->may_writepage || !sc->may_unmap) in lru_gen_shrink_node()
5035 set_mm_walk(pgdat, sc->proactive); in lru_gen_shrink_node()
5040 sc->nr_reclaimed = 0; in lru_gen_shrink_node()
5043 shrink_one(&pgdat->__lruvec, sc); in lru_gen_shrink_node()
5048 sc->nr_reclaimed += reclaimed; in lru_gen_shrink_node()
5054 if (sc->nr_reclaimed > reclaimed) in lru_gen_shrink_node()
5055 pgdat->kswapd_failures = 0; in lru_gen_shrink_node()
5064 struct lru_gen_folio *lrugen = &lruvec->lrugen; in state_is_valid()
5066 if (lrugen->enabled) { in state_is_valid()
5070 if (!list_empty(&lruvec->lists[lru])) in state_is_valid()
5074 int gen, type, zone; in state_is_valid() local
5076 for_each_gen_type_zone(gen, type, zone) { in state_is_valid()
5077 if (!list_empty(&lrugen->folios[gen][type][zone])) in state_is_valid()
5093 struct list_head *head = &lruvec->lists[lru]; in fill_evictable()
5102 VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); in fill_evictable()
5108 if (!--remaining) in fill_evictable()
5118 int gen, type, zone; in drain_evictable() local
5121 for_each_gen_type_zone(gen, type, zone) { in drain_evictable()
5122 struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; in drain_evictable()
5137 if (!--remaining) in drain_evictable()
5171 spin_lock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5176 lruvec->lrugen.enabled = enabled; in lru_gen_change_state()
5179 spin_unlock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5181 spin_lock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5184 spin_unlock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5205 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5212 return -EINVAL; in min_ttl_ms_store()
5237 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5247 caps = -1; in enabled_store()
5249 return -EINVAL; in enabled_store()
5287 m->private = kvmalloc(PATH_MAX, GFP_KERNEL); in lru_gen_seq_start()
5288 if (!m->private) in lru_gen_seq_start()
5289 return ERR_PTR(-ENOMEM); in lru_gen_seq_start()
5296 if (!nr_to_skip--) in lru_gen_seq_start()
5309 kvfree(m->private); in lru_gen_seq_stop()
5310 m->private = NULL; in lru_gen_seq_stop()
5315 int nid = lruvec_pgdat(v)->node_id; in lru_gen_seq_next()
5339 struct lru_gen_folio *lrugen = &lruvec->lrugen; in lru_gen_seq_show_full()
5350 n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); in lru_gen_seq_show_full()
5351 n[1] = READ_ONCE(lrugen->avg_total[type][tier]); in lru_gen_seq_show_full()
5354 n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); in lru_gen_seq_show_full()
5355 n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); in lru_gen_seq_show_full()
5356 n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); in lru_gen_seq_show_full()
5375 n = READ_ONCE(mm_state->stats[hist][i]); in lru_gen_seq_show_full()
5378 n = READ_ONCE(mm_state->stats[hist][i]); in lru_gen_seq_show_full()
5386 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5390 bool full = !debugfs_real_fops(m->file)->write; in lru_gen_seq_show()
5392 struct lru_gen_folio *lrugen = &lruvec->lrugen; in lru_gen_seq_show()
5393 int nid = lruvec_pgdat(lruvec)->node_id; in lru_gen_seq_show()
5399 const char *path = memcg ? m->private : ""; in lru_gen_seq_show()
5403 cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); in lru_gen_seq_show()
5413 seq = max_seq - MAX_NR_GENS + 1; in lru_gen_seq_show()
5419 int gen = lru_gen_from_seq(seq); in lru_gen_seq_show() local
5420 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); in lru_gen_seq_show()
5422 seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); in lru_gen_seq_show()
5429 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); in lru_gen_seq_show()
5456 return -EINVAL; in run_aging()
5458 return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; in run_aging()
5467 return -EINVAL; in run_eviction()
5469 sc->nr_reclaimed = 0; in run_eviction()
5477 if (sc->nr_reclaimed >= nr_to_reclaim) in run_eviction()
5486 return -EINTR; in run_eviction()
5493 int err = -EINVAL; in run_cmd()
5497 return -EINVAL; in run_cmd()
5509 return -EINVAL; in run_cmd()
5526 case '-': in run_cmd()
5536 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5544 int err = -EINVAL; in lru_gen_seq_write()
5549 .reclaim_idx = MAX_NR_ZONES - 1, in lru_gen_seq_write()
5555 return -ENOMEM; in lru_gen_seq_write()
5559 return -EFAULT; in lru_gen_seq_write()
5566 err = -ENOMEM; in lru_gen_seq_write()
5580 unsigned int swappiness = -1; in lru_gen_seq_write()
5581 unsigned long opt = -1; in lru_gen_seq_write()
5590 err = -EINVAL; in lru_gen_seq_write()
5637 spin_lock_init(&pgdat->memcg_lru.lock); in lru_gen_init_pgdat()
5641 INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); in lru_gen_init_pgdat()
5648 int gen, type, zone; in lru_gen_init_lruvec() local
5649 struct lru_gen_folio *lrugen = &lruvec->lrugen; in lru_gen_init_lruvec()
5652 lrugen->max_seq = MIN_NR_GENS + 1; in lru_gen_init_lruvec()
5653 lrugen->enabled = lru_gen_enabled(); in lru_gen_init_lruvec()
5656 lrugen->timestamps[i] = jiffies; in lru_gen_init_lruvec()
5658 for_each_gen_type_zone(gen, type, zone) in lru_gen_init_lruvec()
5659 INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); in lru_gen_init_lruvec()
5662 mm_state->seq = MIN_NR_GENS; in lru_gen_init_lruvec()
5674 INIT_LIST_HEAD(&mm_list->fifo); in lru_gen_init_memcg()
5675 spin_lock_init(&mm_list->lock); in lru_gen_init_memcg()
5684 VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); in lru_gen_exit_memcg()
5690 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, in lru_gen_exit_memcg()
5691 sizeof(lruvec->lrugen.nr_pages))); in lru_gen_exit_memcg()
5693 lruvec->lrugen.list.next = LIST_POISON1; in lru_gen_exit_memcg()
5699 bitmap_free(mm_state->filters[i]); in lru_gen_exit_memcg()
5700 mm_state->filters[i] = NULL; in lru_gen_exit_memcg()
5748 unsigned long nr_to_reclaim = sc->nr_to_reclaim; in shrink_lruvec()
5774 sc->priority == DEF_PRIORITY); in shrink_lruvec()
5785 nr[lru] -= nr_to_scan; in shrink_lruvec()
5837 nr_scanned = targets[lru] - nr[lru]; in shrink_lruvec()
5838 nr[lru] = targets[lru] * (100 - percentage) / 100; in shrink_lruvec()
5839 nr[lru] -= min(nr[lru], nr_scanned); in shrink_lruvec()
5842 nr_scanned = targets[lru] - nr[lru]; in shrink_lruvec()
5843 nr[lru] = targets[lru] * (100 - percentage) / 100; in shrink_lruvec()
5844 nr[lru] -= min(nr[lru], nr_scanned); in shrink_lruvec()
5847 sc->nr_reclaimed += nr_reclaimed; in shrink_lruvec()
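
A worked example of the rescaling shown just above (the same two statements appear for each remaining LRU): once one list reaches its target, the other lists' targets are cut to the same remaining percentage and credited for what was already scanned. The numbers are made up.

#include <stdio.h>

int main(void)
{
	unsigned long target = 400, nr = 250, percentage = 40;
	unsigned long nr_scanned = target - nr;		/* 150 scanned so far */

	nr = target * (100 - percentage) / 100;		/* 240: scaled-down goal */
	nr -= nr_scanned < nr ? nr_scanned : nr;	/* 90 still to scan */
	printf("%lu\n", nr);
	return 0;
}
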
5862 if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && in in_reclaim_compaction()
5863 (sc->order > PAGE_ALLOC_COSTLY_ORDER || in in_reclaim_compaction()
5864 sc->priority < DEF_PRIORITY - 2)) in in_reclaim_compaction()
5871 * Reclaim/compaction is used for high-order allocation requests. It reclaims
5872 * order-0 pages before compacting the zone. should_continue_reclaim() returns
5896 * first, by assuming that zero delta of sc->nr_scanned means full LRU in should_continue_reclaim()
5898 * where always a non-zero amount of pages were scanned. in should_continue_reclaim()
5904 for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { in should_continue_reclaim()
5908 if (zone_watermark_ok(zone, sc->order, watermark, in should_continue_reclaim()
5909 sc->reclaim_idx, 0)) in should_continue_reclaim()
5912 if (compaction_suitable(zone, sc->order, watermark, in should_continue_reclaim()
5913 sc->reclaim_idx)) in should_continue_reclaim()
5921 pages_for_compaction = compact_gap(sc->order); in should_continue_reclaim()
5923 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) in should_continue_reclaim()
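
For context on the compact_gap() call above: the gap is conventionally on the order of twice the request size (roughly 2UL << order), so reclaim leaves compaction some free-page headroom rather than stopping at exactly the allocation size. A quick worked example under that assumption, with the usual 4 KiB page size:

#include <stdio.h>

int main(void)
{
	unsigned int order = 9;				/* 2 MiB THP with 4 KiB pages */
	unsigned long gap_pages = 2UL << order;		/* assumed compact_gap() shape */

	printf("order-%u request: keep ~%lu pages (%lu KiB) for compaction\n",
	       order, gap_pages, gap_pages * 4);
	return 0;
}
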
5931 struct mem_cgroup *target_memcg = sc->target_mem_cgroup; in shrink_node_memcgs()
5947 if (current_is_kswapd() || sc->memcg_full_walk) in shrink_node_memcgs()
5957 * This loop can become CPU-bound when target memcgs in shrink_node_memcgs()
5958 * aren't eligible for reclaim - either because they in shrink_node_memcgs()
5979 if (!sc->memcg_low_reclaim) { in shrink_node_memcgs()
5980 sc->memcg_low_skipped = 1; in shrink_node_memcgs()
5986 reclaimed = sc->nr_reclaimed; in shrink_node_memcgs()
5987 scanned = sc->nr_scanned; in shrink_node_memcgs()
5991 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, in shrink_node_memcgs()
5992 sc->priority); in shrink_node_memcgs()
5995 if (!sc->proactive) in shrink_node_memcgs()
5996 vmpressure(sc->gfp_mask, memcg, false, in shrink_node_memcgs()
5997 sc->nr_scanned - scanned, in shrink_node_memcgs()
5998 sc->nr_reclaimed - reclaimed); in shrink_node_memcgs()
6001 if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) { in shrink_node_memcgs()
6015 memset(&sc->nr, 0, sizeof(sc->nr)); in shrink_node()
6020 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); in shrink_node()
6023 memset(&sc->nr, 0, sizeof(sc->nr)); in shrink_node()
6025 nr_reclaimed = sc->nr_reclaimed; in shrink_node()
6026 nr_scanned = sc->nr_scanned; in shrink_node()
6034 nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed; in shrink_node()
6037 if (!sc->proactive) in shrink_node()
6038 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, in shrink_node()
6039 sc->nr_scanned - nr_scanned, nr_node_reclaimed); in shrink_node()
6047 * it implies that the long-lived page allocation rate in shrink_node()
6062 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) in shrink_node()
6063 set_bit(PGDAT_WRITEBACK, &pgdat->flags); in shrink_node()
6066 if (sc->nr.unqueued_dirty && in shrink_node()
6067 sc->nr.unqueued_dirty == sc->nr.file_taken) in shrink_node()
6068 set_bit(PGDAT_DIRTY, &pgdat->flags); in shrink_node()
6077 if (sc->nr.immediate) in shrink_node()
6088 if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { in shrink_node()
6090 set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); in shrink_node()
6093 set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); in shrink_node()
6103 !sc->hibernation_mode && in shrink_node()
6104 (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || in shrink_node()
6105 test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) in shrink_node()
6118 pgdat->kswapd_failures = 0; in shrink_node()
6119 else if (sc->cache_trim_mode) in shrink_node()
6120 sc->cache_trim_mode_failed = 1; in shrink_node()
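The end of shrink_node() above turns "everything we isolated was dirty or under writeback" style observations into node and lruvec flags. The same checks expressed as a pure function over the per-call counters, with flag bits local to this sketch:

/* Sketch of the pressure classification above as a pure function. */
#define F_WRITEBACK  (1u << 0)  /* every taken page was under writeback */
#define F_DIRTY      (1u << 1)  /* every taken file page was unqueued dirty */
#define F_CONGESTED  (1u << 2)  /* every dirty page was also congested */

struct reclaim_counters {
        unsigned long taken;
        unsigned long file_taken;
        unsigned long writeback;
        unsigned long unqueued_dirty;
        unsigned long dirty;
        unsigned long congested;
};

unsigned int classify_pressure(const struct reclaim_counters *nr)
{
        unsigned int flags = 0;

        if (nr->writeback && nr->writeback == nr->taken)
                flags |= F_WRITEBACK;

        if (nr->unqueued_dirty && nr->unqueued_dirty == nr->file_taken)
                flags |= F_DIRTY;

        if (nr->dirty && nr->dirty == nr->congested)
                flags |= F_CONGESTED;

        return flags;
}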
6124 * Returns true if compaction should go ahead for a costly-order request, or
6132 if (!gfp_compaction_allowed(sc->gfp_mask)) in compaction_ready()
6136 if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), in compaction_ready()
6137 sc->reclaim_idx, 0)) in compaction_ready()
6151 if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) in compaction_ready()
6163 if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { in consider_reclaim_throttle()
6166 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; in consider_reclaim_throttle()
6183 if (sc->priority == 1 && !sc->nr_reclaimed) in consider_reclaim_throttle()
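consider_reclaim_throttle() boils down to the two comparisons visible above: a reclaimed-to-scanned ratio better than 1:8 counts as progress and wakes throttled waiters, while a priority-1 pass that reclaimed nothing throttles the caller. A compact sketch, with wake_waiters()/throttle_caller() standing in for the waitqueue calls:

/* The throttling decision above, stripped to its arithmetic. */
#include <stdio.h>

static void wake_waiters(void)    { puts("wake VMSCAN_THROTTLE_NOPROGRESS waiters"); }
static void throttle_caller(void) { puts("throttle caller: no progress"); }

void consider_throttle_sketch(unsigned long nr_reclaimed,
                              unsigned long nr_scanned, int priority)
{
        /* Better than a 1:8 reclaimed-to-scanned ratio: call it progress. */
        if (nr_reclaimed > (nr_scanned >> 3)) {
                wake_waiters();
                return;
        }

        /* Near the final priority with nothing reclaimed: back off. */
        if (priority == 1 && !nr_reclaimed)
                throttle_caller();
}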
6188 * This is the direct reclaim path, for page-allocating processes. We only
6210 orig_mask = sc->gfp_mask; in shrink_zones()
6212 sc->gfp_mask |= __GFP_HIGHMEM; in shrink_zones()
6213 sc->reclaim_idx = gfp_zone(sc->gfp_mask); in shrink_zones()
6217 sc->reclaim_idx, sc->nodemask) { in shrink_zones()
6231 * non-zero order, only frequent costly order in shrink_zones()
6237 sc->order > PAGE_ALLOC_COSTLY_ORDER && in shrink_zones()
6239 sc->compaction_ready = true; in shrink_zones()
6249 if (zone->zone_pgdat == last_pgdat) in shrink_zones()
6259 nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat, in shrink_zones()
6260 sc->order, sc->gfp_mask, in shrink_zones()
6262 sc->nr_reclaimed += nr_soft_reclaimed; in shrink_zones()
6263 sc->nr_scanned += nr_soft_scanned; in shrink_zones()
6268 first_pgdat = zone->zone_pgdat; in shrink_zones()
6271 if (zone->zone_pgdat == last_pgdat) in shrink_zones()
6273 last_pgdat = zone->zone_pgdat; in shrink_zones()
6274 shrink_node(zone->zone_pgdat, sc); in shrink_zones()
6284 sc->gfp_mask = orig_mask; in shrink_zones()
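shrink_zones() above walks a zonelist but only calls shrink_node() once per NUMA node, using a "last pgdat seen" check. The dedup pattern on its own, assuming zones of the same node appear consecutively in the walk, as they do in the kernel's node-ordered zonelists:

/* Per-node dedup during a zonelist walk, as in the fragment above. */
#include <stdio.h>

struct toy_zone {
        int node_id;
};

static void shrink_node_once(int node_id)
{
        printf("shrink node %d\n", node_id);
}

void walk_zonelist(const struct toy_zone *zones, int nr_zones)
{
        int last_node = -1;

        for (int i = 0; i < nr_zones; i++) {
                if (zones[i].node_id == last_node)
                        continue;       /* this node was already shrunk */
                last_node = zones[i].node_id;
                shrink_node_once(zones[i].node_id);
        }
}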
6297 target_lruvec->refaults[WORKINGSET_ANON] = refaults; in snapshot_refaults()
6299 target_lruvec->refaults[WORKINGSET_FILE] = refaults; in snapshot_refaults()
6309 * high - the zone may be full of dirty or under-writeback pages, which this
6321 int initial_priority = sc->priority; in do_try_to_free_pages()
6329 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); in do_try_to_free_pages()
6332 if (!sc->proactive) in do_try_to_free_pages()
6333 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, in do_try_to_free_pages()
6334 sc->priority); in do_try_to_free_pages()
6335 sc->nr_scanned = 0; in do_try_to_free_pages()
6338 if (sc->nr_reclaimed >= sc->nr_to_reclaim) in do_try_to_free_pages()
6341 if (sc->compaction_ready) in do_try_to_free_pages()
6348 if (sc->priority < DEF_PRIORITY - 2) in do_try_to_free_pages()
6349 sc->may_writepage = 1; in do_try_to_free_pages()
6350 } while (--sc->priority >= 0); in do_try_to_free_pages()
6353 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, in do_try_to_free_pages()
6354 sc->nodemask) { in do_try_to_free_pages()
6355 if (zone->zone_pgdat == last_pgdat) in do_try_to_free_pages()
6357 last_pgdat = zone->zone_pgdat; in do_try_to_free_pages()
6359 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); in do_try_to_free_pages()
6364 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, in do_try_to_free_pages()
6365 zone->zone_pgdat); in do_try_to_free_pages()
6366 clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); in do_try_to_free_pages()
6372 if (sc->nr_reclaimed) in do_try_to_free_pages()
6373 return sc->nr_reclaimed; in do_try_to_free_pages()
6376 if (sc->compaction_ready) in do_try_to_free_pages()
6388 if (!sc->memcg_full_walk) { in do_try_to_free_pages()
6389 sc->priority = initial_priority; in do_try_to_free_pages()
6390 sc->memcg_full_walk = 1; in do_try_to_free_pages()
6403 if (sc->skipped_deactivate) { in do_try_to_free_pages()
6404 sc->priority = initial_priority; in do_try_to_free_pages()
6405 sc->force_deactivate = 1; in do_try_to_free_pages()
6406 sc->skipped_deactivate = 0; in do_try_to_free_pages()
6411 if (sc->memcg_low_skipped) { in do_try_to_free_pages()
6412 sc->priority = initial_priority; in do_try_to_free_pages()
6413 sc->force_deactivate = 0; in do_try_to_free_pages()
6414 sc->memcg_low_reclaim = 1; in do_try_to_free_pages()
6415 sc->memcg_low_skipped = 0; in do_try_to_free_pages()
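do_try_to_free_pages() above is a priority loop wrapped in a retry ladder: if a full pass down to priority 0 reclaims nothing, it retries with a full memcg walk, then with forced deactivation, then allowing reclaim below memory.low. A skeleton of that ladder, with shrink_once() as a stub for the zonelist shrink and field names mirroring the scan_control members in the fragment:

/* Skeleton of the retry ladder above; compaction-ready and vmpressure
 * handling are omitted for brevity. */
#include <stdbool.h>

struct toy_scan_control {
        int priority;
        unsigned long nr_reclaimed;
        unsigned long nr_to_reclaim;
        bool memcg_full_walk;
        bool force_deactivate;
        bool skipped_deactivate;
        bool memcg_low_reclaim;
        bool memcg_low_skipped;
};

static void shrink_once(struct toy_scan_control *sc)
{
        /* Stub: pretend each pass reclaims nothing so the retries fire. */
        (void)sc;
}

unsigned long try_to_free_pages_sketch(struct toy_scan_control *sc)
{
        int initial_priority = sc->priority;

retry:
        do {
                shrink_once(sc);
                if (sc->nr_reclaimed >= sc->nr_to_reclaim)
                        break;
        } while (--sc->priority >= 0);

        if (sc->nr_reclaimed)
                return sc->nr_reclaimed;

        /* Nothing reclaimed: widen the memcg walk first ... */
        if (!sc->memcg_full_walk) {
                sc->priority = initial_priority;
                sc->memcg_full_walk = true;
                goto retry;
        }

        /* ... then stop skipping deactivation of active lists ... */
        if (sc->skipped_deactivate) {
                sc->priority = initial_priority;
                sc->force_deactivate = true;
                sc->skipped_deactivate = false;
                goto retry;
        }

        /* ... and finally dip below memory.low protection. */
        if (sc->memcg_low_skipped) {
                sc->priority = initial_priority;
                sc->force_deactivate = false;
                sc->memcg_low_reclaim = true;
                sc->memcg_low_skipped = false;
                goto retry;
        }

        return 0;
}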
6430 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) in allow_direct_reclaim()
6448 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { in allow_direct_reclaim()
6449 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) in allow_direct_reclaim()
6450 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); in allow_direct_reclaim()
6452 wake_up_interruptible(&pgdat->kswapd_wait); in allow_direct_reclaim()
6481 if (current->flags & PF_KTHREAD) in throttle_direct_reclaim()
6511 pgdat = zone->zone_pgdat; in throttle_direct_reclaim()
6527 * transaction in the case of a filesystem like ext[3|4]. In this case, in throttle_direct_reclaim()
6533 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, in throttle_direct_reclaim()
6537 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, in throttle_direct_reclaim()
6604 .reclaim_idx = MAX_NR_ZONES - 1, in mem_cgroup_shrink_node()
6608 WARN_ON_ONCE(!current->reclaim_state); in mem_cgroup_shrink_node()
6645 .reclaim_idx = MAX_NR_ZONES - 1, in try_to_free_mem_cgroup_pages()
6706 * Check for watermark boosts top-down as the higher zones in pgdat_watermark_boosted()
6712 for (i = highest_zoneidx; i >= 0; i--) { in pgdat_watermark_boosted()
6713 zone = pgdat->node_zones + i; in pgdat_watermark_boosted()
6717 if (zone->watermark_boost) in pgdat_watermark_boosted()
6731 unsigned long mark = -1; in pgdat_balanced()
6735 * Check watermarks bottom-up as lower zones are more likely to in pgdat_balanced()
6764 * the cumulative error from the vmstat per-cpu cache in pgdat_balanced()
6771 * counter won't actually be per-cpu cached. But keep in pgdat_balanced()
6775 if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) in pgdat_balanced()
6785 * need balancing by definition. This can happen if a zone-restricted in pgdat_balanced()
6788 if (mark == -1) in pgdat_balanced()
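pgdat_balanced() above walks the zones bottom-up and treats the node as balanced if any managed zone meets its high watermark for the requested order, or if no zone up to highest_zoneidx is managed at all (mark never leaves -1). A sketch of that shape, with a stand-in zone struct and the watermark check reduced to a plain comparison:

/* Shape of the pgdat_balanced() check above, with zone_watermark_ok_safe()
 * reduced to a free-pages comparison. */
#include <stdbool.h>

struct toy_balanced_zone {
        bool managed;
        unsigned long free_pages;
        unsigned long high_wmark;
};

bool pgdat_balanced_sketch(const struct toy_balanced_zone *zones,
                           int highest_zoneidx)
{
        unsigned long mark = -1UL;

        for (int i = 0; i <= highest_zoneidx; i++) {
                const struct toy_balanced_zone *zone = &zones[i];

                if (!zone->managed)
                        continue;

                mark = zone->high_wmark;
                if (zone->free_pages >= mark)
                        return true;
        }

        /* No managed zone at or below highest_zoneidx: nothing to balance. */
        if (mark == -1UL)
                return true;

        return false;
}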
6799 clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); in clear_pgdat_congested()
6800 clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); in clear_pgdat_congested()
6801 clear_bit(PGDAT_DIRTY, &pgdat->flags); in clear_pgdat_congested()
6802 clear_bit(PGDAT_WRITEBACK, &pgdat->flags); in clear_pgdat_congested()
6827 if (waitqueue_active(&pgdat->pfmemalloc_wait)) in prepare_kswapd_sleep()
6828 wake_up_all(&pgdat->pfmemalloc_wait); in prepare_kswapd_sleep()
6831 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) in prepare_kswapd_sleep()
6855 unsigned long nr_reclaimed = sc->nr_reclaimed; in kswapd_shrink_node()
6858 sc->nr_to_reclaim = 0; in kswapd_shrink_node()
6859 for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { in kswapd_shrink_node()
6860 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); in kswapd_shrink_node()
6871 * high-order allocations. If twice the allocation size has been in kswapd_shrink_node()
6872 * reclaimed then recheck watermarks only at order-0 to prevent in kswapd_shrink_node()
6873 * excessive reclaim. Assume that a process requested a high-order in kswapd_shrink_node()
6876 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) in kswapd_shrink_node()
6877 sc->order = 0; in kswapd_shrink_node()
6880 return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim; in kswapd_shrink_node()
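kswapd_shrink_node() above sets its reclaim target to the sum of the zones' high watermarks (never less than one SWAP_CLUSTER_MAX batch per zone) and downgrades a high-order request to order-0 once a compaction gap has been reclaimed. The bookkeeping in isolation; the SWAP_CLUSTER_MAX value of 32 and the 2 << order gap mirror current kernels but are assumptions of this sketch, and the cumulative versus per-pass reclaim totals are folded into one value:

/* The kswapd_shrink_node() bookkeeping above, in isolation. */
#include <stdbool.h>
#include <stddef.h>

#define SWAP_CLUSTER_MAX 32UL

unsigned long kswapd_target(const unsigned long *zone_high_wmarks, size_t nr_zones)
{
        unsigned long nr_to_reclaim = 0;

        for (size_t i = 0; i < nr_zones; i++) {
                unsigned long want = zone_high_wmarks[i];

                nr_to_reclaim += want > SWAP_CLUSTER_MAX ? want : SWAP_CLUSTER_MAX;
        }
        return nr_to_reclaim;
}

bool enough_progress(unsigned long nr_scanned, unsigned long nr_reclaimed,
                     unsigned long nr_to_reclaim, unsigned int *order)
{
        /* Enough reclaimed for compaction: recheck watermarks at order-0
         * from now on to avoid excessive reclaim. */
        if (*order && nr_reclaimed >= (2UL << *order))
                *order = 0;

        return (nr_scanned > nr_reclaimed ? nr_scanned : nr_reclaimed)
                >= nr_to_reclaim;
}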
6892 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); in update_reclaim_active()
6894 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); in update_reclaim_active()
6917 * kswapd scans the zones in the highmem->normal->dma direction. It skips
6952 nr_boost_reclaim += zone->watermark_boost; in balance_pgdat()
6953 zone_boosts[i] = zone->watermark_boost; in balance_pgdat()
6972 * purpose -- on 64-bit systems it is expected that in balance_pgdat()
6973 * buffer_heads are stripped during active rotation. On 32-bit in balance_pgdat()
6980 for (i = MAX_NR_ZONES - 1; i >= 0; i--) { in balance_pgdat()
6981 zone = pgdat->node_zones + i; in balance_pgdat()
6995 * re-evaluate if boosting is required when kswapd next wakes. in balance_pgdat()
7012 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) in balance_pgdat()
7017 * intent is to relieve pressure not issue sub-optimal IO in balance_pgdat()
7035 if (sc.priority < DEF_PRIORITY - 2) in balance_pgdat()
7058 if (waitqueue_active(&pgdat->pfmemalloc_wait) && in balance_pgdat()
7060 wake_up_all(&pgdat->pfmemalloc_wait); in balance_pgdat()
7073 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; in balance_pgdat()
7074 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); in balance_pgdat()
7085 sc.priority--; in balance_pgdat()
7099 pgdat->kswapd_failures++; in balance_pgdat()
7113 zone = pgdat->node_zones + i; in balance_pgdat()
7114 spin_lock_irqsave(&zone->lock, flags); in balance_pgdat()
7115 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); in balance_pgdat()
7116 spin_unlock_irqrestore(&zone->lock, flags); in balance_pgdat()
7141 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
7150 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); in kswapd_highest_zoneidx()
7164 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); in kswapd_try_to_sleep()
7196 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, in kswapd_try_to_sleep()
7200 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) in kswapd_try_to_sleep()
7201 WRITE_ONCE(pgdat->kswapd_order, reclaim_order); in kswapd_try_to_sleep()
7204 finish_wait(&pgdat->kswapd_wait, &wait); in kswapd_try_to_sleep()
7205 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); in kswapd_try_to_sleep()
7214 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); in kswapd_try_to_sleep()
7221 * per-cpu vmstat threshold while kswapd is awake and restore in kswapd_try_to_sleep()
7236 finish_wait(&pgdat->kswapd_wait, &wait); in kswapd_try_to_sleep()
7249 * If there are applications that are active memory-allocators
7255 unsigned int highest_zoneidx = MAX_NR_ZONES - 1; in kswapd()
7271 tsk->flags |= PF_MEMALLOC | PF_KSWAPD; in kswapd()
7274 WRITE_ONCE(pgdat->kswapd_order, 0); in kswapd()
7275 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); in kswapd()
7276 atomic_set(&pgdat->nr_writeback_throttled, 0); in kswapd()
7280 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); in kswapd()
7289 alloc_order = READ_ONCE(pgdat->kswapd_order); in kswapd()
7292 WRITE_ONCE(pgdat->kswapd_order, 0); in kswapd()
7293 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); in kswapd()
7306 * Reclaim begins at the requested order but if a high-order in kswapd()
7308 * order-0. If that happens, kswapd will consider sleeping in kswapd()
7313 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, in kswapd()
7321 tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); in kswapd()
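The kswapd wakeup handshake visible in kswapd_try_to_sleep(), kswapd() and wakeup_kswapd() above lets wakers only ever raise the pending order and highest zone index, which kswapd then snapshots and resets before each balancing run. A userspace model using C11 relaxed atomics in place of READ_ONCE/WRITE_ONCE; NO_ZONE_IDX plays the role of the MAX_NR_ZONES "no request" marker and the function names are mine:

/* Userspace model of the kswapd wakeup handshake above. */
#include <stdatomic.h>

#define NO_ZONE_IDX  128   /* stand-in for the MAX_NR_ZONES "no request" marker */

static atomic_uint pending_order;
static atomic_uint pending_zoneidx = NO_ZONE_IDX;

/* Called by allocators (the wakeup_kswapd() side). */
void post_request(unsigned int order, unsigned int zoneidx)
{
        unsigned int cur = atomic_load_explicit(&pending_zoneidx, memory_order_relaxed);

        /* Only ever raise the pending values; the check-then-store race is
         * as benign here as in the READ_ONCE/WRITE_ONCE original. */
        if (cur == NO_ZONE_IDX || cur < zoneidx)
                atomic_store_explicit(&pending_zoneidx, zoneidx, memory_order_relaxed);

        if (atomic_load_explicit(&pending_order, memory_order_relaxed) < order)
                atomic_store_explicit(&pending_order, order, memory_order_relaxed);
}

/* Called by the kswapd loop before a balancing run. */
void take_request(unsigned int *order, unsigned int *zoneidx,
                  unsigned int default_zoneidx)
{
        *order = atomic_load_explicit(&pending_order, memory_order_relaxed);
        *zoneidx = atomic_load_explicit(&pending_zoneidx, memory_order_relaxed);
        if (*zoneidx == NO_ZONE_IDX)
                *zoneidx = default_zoneidx;   /* keep the previous value if nobody asked */

        /* Reset so the next wakeup starts from a clean request. */
        atomic_store_explicit(&pending_order, 0, memory_order_relaxed);
        atomic_store_explicit(&pending_zoneidx, NO_ZONE_IDX, memory_order_relaxed);
}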
7327 * A zone is low on free memory or too fragmented for high-order memory. If
7345 pgdat = zone->zone_pgdat; in wakeup_kswapd()
7346 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); in wakeup_kswapd()
7349 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); in wakeup_kswapd()
7351 if (READ_ONCE(pgdat->kswapd_order) < order) in wakeup_kswapd()
7352 WRITE_ONCE(pgdat->kswapd_order, order); in wakeup_kswapd()
7354 if (!waitqueue_active(&pgdat->kswapd_wait)) in wakeup_kswapd()
7358 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || in wakeup_kswapd()
7363 * fragmented for high-order allocations. Wake up kcompactd in wakeup_kswapd()
7373 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, in wakeup_kswapd()
7375 wake_up_interruptible(&pgdat->kswapd_wait); in wakeup_kswapd()
7380 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
7392 .reclaim_idx = MAX_NR_ZONES - 1, in shrink_all_memory()
7418 * This kswapd start function will be called by init and node-hot-add.
7425 if (!pgdat->kswapd) { in kswapd_run()
7426 pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); in kswapd_run()
7427 if (IS_ERR(pgdat->kswapd)) { in kswapd_run()
7430 nid, PTR_ERR(pgdat->kswapd)); in kswapd_run()
7432 pgdat->kswapd = NULL; in kswapd_run()
7434 wake_up_process(pgdat->kswapd); in kswapd_run()
7450 kswapd = pgdat->kswapd; in kswapd_stop()
7453 pgdat->kswapd = NULL; in kswapd_stop()
7497 * If non-zero call node_reclaim when the number of free pages falls below
7532 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; in node_unmapped_file_pages()
7560 return nr_pagecache_reclaimable - delta; in node_pagecache_reclaimable()
7584 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, in __node_reclaim()
7597 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || in __node_reclaim()
7598 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { in __node_reclaim()
7605 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); in __node_reclaim()
7633 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && in node_reclaim()
7635 pgdat->min_slab_pages) in node_reclaim()
7641 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) in node_reclaim()
7650 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) in node_reclaim()
7653 if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) in node_reclaim()
7657 clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); in node_reclaim()
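node_reclaim() above serialises reclaimers per node with a test_and_set_bit_lock() on PGDAT_RECLAIM_LOCKED: a second caller simply reports no progress rather than piling on. The same gate sketched with a C11 atomic_flag; do_node_reclaim() is a stand-in for __node_reclaim():

/* "One reclaimer per node" gate, as at the end of node_reclaim() above. */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_flag node_reclaim_locked = ATOMIC_FLAG_INIT;

extern unsigned long do_node_reclaim(void);     /* stand-in for __node_reclaim() */

bool node_reclaim_sketch(unsigned long *reclaimed)
{
        /* Someone else is already reclaiming this node: bail out. */
        if (atomic_flag_test_and_set_explicit(&node_reclaim_locked,
                                              memory_order_acquire))
                return false;

        *reclaimed = do_node_reclaim();

        atomic_flag_clear_explicit(&node_reclaim_locked, memory_order_release);
        return *reclaimed > 0;
}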
7669 * check_move_unevictable_folios - Move evictable folios to appropriate zone
7684 for (i = 0; i < fbatch->nr; i++) { in check_move_unevictable_folios()
7685 struct folio *folio = fbatch->folios[i]; in check_move_unevictable_folios()