xref: /linux/mm/swap_table.h (revision 334fbe734e687404f346eba7d5d96ed2b44d35ab)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _MM_SWAP_TABLE_H
3 #define _MM_SWAP_TABLE_H
4 
5 #include <linux/rcupdate.h>
6 #include <linux/atomic.h>
7 #include "swap.h"
8 
/* A typical flat array in each cluster as swap table */
struct swap_table {
	/* One atomic entry per swap slot in the cluster; format documented below */
	atomic_long_t entries[SWAPFILE_CLUSTER];
};

/* True when one cluster's swap table is exactly one page in size */
#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
15 
16 /*
17  * A swap table entry represents the status of a swap slot on a swap
18  * (physical or virtual) device. The swap table in each cluster is a
19  * 1:1 map of the swap slots in this cluster.
20  *
21  * Swap table entry type and bits layouts:
22  *
23  * NULL:     |---------------- 0 ---------------| - Free slot
24  * Shadow:   | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot
25  * PFN:      | SWAP_COUNT |------ PFN -------|10| - Cached slot
26  * Pointer:  |----------- Pointer ----------|100| - (Unused)
27  * Bad:      |------------- 1 -------------|1000| - Bad slot
28  *
29  * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, each entry is an atomic long.
30  *
31  * Usages:
32  *
33  * - NULL: Swap slot is unused, could be allocated.
34  *
35  * - Shadow: Swap slot is used and not cached (usually swapped out). It reuses
36  *   the XA_VALUE format to be compatible with working set shadows. SHADOW_VAL
37  * part might be all 0 if the working set shadow info is absent. In such a case,
38  *   we still want to keep the shadow format as a placeholder.
39  *
40  *   Memcg ID is embedded in SHADOW_VAL.
41  *
42  * - PFN: Swap slot is in use, and cached. Memcg info is recorded on the page
43  *   struct.
44  *
45  * - Pointer: Unused yet. `0b100` is reserved for potential pointer usage
46  *   because only the lower three bits can be used as a marker for 8 bytes
47  *   aligned pointers.
48  *
49  * - Bad: Swap slot is reserved, protects swap header or holes on swap devices.
50  */
51 
/* Widest PFN the swap cache could ever need to store on this config */
#if defined(MAX_POSSIBLE_PHYSMEM_BITS)
#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
#elif defined(MAX_PHYSMEM_BITS)
#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
#else
#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT)
#endif

/* NULL Entry, all 0 */
#define SWP_TB_NULL		0UL

/* Swapped out: shadow (lowest bit set, same tag as XA_VALUE) */
#define SWP_TB_SHADOW_MARK	0b1UL

/* Cached: PFN shifted above a 0b10 marker in the low bits */
#define SWP_TB_PFN_BITS		(SWAP_CACHE_PFN_BITS + SWP_TB_PFN_MARK_BITS)
#define SWP_TB_PFN_MARK		0b10UL
#define SWP_TB_PFN_MARK_BITS	2
#define SWP_TB_PFN_MARK_MASK	(BIT(SWP_TB_PFN_MARK_BITS) - 1)

/* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended */
#define SWP_TB_COUNT_BITS      min(4, BITS_PER_LONG - SWP_TB_PFN_BITS)
#define SWP_TB_COUNT_MASK      (~((~0UL) >> SWP_TB_COUNT_BITS))
#define SWP_TB_COUNT_SHIFT     (BITS_PER_LONG - SWP_TB_COUNT_BITS)
#define SWP_TB_COUNT_MAX       ((1 << SWP_TB_COUNT_BITS) - 1)

/* Bad slot: ends with 0b1000 and the rest of the bits are all 1 */
#define SWP_TB_BAD		((~0UL) << 3)

/* Macro for shadow offset calculation */
#define SWAP_COUNT_SHIFT	SWP_TB_COUNT_BITS
83 
84 /*
85  * Helpers for casting one type of info into a swap table entry.
86  */
/* Return the table encoding of a free (NULL) slot: all bits zero. */
static inline unsigned long null_to_swp_tb(void)
{
	/* Entries are stored as atomic_long_t but manipulated as unsigned long */
	BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t));
	return 0;
}
92 
/* Encode a raw swap count into its in-entry position (the topmost bits). */
static inline unsigned long __count_to_swp_tb(unsigned char count)
{
	/*
	 * At least three values are needed to distinguish free (0),
	 * used (count > 0 && count < SWP_TB_COUNT_MAX), and
	 * overflow (count == SWP_TB_COUNT_MAX).
	 */
	BUILD_BUG_ON(SWP_TB_COUNT_MAX < 2 || SWP_TB_COUNT_BITS < 2);
	VM_WARN_ON(count > SWP_TB_COUNT_MAX);
	return ((unsigned long)count) << SWP_TB_COUNT_SHIFT;
}
104 
pfn_to_swp_tb(unsigned long pfn,unsigned int count)105 static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int count)
106 {
107 	unsigned long swp_tb;
108 
109 	BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *));
110 	BUILD_BUG_ON(SWAP_CACHE_PFN_BITS >
111 		     (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS));
112 
113 	swp_tb = (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK;
114 	VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK);
115 
116 	return swp_tb | __count_to_swp_tb(count);
117 }
118 
/* Build a "cached" swap table entry from a folio and a swap count. */
static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned int count)
{
	unsigned long pfn = folio_pfn(folio);

	return pfn_to_swp_tb(pfn, count);
}
123 
/*
 * Build a "swapped out" entry from a workingset shadow and a swap count.
 * @shadow may be NULL when shadow info is absent; the mark is still applied
 * so the entry keeps the shadow format as a placeholder.
 */
static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int count)
{
	/* An xa_value uses exactly one bit fewer than an unsigned long */
	BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) !=
		     BITS_PER_BYTE * sizeof(unsigned long));
	/* The shadow mark must coincide with the XA_VALUE tag bit */
	BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK);

	VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow));
	/* The shadow value must not collide with the SWAP_COUNT bits */
	VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK));

	return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_MARK;
}
135 
136 /*
137  * Helpers for swap table entry type checking.
138  */
/* True if the entry marks a free, unallocated slot (all bits clear). */
static inline bool swp_tb_is_null(unsigned long swp_tb)
{
	return swp_tb == 0;
}
143 
swp_tb_is_folio(unsigned long swp_tb)144 static inline bool swp_tb_is_folio(unsigned long swp_tb)
145 {
146 	return ((swp_tb & SWP_TB_PFN_MARK_MASK) == SWP_TB_PFN_MARK);
147 }
148 
/* True if the entry is a workingset shadow (XA_VALUE format). */
static inline bool swp_tb_is_shadow(unsigned long swp_tb)
{
	void *entry = (void *)swp_tb;

	return xa_is_value(entry);
}
153 
/* True if the entry marks a bad slot (swap header or a device hole). */
static inline bool swp_tb_is_bad(unsigned long swp_tb)
{
	return swp_tb == SWP_TB_BAD;
}
158 
/* True if the entry type carries a SWAP_COUNT field (free slots count too). */
static inline bool swp_tb_is_countable(unsigned long swp_tb)
{
	if (swp_tb_is_null(swp_tb))
		return true;

	return swp_tb_is_shadow(swp_tb) || swp_tb_is_folio(swp_tb);
}
164 
165 /*
166  * Helpers for retrieving info from swap table.
167  */
swp_tb_to_folio(unsigned long swp_tb)168 static inline struct folio *swp_tb_to_folio(unsigned long swp_tb)
169 {
170 	VM_WARN_ON(!swp_tb_is_folio(swp_tb));
171 	return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS);
172 }
173 
/* Extract the workingset shadow (an xa_value) from a shadow-type entry. */
static inline void *swp_tb_to_shadow(unsigned long swp_tb)
{
	VM_WARN_ON(!swp_tb_is_shadow(swp_tb));
	/* No shift needed, xa_value is stored as it is in the lower bits. */
	return (void *)(swp_tb & ~SWP_TB_COUNT_MASK);
}
180 
__swp_tb_get_count(unsigned long swp_tb)181 static inline unsigned char __swp_tb_get_count(unsigned long swp_tb)
182 {
183 	VM_WARN_ON(!swp_tb_is_countable(swp_tb));
184 	return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT);
185 }
186 
swp_tb_get_count(unsigned long swp_tb)187 static inline int swp_tb_get_count(unsigned long swp_tb)
188 {
189 	if (swp_tb_is_countable(swp_tb))
190 		return __swp_tb_get_count(swp_tb);
191 	return -EINVAL;
192 }
193 
__swp_tb_mk_count(unsigned long swp_tb,int count)194 static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count)
195 {
196 	return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count));
197 }
198 
199 /*
200  * Helpers for accessing or modifying the swap table of a cluster,
201  * the swap cluster must be locked.
202  */
/*
 * Store @swp_tb at slot @off of @ci's swap table.
 * The cluster lock must be held; it also keeps the table pointer stable,
 * so a protected (non-RCU-read-side) dereference is sufficient.
 */
static inline void __swap_table_set(struct swap_cluster_info *ci,
				    unsigned int off, unsigned long swp_tb)
{
	atomic_long_t *table = rcu_dereference_protected(ci->table, true);

	lockdep_assert_held(&ci->lock);
	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
	atomic_long_set(&table[off], swp_tb);
}
212 
/*
 * Exchange the entry at slot @off of @ci's swap table with @swp_tb and
 * return the previous value. The cluster lock must be held.
 */
static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
					      unsigned int off, unsigned long swp_tb)
{
	atomic_long_t *table = rcu_dereference_protected(ci->table, true);

	lockdep_assert_held(&ci->lock);
	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
	/* Ordering is guaranteed by cluster lock, relax */
	return atomic_long_xchg_relaxed(&table[off], swp_tb);
}
223 
/*
 * Read the entry at slot @off of @ci's swap table. The caller must be in
 * an RCU read-side critical section or hold the cluster lock (the latter
 * is what rcu_dereference_check() accepts here).
 */
static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
					     unsigned int off)
{
	atomic_long_t *table;

	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
	table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));

	return atomic_long_read(&table[off]);
}
234 
/*
 * Lockless read of the entry at slot @off of @ci's swap table.
 * Takes its own RCU read lock; if the table has been freed (pointer is
 * NULL), the slot reads back as a NULL entry.
 */
static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
					unsigned int off)
{
	atomic_long_t *table;
	unsigned long swp_tb;

	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);

	rcu_read_lock();
	table = rcu_dereference(ci->table);
	swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
	rcu_read_unlock();

	return swp_tb;
}
250 #endif
251