1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints as to which node(s) memory
9 * should be allocated on.
10 *
11 * Support six policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a per-process counter
20 * is used.
21 *
22 * weighted interleave
23 * Allocate memory interleaved over a set of nodes based on
24 * a set of weights (per-node), with normal fallback if it
25 * fails. Otherwise operates the same as interleave.
26 * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27 * on node 0 for every 1 page allocated on node 1.
28 *
29 * bind Only allocate memory on a specific set of nodes,
30 * no fallback.
31 * FIXME: memory is allocated starting with the first node
32 * to the last. It would be better if bind truly restricted
33 * the allocation to the specified memory nodes instead.
34 *
35 * preferred Try a specific node first before normal fallback.
36 * As a special case NUMA_NO_NODE here means do the allocation
37 * on the local CPU. This is normally identical to default,
38 * but useful to set in a VMA when you have a non default
39 * process policy.
40 *
41 * preferred many Try a set of nodes first before normal fallback. This is
42 * similar to preferred without the special case.
43 *
44 * default Allocate on the local node first, or when on a VMA
45 * use the process policy. This is what Linux always did
46 * in a NUMA aware kernel and still does by, ahem, default.
47 *
48 * The process policy is applied for most non-interrupt memory allocations
49 * in that process' context. Interrupts ignore the policies and always
50 * try to allocate on the local CPU. The VMA policy is only applied for memory
51 * allocations for a VMA in the VM.
52 *
53 * Currently there are a few corner cases in swapping where the policy
54 * is not applied, but the majority should be handled. When process policy
55 * is used it is not remembered over swap outs/swap ins.
56 *
57 * Only the highest zone in the zone hierarchy gets policied. Allocations
58 * requesting a lower zone just use default policy. This implies that
59 * on systems with highmem, kernel lowmem allocations don't get policied.
60 * Same with GFP_DMA allocations.
61 *
62 * For shmem/tmpfs shared memory the policy is shared between
63 * all users and remembered even when nobody has memory mapped.
64 */
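/*
 * Illustrative only: a minimal userspace sketch of how the modes above are
 * typically requested, assuming the set_mempolicy()/mbind() wrappers declared
 * in libnuma's <numaif.h>; "addr" and "length" below are placeholders for an
 * existing mapping, and the maxnode convention follows the man pages. Not
 * part of this file.
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node0   = 1UL << 0;
 *
 *	// Process policy: interleave future allocations over nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, sizeof(nodes01) * 8 + 1);
 *
 *	// VMA policy: restrict an existing mapping to node 0 only.
 *	mbind(addr, length, MPOL_BIND, &node0, sizeof(node0) * 8 + 1, 0);
 */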
65
66 /* Notebook:
67 fix mmap readahead to honour policy and enable policy for any page cache
68 object
69 statistics for bigpages
70 global policy for page cache? currently it uses process policy. Requires
71 first item above.
72 handle mremap for shared memory (currently ignored for the policy)
73 grows down?
74 make bind policy root only? It can trigger oom much faster and the
75 kernel is not always grateful with that.
76 */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/task.h>
89 #include <linux/nodemask.h>
90 #include <linux/cpuset.h>
91 #include <linux/slab.h>
92 #include <linux/string.h>
93 #include <linux/export.h>
94 #include <linux/nsproxy.h>
95 #include <linux/interrupt.h>
96 #include <linux/init.h>
97 #include <linux/compat.h>
98 #include <linux/ptrace.h>
99 #include <linux/swap.h>
100 #include <linux/seq_file.h>
101 #include <linux/proc_fs.h>
102 #include <linux/migrate.h>
103 #include <linux/ksm.h>
104 #include <linux/rmap.h>
105 #include <linux/security.h>
106 #include <linux/syscalls.h>
107 #include <linux/ctype.h>
108 #include <linux/mm_inline.h>
109 #include <linux/mmu_notifier.h>
110 #include <linux/printk.h>
111 #include <linux/swapops.h>
112 #include <linux/gcd.h>
113
114 #include <asm/tlbflush.h>
115 #include <asm/tlb.h>
116 #include <linux/uaccess.h>
117 #include <linux/memory.h>
118
119 #include "internal.h"
120
121 /* Internal flags */
122 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
123 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
124 #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
125
126 static struct kmem_cache *policy_cache;
127 static struct kmem_cache *sn_cache;
128
129 /* Highest zone. A specific allocation for a zone below that is not
130 policied. */
131 enum zone_type policy_zone = 0;
132
133 /*
134 * run-time system-wide default policy => local allocation
135 */
136 static struct mempolicy default_policy = {
137 .refcnt = ATOMIC_INIT(1), /* never free it */
138 .mode = MPOL_LOCAL,
139 };
140
141 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
142
143 /*
144 * weightiness balances the tradeoff between small weights (cycles through nodes
145 * faster, more fair/even distribution) and large weights (smaller errors
146 * between actual bandwidth ratios and weight ratios). 32 is a number that has
147 * been found to perform at a reasonable compromise between the two goals.
148 */
149 static const int weightiness = 32;
150
151 /*
152 * A NULL weighted_interleave_state is interpreted as having .mode_auto = true,
153 * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
154 */
155 struct weighted_interleave_state {
156 bool mode_auto;
157 u8 iw_table[];
158 };
159 static struct weighted_interleave_state __rcu *wi_state;
160 static unsigned int *node_bw_table;
161
162 /*
163 * wi_state_lock protects both wi_state and node_bw_table.
164 * node_bw_table is only used by writers to update wi_state.
165 */
166 static DEFINE_MUTEX(wi_state_lock);
167
168 static u8 get_il_weight(int node)
169 {
170 struct weighted_interleave_state *state;
171 u8 weight = 1;
172
173 rcu_read_lock();
174 state = rcu_dereference(wi_state);
175 if (state)
176 weight = state->iw_table[node];
177 rcu_read_unlock();
178 return weight;
179 }
180
181 /*
182 * Convert bandwidth values into weighted interleave weights.
183 * Call with wi_state_lock.
184 */
185 static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
186 {
187 u64 sum_bw = 0;
188 unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
189 int nid;
190
191 for_each_node_state(nid, N_MEMORY)
192 sum_bw += bw[nid];
193
194 /* Scale bandwidths to whole numbers in the range [1, weightiness] */
195 for_each_node_state(nid, N_MEMORY) {
196 /*
197 * Try not to perform 64-bit division.
198 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
199 * If sum_bw > scaling_factor, then round the weight up to 1.
200 */
201 scaling_factor = weightiness * bw[nid];
202 if (bw[nid] && sum_bw < scaling_factor) {
203 cast_sum_bw = (unsigned int)sum_bw;
204 new_iw[nid] = scaling_factor / cast_sum_bw;
205 } else {
206 new_iw[nid] = 1;
207 }
208 if (!iw_gcd)
209 iw_gcd = new_iw[nid];
210 iw_gcd = gcd(iw_gcd, new_iw[nid]);
211 }
212
213 /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
214 for_each_node_state(nid, N_MEMORY)
215 new_iw[nid] /= iw_gcd;
216 }
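/*
 * Worked example of the reduction above (values chosen for illustration
 * only): with N_MEMORY = {0, 1} and measured bandwidths bw[0] = 60 and
 * bw[1] = 20 (sum_bw = 80), the scaled weights are
 *	new_iw[0] = (32 * 60) / 80 = 24
 *	new_iw[1] = (32 * 20) / 80 = 8
 * and dividing by their GCD (8) yields the final 3:1 interleave ratio.
 */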
217
218 int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
219 {
220 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
221 unsigned int *old_bw, *new_bw;
222 unsigned int bw_val;
223 int i;
224
225 bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
226 new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
227 if (!new_bw)
228 return -ENOMEM;
229
230 new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
231 GFP_KERNEL);
232 if (!new_wi_state) {
233 kfree(new_bw);
234 return -ENOMEM;
235 }
236 new_wi_state->mode_auto = true;
237 for (i = 0; i < nr_node_ids; i++)
238 new_wi_state->iw_table[i] = 1;
239
240 /*
241 * Update bandwidth info, even in manual mode. That way, when switching
242 * to auto mode in the future, iw_table can be overwritten using
243 * accurate bw data.
244 */
245 mutex_lock(&wi_state_lock);
246
247 old_bw = node_bw_table;
248 if (old_bw)
249 memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
250 new_bw[node] = bw_val;
251 node_bw_table = new_bw;
252
253 old_wi_state = rcu_dereference_protected(wi_state,
254 lockdep_is_held(&wi_state_lock));
255 if (old_wi_state && !old_wi_state->mode_auto) {
256 /* Manual mode; skip reducing weights and updating wi_state */
257 mutex_unlock(&wi_state_lock);
258 kfree(new_wi_state);
259 goto out;
260 }
261
262 /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
263 reduce_interleave_weights(new_bw, new_wi_state->iw_table);
264 rcu_assign_pointer(wi_state, new_wi_state);
265
266 mutex_unlock(&wi_state_lock);
267 if (old_wi_state) {
268 synchronize_rcu();
269 kfree(old_wi_state);
270 }
271 out:
272 kfree(old_bw);
273 return 0;
274 }
275
276 /**
277 * numa_nearest_node - Find nearest node by state
278 * @node: Node id to start the search
279 * @state: State to filter the search
280 *
281 * Lookup the closest node by distance if @node is not in @state.
282 *
283 * Return: @node if it is in @state, otherwise the closest node by distance.
284 */
285 int numa_nearest_node(int node, unsigned int state)
286 {
287 int min_dist = INT_MAX, dist, n, min_node;
288
289 if (state >= NR_NODE_STATES)
290 return -EINVAL;
291
292 if (node == NUMA_NO_NODE || node_state(node, state))
293 return node;
294
295 min_node = node;
296 for_each_node_state(n, state) {
297 dist = node_distance(node, n);
298 if (dist < min_dist) {
299 min_dist = dist;
300 min_node = n;
301 }
302 }
303
304 return min_node;
305 }
306 EXPORT_SYMBOL_GPL(numa_nearest_node);
307
308 /**
309 * nearest_node_nodemask - Find the node in @mask at the nearest distance
310 * from @node.
311 *
312 * @node: a valid node ID to start the search from.
313 * @mask: a pointer to a nodemask representing the allowed nodes.
314 *
315 * This function iterates over all nodes in @mask and calculates the
316 * distance from the starting @node, then it returns the node ID that is
317 * the closest to @node, or MAX_NUMNODES if no node is found.
318 *
319 * Note that @node must be a valid node ID usable with node_distance(),
320 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
321 * or unexpected behavior.
322 */
323 int nearest_node_nodemask(int node, nodemask_t *mask)
324 {
325 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
326
327 for_each_node_mask(n, *mask) {
328 dist = node_distance(node, n);
329 if (dist < min_dist) {
330 min_dist = dist;
331 min_node = n;
332 }
333 }
334
335 return min_node;
336 }
337 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
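/*
 * Usage sketch for the two exported helpers above (illustrative only, not
 * taken from a real caller); "allowed" is a nodemask_t the caller holds:
 *
 *	// closest node with memory to the CPU we are running on
 *	int nid = numa_nearest_node(numa_node_id(), N_MEMORY);
 *
 *	// closest node to it within an explicit candidate mask;
 *	// returns MAX_NUMNODES if "allowed" is empty
 *	int best = nearest_node_nodemask(nid, &allowed);
 */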
338
339 struct mempolicy *get_task_policy(struct task_struct *p)
340 {
341 struct mempolicy *pol = p->mempolicy;
342 int node;
343
344 if (pol)
345 return pol;
346
347 node = numa_node_id();
348 if (node != NUMA_NO_NODE) {
349 pol = &preferred_node_policy[node];
350 /* preferred_node_policy is not initialised early in boot */
351 if (pol->mode)
352 return pol;
353 }
354
355 return &default_policy;
356 }
357
358 static const struct mempolicy_operations {
359 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
360 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
361 } mpol_ops[MPOL_MAX];
362
363 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
364 {
365 return pol->flags & MPOL_MODE_FLAGS;
366 }
367
368 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
369 const nodemask_t *rel)
370 {
371 nodemask_t tmp;
372 nodes_fold(tmp, *orig, nodes_weight(*rel));
373 nodes_onto(*ret, tmp, *rel);
374 }
375
376 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
377 {
378 if (nodes_empty(*nodes))
379 return -EINVAL;
380 pol->nodes = *nodes;
381 return 0;
382 }
383
384 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
385 {
386 if (nodes_empty(*nodes))
387 return -EINVAL;
388
389 nodes_clear(pol->nodes);
390 node_set(first_node(*nodes), pol->nodes);
391 return 0;
392 }
393
394 /*
395 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
396 * any, for the new policy. mpol_new() has already validated the nodes
397 * parameter with respect to the policy mode and flags.
398 *
399 * Must be called holding task's alloc_lock to protect task's mems_allowed
400 * and mempolicy. May also be called holding the mmap_lock for write.
401 */
402 static int mpol_set_nodemask(struct mempolicy *pol,
403 const nodemask_t *nodes, struct nodemask_scratch *nsc)
404 {
405 int ret;
406
407 /*
408 * Default (pol==NULL) and local memory policies are not
409 * subject to any remapping. They also do not need any special
410 * constructor.
411 */
412 if (!pol || pol->mode == MPOL_LOCAL)
413 return 0;
414
415 /* Check N_MEMORY */
416 nodes_and(nsc->mask1,
417 cpuset_current_mems_allowed, node_states[N_MEMORY]);
418
419 VM_BUG_ON(!nodes);
420
421 if (pol->flags & MPOL_F_RELATIVE_NODES)
422 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
423 else
424 nodes_and(nsc->mask2, *nodes, nsc->mask1);
425
426 if (mpol_store_user_nodemask(pol))
427 pol->w.user_nodemask = *nodes;
428 else
429 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
430
431 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
432 return ret;
433 }
434
435 /*
436 * This function just creates a new policy, does some checks and simple
437 * initialization. You must invoke mpol_set_nodemask() to set nodes.
438 */
439 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
440 nodemask_t *nodes)
441 {
442 struct mempolicy *policy;
443
444 if (mode == MPOL_DEFAULT) {
445 if (nodes && !nodes_empty(*nodes))
446 return ERR_PTR(-EINVAL);
447 return NULL;
448 }
449 VM_BUG_ON(!nodes);
450
451 /*
452 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
453 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
454 * All other modes require a valid pointer to a non-empty nodemask.
455 */
456 if (mode == MPOL_PREFERRED) {
457 if (nodes_empty(*nodes)) {
458 if (((flags & MPOL_F_STATIC_NODES) ||
459 (flags & MPOL_F_RELATIVE_NODES)))
460 return ERR_PTR(-EINVAL);
461
462 mode = MPOL_LOCAL;
463 }
464 } else if (mode == MPOL_LOCAL) {
465 if (!nodes_empty(*nodes) ||
466 (flags & MPOL_F_STATIC_NODES) ||
467 (flags & MPOL_F_RELATIVE_NODES))
468 return ERR_PTR(-EINVAL);
469 } else if (nodes_empty(*nodes))
470 return ERR_PTR(-EINVAL);
471
472 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
473 if (!policy)
474 return ERR_PTR(-ENOMEM);
475 atomic_set(&policy->refcnt, 1);
476 policy->mode = mode;
477 policy->flags = flags;
478 policy->home_node = NUMA_NO_NODE;
479
480 return policy;
481 }
482
483 /* Slow path of a mpol destructor. */
484 void __mpol_put(struct mempolicy *pol)
485 {
486 if (!atomic_dec_and_test(&pol->refcnt))
487 return;
488 kmem_cache_free(policy_cache, pol);
489 }
490
491 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
492 {
493 }
494
495 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
496 {
497 nodemask_t tmp;
498
499 if (pol->flags & MPOL_F_STATIC_NODES)
500 nodes_and(tmp, pol->w.user_nodemask, *nodes);
501 else if (pol->flags & MPOL_F_RELATIVE_NODES)
502 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
503 else {
504 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
505 *nodes);
506 pol->w.cpuset_mems_allowed = *nodes;
507 }
508
509 if (nodes_empty(tmp))
510 tmp = *nodes;
511
512 pol->nodes = tmp;
513 }
514
515 static void mpol_rebind_preferred(struct mempolicy *pol,
516 const nodemask_t *nodes)
517 {
518 pol->w.cpuset_mems_allowed = *nodes;
519 }
520
521 /*
522 * mpol_rebind_policy - Migrate a policy to a different set of nodes
523 *
524 * Per-vma policies are protected by mmap_lock. Allocations using per-task
525 * policies are protected by task->mems_allowed_seq to prevent a premature
526 * OOM/allocation failure due to parallel nodemask modification.
527 */
528 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
529 {
530 if (!pol || pol->mode == MPOL_LOCAL)
531 return;
532 if (!mpol_store_user_nodemask(pol) &&
533 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
534 return;
535
536 mpol_ops[pol->mode].rebind(pol, newmask);
537 }
538
539 /*
540 * Wrapper for mpol_rebind_policy() that just requires task
541 * pointer, and updates task mempolicy.
542 *
543 * Called with task's alloc_lock held.
544 */
545 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
546 {
547 mpol_rebind_policy(tsk->mempolicy, new);
548 }
549
550 /*
551 * Rebind each vma in mm to new nodemask.
552 *
553 * Call holding a reference to mm. Takes mm->mmap_lock during call.
554 */
555 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
556 {
557 struct vm_area_struct *vma;
558 VMA_ITERATOR(vmi, mm, 0);
559
560 mmap_write_lock(mm);
561 for_each_vma(vmi, vma) {
562 vma_start_write(vma);
563 mpol_rebind_policy(vma->vm_policy, new);
564 }
565 mmap_write_unlock(mm);
566 }
567
568 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
569 [MPOL_DEFAULT] = {
570 .rebind = mpol_rebind_default,
571 },
572 [MPOL_INTERLEAVE] = {
573 .create = mpol_new_nodemask,
574 .rebind = mpol_rebind_nodemask,
575 },
576 [MPOL_PREFERRED] = {
577 .create = mpol_new_preferred,
578 .rebind = mpol_rebind_preferred,
579 },
580 [MPOL_BIND] = {
581 .create = mpol_new_nodemask,
582 .rebind = mpol_rebind_nodemask,
583 },
584 [MPOL_LOCAL] = {
585 .rebind = mpol_rebind_default,
586 },
587 [MPOL_PREFERRED_MANY] = {
588 .create = mpol_new_nodemask,
589 .rebind = mpol_rebind_preferred,
590 },
591 [MPOL_WEIGHTED_INTERLEAVE] = {
592 .create = mpol_new_nodemask,
593 .rebind = mpol_rebind_nodemask,
594 },
595 };
596
597 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
598 unsigned long flags);
599 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
600 pgoff_t ilx, int *nid);
601
602 static bool strictly_unmovable(unsigned long flags)
603 {
604 /*
605 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
606 * if any misplaced page is found.
607 */
608 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
609 MPOL_MF_STRICT;
610 }
611
612 struct migration_mpol { /* for alloc_migration_target_by_mpol() */
613 struct mempolicy *pol;
614 pgoff_t ilx;
615 };
616
617 struct queue_pages {
618 struct list_head *pagelist;
619 unsigned long flags;
620 nodemask_t *nmask;
621 unsigned long start;
622 unsigned long end;
623 struct vm_area_struct *first;
624 struct folio *large; /* note last large folio encountered */
625 long nr_failed; /* could not be isolated at this time */
626 };
627
628 /*
629 * Check if the folio's nid is in qp->nmask.
630 *
631 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
632 * in the invert of qp->nmask.
633 */
634 static inline bool queue_folio_required(struct folio *folio,
635 struct queue_pages *qp)
636 {
637 int nid = folio_nid(folio);
638 unsigned long flags = qp->flags;
639
640 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
641 }
642
643 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
644 {
645 struct folio *folio;
646 struct queue_pages *qp = walk->private;
647
648 if (unlikely(is_pmd_migration_entry(*pmd))) {
649 qp->nr_failed++;
650 return;
651 }
652 folio = pmd_folio(*pmd);
653 if (is_huge_zero_folio(folio)) {
654 walk->action = ACTION_CONTINUE;
655 return;
656 }
657 if (!queue_folio_required(folio, qp))
658 return;
659 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
660 !vma_migratable(walk->vma) ||
661 !migrate_folio_add(folio, qp->pagelist, qp->flags))
662 qp->nr_failed++;
663 }
664
665 /*
666 * Scan through folios, checking if they satisfy the required conditions,
667 * moving them from the LRU to a local pagelist for migration if they do
668 * (or, with MPOL_MF_INVERT, if they do not).
668 *
669 * queue_folios_pte_range() has two possible return values:
670 * 0 - continue walking to scan for more, even if an existing folio on the
671 * wrong node could not be isolated and queued for migration.
672 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
673 * and an existing folio was on a node that does not follow the policy.
674 */
675 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
676 unsigned long end, struct mm_walk *walk)
677 {
678 struct vm_area_struct *vma = walk->vma;
679 struct folio *folio;
680 struct queue_pages *qp = walk->private;
681 unsigned long flags = qp->flags;
682 pte_t *pte, *mapped_pte;
683 pte_t ptent;
684 spinlock_t *ptl;
685 int max_nr, nr;
686
687 ptl = pmd_trans_huge_lock(pmd, vma);
688 if (ptl) {
689 queue_folios_pmd(pmd, walk);
690 spin_unlock(ptl);
691 goto out;
692 }
693
694 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
695 if (!pte) {
696 walk->action = ACTION_AGAIN;
697 return 0;
698 }
699 for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
700 max_nr = (end - addr) >> PAGE_SHIFT;
701 nr = 1;
702 ptent = ptep_get(pte);
703 if (pte_none(ptent))
704 continue;
705 if (!pte_present(ptent)) {
706 if (is_migration_entry(pte_to_swp_entry(ptent)))
707 qp->nr_failed++;
708 continue;
709 }
710 folio = vm_normal_folio(vma, addr, ptent);
711 if (!folio || folio_is_zone_device(folio))
712 continue;
713 if (folio_test_large(folio) && max_nr != 1)
714 nr = folio_pte_batch(folio, pte, ptent, max_nr);
715 /*
716 * vm_normal_folio() filters out zero pages, but there might
717 * still be reserved folios to skip, perhaps in a VDSO.
718 */
719 if (folio_test_reserved(folio))
720 continue;
721 if (!queue_folio_required(folio, qp))
722 continue;
723 if (folio_test_large(folio)) {
724 /*
725 * A large folio can only be isolated from LRU once,
726 * but may be mapped by many PTEs (and Copy-On-Write may
727 * intersperse PTEs of other, order 0, folios). This is
728 * a common case, so don't mistake it for failure (but
729 * there can be other cases of multi-mapped pages which
730 * this quick check does not help to filter out - and a
731 * search of the pagelist might grow to be prohibitive).
732 *
733 * migrate_pages(&pagelist) returns nr_failed folios, so
734 * check "large" now so that queue_pages_range() returns
735 * a comparable nr_failed folios. This does imply that
736 * if folio could not be isolated for some racy reason
737 * at its first PTE, later PTEs will not give it another
738 * chance of isolation; but keeps the accounting simple.
739 */
740 if (folio == qp->large)
741 continue;
742 qp->large = folio;
743 }
744 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
745 !vma_migratable(vma) ||
746 !migrate_folio_add(folio, qp->pagelist, flags)) {
747 qp->nr_failed += nr;
748 if (strictly_unmovable(flags))
749 break;
750 }
751 }
752 pte_unmap_unlock(mapped_pte, ptl);
753 cond_resched();
754 out:
755 if (qp->nr_failed && strictly_unmovable(flags))
756 return -EIO;
757 return 0;
758 }
759
760 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
761 unsigned long addr, unsigned long end,
762 struct mm_walk *walk)
763 {
764 #ifdef CONFIG_HUGETLB_PAGE
765 struct queue_pages *qp = walk->private;
766 unsigned long flags = qp->flags;
767 struct folio *folio;
768 spinlock_t *ptl;
769 pte_t entry;
770
771 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
772 entry = huge_ptep_get(walk->mm, addr, pte);
773 if (!pte_present(entry)) {
774 if (unlikely(is_hugetlb_entry_migration(entry)))
775 qp->nr_failed++;
776 goto unlock;
777 }
778 folio = pfn_folio(pte_pfn(entry));
779 if (!queue_folio_required(folio, qp))
780 goto unlock;
781 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
782 !vma_migratable(walk->vma)) {
783 qp->nr_failed++;
784 goto unlock;
785 }
786 /*
787 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
788 * Choosing not to migrate a shared folio is not counted as a failure.
789 *
790 * See folio_maybe_mapped_shared() on possible imprecision when we
791 * cannot easily detect if a folio is shared.
792 */
793 if ((flags & MPOL_MF_MOVE_ALL) ||
794 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
795 if (!folio_isolate_hugetlb(folio, qp->pagelist))
796 qp->nr_failed++;
797 unlock:
798 spin_unlock(ptl);
799 if (qp->nr_failed && strictly_unmovable(flags))
800 return -EIO;
801 #endif
802 return 0;
803 }
804
805 #ifdef CONFIG_NUMA_BALANCING
806 /*
807 * This is used to mark a range of virtual addresses to be inaccessible.
808 * These are later cleared by a NUMA hinting fault. Depending on these
809 * faults, pages may be migrated for better NUMA placement.
810 *
811 * This is assuming that NUMA faults are handled using PROT_NONE. If
812 * an architecture makes a different choice, it will need further
813 * changes to the core.
814 */
815 unsigned long change_prot_numa(struct vm_area_struct *vma,
816 unsigned long addr, unsigned long end)
817 {
818 struct mmu_gather tlb;
819 long nr_updated;
820
821 tlb_gather_mmu(&tlb, vma->vm_mm);
822
823 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
824 if (nr_updated > 0) {
825 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
826 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
827 }
828
829 tlb_finish_mmu(&tlb);
830
831 return nr_updated;
832 }
833 #endif /* CONFIG_NUMA_BALANCING */
834
835 static int queue_pages_test_walk(unsigned long start, unsigned long end,
836 struct mm_walk *walk)
837 {
838 struct vm_area_struct *next, *vma = walk->vma;
839 struct queue_pages *qp = walk->private;
840 unsigned long flags = qp->flags;
841
842 /* range check first */
843 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
844
845 if (!qp->first) {
846 qp->first = vma;
847 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
848 (qp->start < vma->vm_start))
849 /* hole at head side of range */
850 return -EFAULT;
851 }
852 next = find_vma(vma->vm_mm, vma->vm_end);
853 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
854 ((vma->vm_end < qp->end) &&
855 (!next || vma->vm_end < next->vm_start)))
856 /* hole at middle or tail of range */
857 return -EFAULT;
858
859 /*
860 * We need to check MPOL_MF_STRICT so that -EIO can be returned
861 * regardless of vma_migratable().
862 */
863 if (!vma_migratable(vma) &&
864 !(flags & MPOL_MF_STRICT))
865 return 1;
866
867 /*
868 * Check page nodes, and queue pages to move, in the current vma.
869 * But if neither moving nor strict checking is requested, the scan can be skipped.
870 */
871 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
872 return 0;
873 return 1;
874 }
875
876 static const struct mm_walk_ops queue_pages_walk_ops = {
877 .hugetlb_entry = queue_folios_hugetlb,
878 .pmd_entry = queue_folios_pte_range,
879 .test_walk = queue_pages_test_walk,
880 .walk_lock = PGWALK_RDLOCK,
881 };
882
883 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
884 .hugetlb_entry = queue_folios_hugetlb,
885 .pmd_entry = queue_folios_pte_range,
886 .test_walk = queue_pages_test_walk,
887 .walk_lock = PGWALK_WRLOCK,
888 };
889
890 /*
891 * Walk through page tables and collect pages to be migrated.
892 *
893 * If pages found in a given range are not on the required set of @nodes,
894 * and migration is allowed, they are isolated and queued to @pagelist.
895 *
896 * queue_pages_range() may return:
897 * 0 - all pages already on the right node, or successfully queued for moving
898 * (or neither strict checking nor moving requested: only range checking).
899 * >0 - this number of misplaced folios could not be queued for moving
900 * (a hugetlbfs page or a transparent huge page being counted as 1).
901 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
902 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
903 */
904 static long
905 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
906 nodemask_t *nodes, unsigned long flags,
907 struct list_head *pagelist)
908 {
909 int err;
910 struct queue_pages qp = {
911 .pagelist = pagelist,
912 .flags = flags,
913 .nmask = nodes,
914 .start = start,
915 .end = end,
916 .first = NULL,
917 };
918 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
919 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
920
921 err = walk_page_range(mm, start, end, ops, &qp);
922
923 if (!qp.first)
924 /* whole range in hole */
925 err = -EFAULT;
926
927 return err ? : qp.nr_failed;
928 }
929
930 /*
931 * Apply policy to a single VMA
932 * This must be called with the mmap_lock held for writing.
933 */
934 static int vma_replace_policy(struct vm_area_struct *vma,
935 struct mempolicy *pol)
936 {
937 int err;
938 struct mempolicy *old;
939 struct mempolicy *new;
940
941 vma_assert_write_locked(vma);
942
943 new = mpol_dup(pol);
944 if (IS_ERR(new))
945 return PTR_ERR(new);
946
947 if (vma->vm_ops && vma->vm_ops->set_policy) {
948 err = vma->vm_ops->set_policy(vma, new);
949 if (err)
950 goto err_out;
951 }
952
953 old = vma->vm_policy;
954 vma->vm_policy = new; /* protected by mmap_lock */
955 mpol_put(old);
956
957 return 0;
958 err_out:
959 mpol_put(new);
960 return err;
961 }
962
963 /* Split or merge the VMA (if required) and apply the new policy */
964 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
965 struct vm_area_struct **prev, unsigned long start,
966 unsigned long end, struct mempolicy *new_pol)
967 {
968 unsigned long vmstart, vmend;
969
970 vmend = min(end, vma->vm_end);
971 if (start > vma->vm_start) {
972 *prev = vma;
973 vmstart = start;
974 } else {
975 vmstart = vma->vm_start;
976 }
977
978 if (mpol_equal(vma->vm_policy, new_pol)) {
979 *prev = vma;
980 return 0;
981 }
982
983 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
984 if (IS_ERR(vma))
985 return PTR_ERR(vma);
986
987 *prev = vma;
988 return vma_replace_policy(vma, new_pol);
989 }
990
991 /* Set the process memory policy */
992 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
993 nodemask_t *nodes)
994 {
995 struct mempolicy *new, *old;
996 NODEMASK_SCRATCH(scratch);
997 int ret;
998
999 if (!scratch)
1000 return -ENOMEM;
1001
1002 new = mpol_new(mode, flags, nodes);
1003 if (IS_ERR(new)) {
1004 ret = PTR_ERR(new);
1005 goto out;
1006 }
1007
1008 task_lock(current);
1009 ret = mpol_set_nodemask(new, nodes, scratch);
1010 if (ret) {
1011 task_unlock(current);
1012 mpol_put(new);
1013 goto out;
1014 }
1015
1016 old = current->mempolicy;
1017 current->mempolicy = new;
1018 if (new && (new->mode == MPOL_INTERLEAVE ||
1019 new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
1020 current->il_prev = MAX_NUMNODES-1;
1021 current->il_weight = 0;
1022 }
1023 task_unlock(current);
1024 mpol_put(old);
1025 ret = 0;
1026 out:
1027 NODEMASK_SCRATCH_FREE(scratch);
1028 return ret;
1029 }
1030
1031 /*
1032 * Return nodemask for policy for get_mempolicy() query
1033 *
1034 * Called with task's alloc_lock held
1035 */
1036 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1037 {
1038 nodes_clear(*nodes);
1039 if (pol == &default_policy)
1040 return;
1041
1042 switch (pol->mode) {
1043 case MPOL_BIND:
1044 case MPOL_INTERLEAVE:
1045 case MPOL_PREFERRED:
1046 case MPOL_PREFERRED_MANY:
1047 case MPOL_WEIGHTED_INTERLEAVE:
1048 *nodes = pol->nodes;
1049 break;
1050 case MPOL_LOCAL:
1051 /* return empty node mask for local allocation */
1052 break;
1053 default:
1054 BUG();
1055 }
1056 }
1057
1058 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1059 {
1060 struct page *p = NULL;
1061 int ret;
1062
1063 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1064 if (ret > 0) {
1065 ret = page_to_nid(p);
1066 put_page(p);
1067 }
1068 return ret;
1069 }
1070
1071 /* Retrieve NUMA policy */
1072 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
1073 unsigned long addr, unsigned long flags)
1074 {
1075 int err;
1076 struct mm_struct *mm = current->mm;
1077 struct vm_area_struct *vma = NULL;
1078 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1079
1080 if (flags &
1081 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1082 return -EINVAL;
1083
1084 if (flags & MPOL_F_MEMS_ALLOWED) {
1085 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
1086 return -EINVAL;
1087 *policy = 0; /* just so it's initialized */
1088 task_lock(current);
1089 *nmask = cpuset_current_mems_allowed;
1090 task_unlock(current);
1091 return 0;
1092 }
1093
1094 if (flags & MPOL_F_ADDR) {
1095 pgoff_t ilx; /* ignored here */
1096 /*
1097 * Do NOT fall back to task policy if the
1098 * vma/shared policy at addr is NULL. We
1099 * want to return MPOL_DEFAULT in this case.
1100 */
1101 mmap_read_lock(mm);
1102 vma = vma_lookup(mm, addr);
1103 if (!vma) {
1104 mmap_read_unlock(mm);
1105 return -EFAULT;
1106 }
1107 pol = __get_vma_policy(vma, addr, &ilx);
1108 } else if (addr)
1109 return -EINVAL;
1110
1111 if (!pol)
1112 pol = &default_policy; /* indicates default behavior */
1113
1114 if (flags & MPOL_F_NODE) {
1115 if (flags & MPOL_F_ADDR) {
1116 /*
1117 * Take a refcount on the mpol, because we are about to
1118 * drop the mmap_lock, after which only "pol" remains
1119 * valid, "vma" is stale.
1120 */
1121 pol_refcount = pol;
1122 vma = NULL;
1123 mpol_get(pol);
1124 mmap_read_unlock(mm);
1125 err = lookup_node(mm, addr);
1126 if (err < 0)
1127 goto out;
1128 *policy = err;
1129 } else if (pol == current->mempolicy &&
1130 pol->mode == MPOL_INTERLEAVE) {
1131 *policy = next_node_in(current->il_prev, pol->nodes);
1132 } else if (pol == current->mempolicy &&
1133 pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1134 if (current->il_weight)
1135 *policy = current->il_prev;
1136 else
1137 *policy = next_node_in(current->il_prev,
1138 pol->nodes);
1139 } else {
1140 err = -EINVAL;
1141 goto out;
1142 }
1143 } else {
1144 *policy = pol == &default_policy ? MPOL_DEFAULT :
1145 pol->mode;
1146 /*
1147 * Internal mempolicy flags must be masked off before exposing
1148 * the policy to userspace.
1149 */
1150 *policy |= (pol->flags & MPOL_MODE_FLAGS);
1151 }
1152
1153 err = 0;
1154 if (nmask) {
1155 if (mpol_store_user_nodemask(pol)) {
1156 *nmask = pol->w.user_nodemask;
1157 } else {
1158 task_lock(current);
1159 get_policy_nodemask(pol, nmask);
1160 task_unlock(current);
1161 }
1162 }
1163
1164 out:
1165 mpol_cond_put(pol);
1166 if (vma)
1167 mmap_read_unlock(mm);
1168 if (pol_refcount)
1169 mpol_put(pol_refcount);
1170 return err;
1171 }
1172
1173 #ifdef CONFIG_MIGRATION
1174 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1175 unsigned long flags)
1176 {
1177 /*
1178 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1179 * Choosing not to migrate a shared folio is not counted as a failure.
1180 *
1181 * See folio_maybe_mapped_shared() on possible imprecision when we
1182 * cannot easily detect if a folio is shared.
1183 */
1184 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1185 if (folio_isolate_lru(folio)) {
1186 list_add_tail(&folio->lru, foliolist);
1187 node_stat_mod_folio(folio,
1188 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1189 folio_nr_pages(folio));
1190 } else {
1191 /*
1192 * A non-movable folio may reach here. And there may be
1193 * temporarily off-LRU folios or non-LRU movable folios.
1194 * Treat them as unmovable folios since they can't be
1195 * isolated, so they can't be moved at the moment.
1196 */
1197 return false;
1198 }
1199 }
1200 return true;
1201 }
1202
1203 /*
1204 * Migrate pages from one node to a target node.
1205 * Returns error or the number of pages not migrated.
1206 */
1207 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1208 int flags)
1209 {
1210 nodemask_t nmask;
1211 struct vm_area_struct *vma;
1212 LIST_HEAD(pagelist);
1213 long nr_failed;
1214 long err = 0;
1215 struct migration_target_control mtc = {
1216 .nid = dest,
1217 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1218 .reason = MR_SYSCALL,
1219 };
1220
1221 nodes_clear(nmask);
1222 node_set(source, nmask);
1223
1224 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1225
1226 mmap_read_lock(mm);
1227 vma = find_vma(mm, 0);
1228 if (unlikely(!vma)) {
1229 mmap_read_unlock(mm);
1230 return 0;
1231 }
1232
1233 /*
1234 * This does not migrate the range, but isolates all pages that
1235 * need migration. Between passing in the full user address
1236 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1237 * but passes back the count of pages which could not be isolated.
1238 */
1239 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1240 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1241 mmap_read_unlock(mm);
1242
1243 if (!list_empty(&pagelist)) {
1244 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1245 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1246 if (err)
1247 putback_movable_pages(&pagelist);
1248 }
1249
1250 if (err >= 0)
1251 err += nr_failed;
1252 return err;
1253 }
1254
1255 /*
1256 * Move pages between the two nodesets so as to preserve the physical
1257 * layout as much as possible.
1258 *
1259 * Returns the number of pages that could not be moved, or an error.
1260 */
1261 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1262 const nodemask_t *to, int flags)
1263 {
1264 long nr_failed = 0;
1265 long err = 0;
1266 nodemask_t tmp;
1267
1268 lru_cache_disable();
1269
1270 /*
1271 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1272 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1273 * bit in 'tmp', and return that <source, dest> pair for migration.
1274 * The pair of nodemasks 'to' and 'from' define the map.
1275 *
1276 * If no pair of bits is found that way, fallback to picking some
1277 * pair of 'source' and 'dest' bits that are not the same. If the
1278 * 'source' and 'dest' bits are the same, this represents a node
1279 * that will be migrating to itself, so no pages need move.
1280 *
1281 * If no bits are left in 'tmp', or if all remaining bits left
1282 * in 'tmp' correspond to the same bit in 'to', return false
1283 * (nothing left to migrate).
1284 *
1285 * This lets us pick a pair of nodes to migrate between, such that
1286 * if possible the dest node is not already occupied by some other
1287 * source node, minimizing the risk of overloading the memory on a
1288 * node that would happen if we migrated incoming memory to a node
1289 * before migrating outgoing memory sourced from that same node.
1290 *
1291 * A single scan of tmp is sufficient. As we go, we remember the
1292 * most recent <s, d> pair that moved (s != d). If we find a pair
1293 * that not only moved, but what's better, moved to an empty slot
1294 * (d is not set in tmp), then we break out then, with that pair.
1295 * Otherwise when we finish scanning tmp (the copy of *from), we at least have the
1296 * most recent <s, d> pair that moved. If we get all the way through
1297 * the scan of tmp without finding any node that moved, much less
1298 * moved to an empty node, then there is nothing left worth migrating.
1299 */
1300
1301 tmp = *from;
1302 while (!nodes_empty(tmp)) {
1303 int s, d;
1304 int source = NUMA_NO_NODE;
1305 int dest = 0;
1306
1307 for_each_node_mask(s, tmp) {
1308
1309 /*
1310 * do_migrate_pages() tries to maintain the relative
1311 * node relationship of the pages established between
1312 * threads and memory areas.
1313 *
1314 * However if the number of source nodes is not equal to
1315 * the number of destination nodes we can not preserve
1316 * this node relative relationship. In that case, skip
1317 * copying memory from a node that is in the destination
1318 * mask.
1319 *
1320 * Example: [2,3,4] -> [3,4,5] moves everything.
1321 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1322 */
1323
1324 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1325 (node_isset(s, *to)))
1326 continue;
1327
1328 d = node_remap(s, *from, *to);
1329 if (s == d)
1330 continue;
1331
1332 source = s; /* Node moved. Memorize */
1333 dest = d;
1334
1335 /* dest not in remaining from nodes? */
1336 if (!node_isset(dest, tmp))
1337 break;
1338 }
1339 if (source == NUMA_NO_NODE)
1340 break;
1341
1342 node_clear(source, tmp);
1343 err = migrate_to_node(mm, source, dest, flags);
1344 if (err > 0)
1345 nr_failed += err;
1346 if (err < 0)
1347 break;
1348 }
1349
1350 lru_cache_enable();
1351 if (err < 0)
1352 return err;
1353 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1354 }
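/*
 * Worked example of the pair selection above (illustrative only):
 * from = {0,1}, to = {1,2}.  On the first pass s = 0 remaps to d = 1, but
 * node 1 is still set in tmp (it has memory of its own to move), so the
 * scan continues; s = 1 remaps to d = 2, an empty destination, so 1 -> 2
 * is migrated first.  Node 1 is then cleared from tmp and the second pass
 * migrates 0 -> 1, after node 1 has already been drained.
 */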
1355
1356 /*
1357 * Allocate a new folio for page migration, according to NUMA mempolicy.
1358 */
1359 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1360 unsigned long private)
1361 {
1362 struct migration_mpol *mmpol = (struct migration_mpol *)private;
1363 struct mempolicy *pol = mmpol->pol;
1364 pgoff_t ilx = mmpol->ilx;
1365 unsigned int order;
1366 int nid = numa_node_id();
1367 gfp_t gfp;
1368
1369 order = folio_order(src);
1370 ilx += src->index >> order;
1371
1372 if (folio_test_hugetlb(src)) {
1373 nodemask_t *nodemask;
1374 struct hstate *h;
1375
1376 h = folio_hstate(src);
1377 gfp = htlb_alloc_mask(h);
1378 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1379 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1380 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1381 }
1382
1383 if (folio_test_large(src))
1384 gfp = GFP_TRANSHUGE;
1385 else
1386 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1387
1388 return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1389 }
1390 #else
1391
1392 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1393 unsigned long flags)
1394 {
1395 return false;
1396 }
1397
1398 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1399 const nodemask_t *to, int flags)
1400 {
1401 return -ENOSYS;
1402 }
1403
1404 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1405 unsigned long private)
1406 {
1407 return NULL;
1408 }
1409 #endif
1410
1411 static long do_mbind(unsigned long start, unsigned long len,
1412 unsigned short mode, unsigned short mode_flags,
1413 nodemask_t *nmask, unsigned long flags)
1414 {
1415 struct mm_struct *mm = current->mm;
1416 struct vm_area_struct *vma, *prev;
1417 struct vma_iterator vmi;
1418 struct migration_mpol mmpol;
1419 struct mempolicy *new;
1420 unsigned long end;
1421 long err;
1422 long nr_failed;
1423 LIST_HEAD(pagelist);
1424
1425 if (flags & ~(unsigned long)MPOL_MF_VALID)
1426 return -EINVAL;
1427 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1428 return -EPERM;
1429
1430 if (start & ~PAGE_MASK)
1431 return -EINVAL;
1432
1433 if (mode == MPOL_DEFAULT)
1434 flags &= ~MPOL_MF_STRICT;
1435
1436 len = PAGE_ALIGN(len);
1437 end = start + len;
1438
1439 if (end < start)
1440 return -EINVAL;
1441 if (end == start)
1442 return 0;
1443
1444 new = mpol_new(mode, mode_flags, nmask);
1445 if (IS_ERR(new))
1446 return PTR_ERR(new);
1447
1448 /*
1449 * If we are using the default policy then operation
1450 * on discontinuous address spaces is okay after all
1451 */
1452 if (!new)
1453 flags |= MPOL_MF_DISCONTIG_OK;
1454
1455 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1456 lru_cache_disable();
1457 {
1458 NODEMASK_SCRATCH(scratch);
1459 if (scratch) {
1460 mmap_write_lock(mm);
1461 err = mpol_set_nodemask(new, nmask, scratch);
1462 if (err)
1463 mmap_write_unlock(mm);
1464 } else
1465 err = -ENOMEM;
1466 NODEMASK_SCRATCH_FREE(scratch);
1467 }
1468 if (err)
1469 goto mpol_out;
1470
1471 /*
1472 * Lock the VMAs before scanning for pages to migrate,
1473 * to ensure we don't miss a concurrently inserted page.
1474 */
1475 nr_failed = queue_pages_range(mm, start, end, nmask,
1476 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1477
1478 if (nr_failed < 0) {
1479 err = nr_failed;
1480 nr_failed = 0;
1481 } else {
1482 vma_iter_init(&vmi, mm, start);
1483 prev = vma_prev(&vmi);
1484 for_each_vma_range(vmi, vma, end) {
1485 err = mbind_range(&vmi, vma, &prev, start, end, new);
1486 if (err)
1487 break;
1488 }
1489 }
1490
1491 if (!err && !list_empty(&pagelist)) {
1492 /* Convert MPOL_DEFAULT's NULL to task or default policy */
1493 if (!new) {
1494 new = get_task_policy(current);
1495 mpol_get(new);
1496 }
1497 mmpol.pol = new;
1498 mmpol.ilx = 0;
1499
1500 /*
1501 * In the interleaved case, attempt to allocate on exactly the
1502 * targeted nodes, for the first VMA to be migrated; for later
1503 * VMAs, the nodes will still be interleaved from the targeted
1504 * nodemask, but one by one may be selected differently.
1505 */
1506 if (new->mode == MPOL_INTERLEAVE ||
1507 new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1508 struct folio *folio;
1509 unsigned int order;
1510 unsigned long addr = -EFAULT;
1511
1512 list_for_each_entry(folio, &pagelist, lru) {
1513 if (!folio_test_ksm(folio))
1514 break;
1515 }
1516 if (!list_entry_is_head(folio, &pagelist, lru)) {
1517 vma_iter_init(&vmi, mm, start);
1518 for_each_vma_range(vmi, vma, end) {
1519 addr = page_address_in_vma(folio,
1520 folio_page(folio, 0), vma);
1521 if (addr != -EFAULT)
1522 break;
1523 }
1524 }
1525 if (addr != -EFAULT) {
1526 order = folio_order(folio);
1527 /* We already know the pol, but not the ilx */
1528 mpol_cond_put(get_vma_policy(vma, addr, order,
1529 &mmpol.ilx));
1530 /* Set base from which to increment by index */
1531 mmpol.ilx -= folio->index >> order;
1532 }
1533 }
1534 }
1535
1536 mmap_write_unlock(mm);
1537
1538 if (!err && !list_empty(&pagelist)) {
1539 nr_failed |= migrate_pages(&pagelist,
1540 alloc_migration_target_by_mpol, NULL,
1541 (unsigned long)&mmpol, MIGRATE_SYNC,
1542 MR_MEMPOLICY_MBIND, NULL);
1543 }
1544
1545 if (nr_failed && (flags & MPOL_MF_STRICT))
1546 err = -EIO;
1547 if (!list_empty(&pagelist))
1548 putback_movable_pages(&pagelist);
1549 mpol_out:
1550 mpol_put(new);
1551 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1552 lru_cache_enable();
1553 return err;
1554 }
1555
1556 /*
1557 * User space interface with variable sized bitmaps for nodelists.
1558 */
1559 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1560 unsigned long maxnode)
1561 {
1562 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1563 int ret;
1564
1565 if (in_compat_syscall())
1566 ret = compat_get_bitmap(mask,
1567 (const compat_ulong_t __user *)nmask,
1568 maxnode);
1569 else
1570 ret = copy_from_user(mask, nmask,
1571 nlongs * sizeof(unsigned long));
1572
1573 if (ret)
1574 return -EFAULT;
1575
1576 if (maxnode % BITS_PER_LONG)
1577 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1578
1579 return 0;
1580 }
1581
1582 /* Copy a node mask from user space. */
1583 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1584 unsigned long maxnode)
1585 {
1586 --maxnode;
1587 nodes_clear(*nodes);
1588 if (maxnode == 0 || !nmask)
1589 return 0;
1590 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1591 return -EINVAL;
1592
1593 /*
1594 * When the user specifies more nodes than supported, just check
1595 * if the unsupported part is all zero, one word at a time,
1596 * starting at the end.
1597 */
1598 while (maxnode > MAX_NUMNODES) {
1599 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1600 unsigned long t;
1601
1602 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1603 return -EFAULT;
1604
1605 if (maxnode - bits >= MAX_NUMNODES) {
1606 maxnode -= bits;
1607 } else {
1608 maxnode = MAX_NUMNODES;
1609 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1610 }
1611 if (t)
1612 return -EINVAL;
1613 }
1614
1615 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1616 }
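/*
 * Example of the maxnode convention (matching the code above): a caller
 * passing maxnode = 3 says the user bitmap is 3 bits wide, but only bits
 * 0..1 are examined after the --maxnode adjustment, so a mask of 0x3
 * selects nodes 0 and 1.  Bits at or above MAX_NUMNODES are accepted
 * only if they are zero.
 */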
1617
1618 /* Copy a kernel node mask to user space */
1619 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1620 nodemask_t *nodes)
1621 {
1622 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1623 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1624 bool compat = in_compat_syscall();
1625
1626 if (compat)
1627 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1628
1629 if (copy > nbytes) {
1630 if (copy > PAGE_SIZE)
1631 return -EINVAL;
1632 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1633 return -EFAULT;
1634 copy = nbytes;
1635 maxnode = nr_node_ids;
1636 }
1637
1638 if (compat)
1639 return compat_put_bitmap((compat_ulong_t __user *)mask,
1640 nodes_addr(*nodes), maxnode);
1641
1642 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1643 }
1644
1645 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1646 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1647 {
1648 *flags = *mode & MPOL_MODE_FLAGS;
1649 *mode &= ~MPOL_MODE_FLAGS;
1650
1651 if ((unsigned int)(*mode) >= MPOL_MAX)
1652 return -EINVAL;
1653 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1654 return -EINVAL;
1655 if (*flags & MPOL_F_NUMA_BALANCING) {
1656 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1657 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1658 else
1659 return -EINVAL;
1660 }
1661 return 0;
1662 }
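/*
 * Example of the encoding split above (illustrative): userspace passes a
 * single mode argument such as
 *
 *	MPOL_INTERLEAVE | MPOL_F_STATIC_NODES
 *
 * and sanitize_mpol_flags() separates it into *mode = MPOL_INTERLEAVE and
 * *flags = MPOL_F_STATIC_NODES, rejecting combinations such as static and
 * relative nodes together, or MPOL_F_NUMA_BALANCING with anything other
 * than MPOL_BIND or MPOL_PREFERRED_MANY.
 */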
1663
1664 static long kernel_mbind(unsigned long start, unsigned long len,
1665 unsigned long mode, const unsigned long __user *nmask,
1666 unsigned long maxnode, unsigned int flags)
1667 {
1668 unsigned short mode_flags;
1669 nodemask_t nodes;
1670 int lmode = mode;
1671 int err;
1672
1673 start = untagged_addr(start);
1674 err = sanitize_mpol_flags(&lmode, &mode_flags);
1675 if (err)
1676 return err;
1677
1678 err = get_nodes(&nodes, nmask, maxnode);
1679 if (err)
1680 return err;
1681
1682 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1683 }
1684
1685 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1686 unsigned long, home_node, unsigned long, flags)
1687 {
1688 struct mm_struct *mm = current->mm;
1689 struct vm_area_struct *vma, *prev;
1690 struct mempolicy *new, *old;
1691 unsigned long end;
1692 int err = -ENOENT;
1693 VMA_ITERATOR(vmi, mm, start);
1694
1695 start = untagged_addr(start);
1696 if (start & ~PAGE_MASK)
1697 return -EINVAL;
1698 /*
1699 * flags is used for future extension if any.
1700 */
1701 if (flags != 0)
1702 return -EINVAL;
1703
1704 /*
1705 * Check home_node is online to avoid accessing uninitialized
1706 * NODE_DATA.
1707 */
1708 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1709 return -EINVAL;
1710
1711 len = PAGE_ALIGN(len);
1712 end = start + len;
1713
1714 if (end < start)
1715 return -EINVAL;
1716 if (end == start)
1717 return 0;
1718 mmap_write_lock(mm);
1719 prev = vma_prev(&vmi);
1720 for_each_vma_range(vmi, vma, end) {
1721 /*
1722 * If any vma in the range has a policy other than MPOL_BIND
1723 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1724 * the home node for vmas we already updated before.
1725 */
1726 old = vma_policy(vma);
1727 if (!old) {
1728 prev = vma;
1729 continue;
1730 }
1731 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1732 err = -EOPNOTSUPP;
1733 break;
1734 }
1735 new = mpol_dup(old);
1736 if (IS_ERR(new)) {
1737 err = PTR_ERR(new);
1738 break;
1739 }
1740
1741 vma_start_write(vma);
1742 new->home_node = home_node;
1743 err = mbind_range(&vmi, vma, &prev, start, end, new);
1744 mpol_put(new);
1745 if (err)
1746 break;
1747 }
1748 mmap_write_unlock(mm);
1749 return err;
1750 }
1751
1752 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1753 unsigned long, mode, const unsigned long __user *, nmask,
1754 unsigned long, maxnode, unsigned int, flags)
1755 {
1756 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1757 }
1758
1759 /* Set the process memory policy */
1760 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1761 unsigned long maxnode)
1762 {
1763 unsigned short mode_flags;
1764 nodemask_t nodes;
1765 int lmode = mode;
1766 int err;
1767
1768 err = sanitize_mpol_flags(&lmode, &mode_flags);
1769 if (err)
1770 return err;
1771
1772 err = get_nodes(&nodes, nmask, maxnode);
1773 if (err)
1774 return err;
1775
1776 return do_set_mempolicy(lmode, mode_flags, &nodes);
1777 }
1778
1779 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1780 unsigned long, maxnode)
1781 {
1782 return kernel_set_mempolicy(mode, nmask, maxnode);
1783 }
1784
1785 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1786 const unsigned long __user *old_nodes,
1787 const unsigned long __user *new_nodes)
1788 {
1789 struct mm_struct *mm = NULL;
1790 struct task_struct *task;
1791 nodemask_t task_nodes;
1792 int err;
1793 nodemask_t *old;
1794 nodemask_t *new;
1795 NODEMASK_SCRATCH(scratch);
1796
1797 if (!scratch)
1798 return -ENOMEM;
1799
1800 old = &scratch->mask1;
1801 new = &scratch->mask2;
1802
1803 err = get_nodes(old, old_nodes, maxnode);
1804 if (err)
1805 goto out;
1806
1807 err = get_nodes(new, new_nodes, maxnode);
1808 if (err)
1809 goto out;
1810
1811 /* Find the mm_struct */
1812 rcu_read_lock();
1813 task = pid ? find_task_by_vpid(pid) : current;
1814 if (!task) {
1815 rcu_read_unlock();
1816 err = -ESRCH;
1817 goto out;
1818 }
1819 get_task_struct(task);
1820
1821 err = -EINVAL;
1822
1823 /*
1824 * Check if this process has the right to modify the specified process.
1825 * Use the regular "ptrace_may_access()" checks.
1826 */
1827 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1828 rcu_read_unlock();
1829 err = -EPERM;
1830 goto out_put;
1831 }
1832 rcu_read_unlock();
1833
1834 task_nodes = cpuset_mems_allowed(task);
1835 /* Is the user allowed to access the target nodes? */
1836 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1837 err = -EPERM;
1838 goto out_put;
1839 }
1840
1841 task_nodes = cpuset_mems_allowed(current);
1842 nodes_and(*new, *new, task_nodes);
1843 if (nodes_empty(*new))
1844 goto out_put;
1845
1846 err = security_task_movememory(task);
1847 if (err)
1848 goto out_put;
1849
1850 mm = get_task_mm(task);
1851 put_task_struct(task);
1852
1853 if (!mm) {
1854 err = -EINVAL;
1855 goto out;
1856 }
1857
1858 err = do_migrate_pages(mm, old, new,
1859 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1860
1861 mmput(mm);
1862 out:
1863 NODEMASK_SCRATCH_FREE(scratch);
1864
1865 return err;
1866
1867 out_put:
1868 put_task_struct(task);
1869 goto out;
1870 }
1871
1872 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1873 const unsigned long __user *, old_nodes,
1874 const unsigned long __user *, new_nodes)
1875 {
1876 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1877 }
1878
1879 /* Retrieve NUMA policy */
1880 static int kernel_get_mempolicy(int __user *policy,
1881 unsigned long __user *nmask,
1882 unsigned long maxnode,
1883 unsigned long addr,
1884 unsigned long flags)
1885 {
1886 int err;
1887 int pval;
1888 nodemask_t nodes;
1889
1890 if (nmask != NULL && maxnode < nr_node_ids)
1891 return -EINVAL;
1892
1893 addr = untagged_addr(addr);
1894
1895 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1896
1897 if (err)
1898 return err;
1899
1900 if (policy && put_user(pval, policy))
1901 return -EFAULT;
1902
1903 if (nmask)
1904 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1905
1906 return err;
1907 }
1908
1909 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1910 unsigned long __user *, nmask, unsigned long, maxnode,
1911 unsigned long, addr, unsigned long, flags)
1912 {
1913 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1914 }
1915
1916 bool vma_migratable(struct vm_area_struct *vma)
1917 {
1918 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1919 return false;
1920
1921 /*
1922 * DAX device mappings require predictable access latency, so avoid
1923 * incurring periodic faults.
1924 */
1925 if (vma_is_dax(vma))
1926 return false;
1927
1928 if (is_vm_hugetlb_page(vma) &&
1929 !hugepage_migration_supported(hstate_vma(vma)))
1930 return false;
1931
1932 /*
1933 * Migration allocates pages in the highest zone. If we cannot
1934 * do so then migration (at least from node to node) is not
1935 * possible.
1936 */
1937 if (vma->vm_file &&
1938 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1939 < policy_zone)
1940 return false;
1941 return true;
1942 }
1943
1944 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1945 unsigned long addr, pgoff_t *ilx)
1946 {
1947 *ilx = 0;
1948 return (vma->vm_ops && vma->vm_ops->get_policy) ?
1949 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1950 }
1951
1952 /*
1953 * get_vma_policy(@vma, @addr, @order, @ilx)
1954 * @vma: virtual memory area whose policy is sought
1955 * @addr: address in @vma for shared policy lookup
1956 * @order: 0, or appropriate huge_page_order for interleaving
1957 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1958 * MPOL_WEIGHTED_INTERLEAVE
1959 *
1960 * Returns effective policy for a VMA at specified address.
1961 * Falls back to current->mempolicy or system default policy, as necessary.
1962 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1963 * count--added by the get_policy() vm_op, as appropriate--to protect against
1964 * freeing by another task. It is the caller's responsibility to free the
1965 * extra reference for shared policies.
1966 */
1967 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1968 unsigned long addr, int order, pgoff_t *ilx)
1969 {
1970 struct mempolicy *pol;
1971
1972 pol = __get_vma_policy(vma, addr, ilx);
1973 if (!pol)
1974 pol = get_task_policy(current);
1975 if (pol->mode == MPOL_INTERLEAVE ||
1976 pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1977 *ilx += vma->vm_pgoff >> order;
1978 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1979 }
1980 return pol;
1981 }
1982
1983 bool vma_policy_mof(struct vm_area_struct *vma)
1984 {
1985 struct mempolicy *pol;
1986
1987 if (vma->vm_ops && vma->vm_ops->get_policy) {
1988 bool ret = false;
1989 pgoff_t ilx; /* ignored here */
1990
1991 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1992 if (pol && (pol->flags & MPOL_F_MOF))
1993 ret = true;
1994 mpol_cond_put(pol);
1995
1996 return ret;
1997 }
1998
1999 pol = vma->vm_policy;
2000 if (!pol)
2001 pol = get_task_policy(current);
2002
2003 return pol->flags & MPOL_F_MOF;
2004 }
2005
2006 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
2007 {
2008 enum zone_type dynamic_policy_zone = policy_zone;
2009
2010 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2011
2012 /*
2013 	 * If policy->nodes has movable memory only, we apply the policy
2014 	 * only when gfp_zone(gfp) == ZONE_MOVABLE.
2015 	 *
2016 	 * policy->nodes intersects with node_states[N_MEMORY], so if the
2017 	 * following test fails it implies that policy->nodes contains
2018 	 * movable memory only.
2019 */
2020 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2021 dynamic_policy_zone = ZONE_MOVABLE;
2022
2023 return zone >= dynamic_policy_zone;
2024 }
2025
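/*
 * Dynamic weighted interleave for the current task: keep allocating from
 * current->il_prev until its remaining weight (current->il_weight) is
 * used up, then advance to the next node in the policy nodemask and
 * refill the weight from the global weight table.
 */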
2026 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
2027 {
2028 unsigned int node;
2029 unsigned int cpuset_mems_cookie;
2030
2031 retry:
2032 /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
2033 cpuset_mems_cookie = read_mems_allowed_begin();
2034 node = current->il_prev;
2035 if (!current->il_weight || !node_isset(node, policy->nodes)) {
2036 node = next_node_in(node, policy->nodes);
2037 if (read_mems_allowed_retry(cpuset_mems_cookie))
2038 goto retry;
2039 if (node == MAX_NUMNODES)
2040 return node;
2041 current->il_prev = node;
2042 current->il_weight = get_il_weight(node);
2043 }
2044 current->il_weight--;
2045 return node;
2046 }
2047
2048 /* Do dynamic interleaving for a process */
2049 static unsigned int interleave_nodes(struct mempolicy *policy)
2050 {
2051 unsigned int nid;
2052 unsigned int cpuset_mems_cookie;
2053
2054 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
2055 do {
2056 cpuset_mems_cookie = read_mems_allowed_begin();
2057 nid = next_node_in(current->il_prev, policy->nodes);
2058 } while (read_mems_allowed_retry(cpuset_mems_cookie));
2059
2060 if (nid < MAX_NUMNODES)
2061 current->il_prev = nid;
2062 return nid;
2063 }
2064
2065 /*
2066  * Depending on the memory policy, provide a node from which to allocate the
2067 * next slab entry.
2068 */
2069 unsigned int mempolicy_slab_node(void)
2070 {
2071 struct mempolicy *policy;
2072 int node = numa_mem_id();
2073
2074 if (!in_task())
2075 return node;
2076
2077 policy = current->mempolicy;
2078 if (!policy)
2079 return node;
2080
2081 switch (policy->mode) {
2082 case MPOL_PREFERRED:
2083 return first_node(policy->nodes);
2084
2085 case MPOL_INTERLEAVE:
2086 return interleave_nodes(policy);
2087
2088 case MPOL_WEIGHTED_INTERLEAVE:
2089 return weighted_interleave_nodes(policy);
2090
2091 case MPOL_BIND:
2092 case MPOL_PREFERRED_MANY:
2093 {
2094 struct zoneref *z;
2095
2096 /*
2097 * Follow bind policy behavior and start allocation at the
2098 * first node.
2099 */
2100 struct zonelist *zonelist;
2101 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2102 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2103 z = first_zones_zonelist(zonelist, highest_zoneidx,
2104 &policy->nodes);
2105 return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2106 }
2107 case MPOL_LOCAL:
2108 return node;
2109
2110 default:
2111 BUG();
2112 }
2113 }
2114
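/*
 * Snapshot @pol->nodes into @mask so callers can iterate over a stable
 * copy without worrying about a concurrent rebind; returns the number of
 * nodes in the snapshot.
 */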
2115 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2116 nodemask_t *mask)
2117 {
2118 /*
2119 * barrier stabilizes the nodemask locally so that it can be iterated
2120 * over safely without concern for changes. Allocators validate node
2121 * selection does not violate mems_allowed, so this is safe.
2122 */
2123 barrier();
2124 memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2125 barrier();
2126 return nodes_weight(*mask);
2127 }
2128
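/*
 * Static weighted interleave for interleave index @ilx: treat the node
 * weights as a repeating sequence and return the node that owns the
 * (@ilx % total_weight)'th slot. For example, with nodes {0,1} and
 * weights {2,1}, ilx 0-1 map to node 0 and ilx 2 maps to node 1.
 */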
2129 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2130 {
2131 struct weighted_interleave_state *state;
2132 nodemask_t nodemask;
2133 unsigned int target, nr_nodes;
2134 u8 *table = NULL;
2135 unsigned int weight_total = 0;
2136 u8 weight;
2137 int nid = 0;
2138
2139 nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2140 if (!nr_nodes)
2141 return numa_node_id();
2142
2143 rcu_read_lock();
2144
2145 state = rcu_dereference(wi_state);
2146 /* Uninitialized wi_state means we should assume all weights are 1 */
2147 if (state)
2148 table = state->iw_table;
2149
2150 /* calculate the total weight */
2151 for_each_node_mask(nid, nodemask)
2152 weight_total += table ? table[nid] : 1;
2153
2154 /* Calculate the node offset based on totals */
2155 target = ilx % weight_total;
2156 nid = first_node(nodemask);
2157 while (target) {
2158 /* detect system default usage */
2159 weight = table ? table[nid] : 1;
2160 if (target < weight)
2161 break;
2162 target -= weight;
2163 nid = next_node_in(nid, nodemask);
2164 }
2165 rcu_read_unlock();
2166 return nid;
2167 }
2168
2169 /*
2170 * Do static interleaving for interleave index @ilx. Returns the ilx'th
2171 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2172 * exceeds the number of present nodes.
2173 */
2174 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2175 {
2176 nodemask_t nodemask;
2177 unsigned int target, nnodes;
2178 int i;
2179 int nid;
2180
2181 nnodes = read_once_policy_nodemask(pol, &nodemask);
2182 if (!nnodes)
2183 return numa_node_id();
2184 target = ilx % nnodes;
2185 nid = first_node(nodemask);
2186 for (i = 0; i < target; i++)
2187 nid = next_node(nid, nodemask);
2188 return nid;
2189 }
2190
2191 /*
2192 * Return a nodemask representing a mempolicy for filtering nodes for
2193 * page allocation, together with preferred node id (or the input node id).
2194 */
2195 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2196 pgoff_t ilx, int *nid)
2197 {
2198 nodemask_t *nodemask = NULL;
2199
2200 switch (pol->mode) {
2201 case MPOL_PREFERRED:
2202 /* Override input node id */
2203 *nid = first_node(pol->nodes);
2204 break;
2205 case MPOL_PREFERRED_MANY:
2206 nodemask = &pol->nodes;
2207 if (pol->home_node != NUMA_NO_NODE)
2208 *nid = pol->home_node;
2209 break;
2210 case MPOL_BIND:
2211 /* Restrict to nodemask (but not on lower zones) */
2212 if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2213 cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2214 nodemask = &pol->nodes;
2215 if (pol->home_node != NUMA_NO_NODE)
2216 *nid = pol->home_node;
2217 /*
2218 * __GFP_THISNODE shouldn't even be used with the bind policy
2219 * because we might easily break the expectation to stay on the
2220 * requested node and not break the policy.
2221 */
2222 WARN_ON_ONCE(gfp & __GFP_THISNODE);
2223 break;
2224 case MPOL_INTERLEAVE:
2225 /* Override input node id */
2226 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2227 interleave_nodes(pol) : interleave_nid(pol, ilx);
2228 break;
2229 case MPOL_WEIGHTED_INTERLEAVE:
2230 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2231 weighted_interleave_nodes(pol) :
2232 weighted_interleave_nid(pol, ilx);
2233 break;
2234 }
2235
2236 return nodemask;
2237 }
2238
2239 #ifdef CONFIG_HUGETLBFS
2240 /*
2241 * huge_node(@vma, @addr, @gfp_flags, @mpol)
2242 * @vma: virtual memory area whose policy is sought
2243 * @addr: address in @vma for shared policy lookup and interleave policy
2244 * @gfp_flags: for requested zone
2245 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2246 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2247 *
2248 * Returns a nid suitable for a huge page allocation and a pointer
2249 * to the struct mempolicy for conditional unref after allocation.
2250 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2251 * to the mempolicy's @nodemask for filtering the zonelist.
2252 */
2253 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2254 struct mempolicy **mpol, nodemask_t **nodemask)
2255 {
2256 pgoff_t ilx;
2257 int nid;
2258
2259 nid = numa_node_id();
2260 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2261 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2262 return nid;
2263 }
2264
2265 /*
2266 * init_nodemask_of_mempolicy
2267 *
2268 * If the current task's mempolicy is "default" [NULL], return 'false'
2269 * to indicate default policy. Otherwise, extract the policy nodemask
2270 * for 'bind' or 'interleave' policy into the argument nodemask, or
2271 * initialize the argument nodemask to contain the single node for
2272 * 'preferred' or 'local' policy and return 'true' to indicate presence
2273 * of non-default mempolicy.
2274 *
2275 * We don't bother with reference counting the mempolicy [mpol_get/put]
2276  * because the current task is examining its own mempolicy and a task's
2277 * mempolicy is only ever changed by the task itself.
2278 *
2279 * N.B., it is the caller's responsibility to free a returned nodemask.
2280 */
2281 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2282 {
2283 struct mempolicy *mempolicy;
2284
2285 if (!(mask && current->mempolicy))
2286 return false;
2287
2288 task_lock(current);
2289 mempolicy = current->mempolicy;
2290 switch (mempolicy->mode) {
2291 case MPOL_PREFERRED:
2292 case MPOL_PREFERRED_MANY:
2293 case MPOL_BIND:
2294 case MPOL_INTERLEAVE:
2295 case MPOL_WEIGHTED_INTERLEAVE:
2296 *mask = mempolicy->nodes;
2297 break;
2298
2299 case MPOL_LOCAL:
2300 init_nodemask_of_node(mask, numa_node_id());
2301 break;
2302
2303 default:
2304 BUG();
2305 }
2306 task_unlock(current);
2307
2308 return true;
2309 }
2310 #endif
2311
2312 /*
2313 * mempolicy_in_oom_domain
2314 *
2315 * If tsk's mempolicy is "bind", check for intersection between mask and
2316 * the policy nodemask. Otherwise, return true for all other policies
2317 * including "interleave", as a tsk with "interleave" policy may have
2318 * memory allocated from all nodes in system.
2319 *
2320 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2321 */
2322 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2323 const nodemask_t *mask)
2324 {
2325 struct mempolicy *mempolicy;
2326 bool ret = true;
2327
2328 if (!mask)
2329 return ret;
2330
2331 task_lock(tsk);
2332 mempolicy = tsk->mempolicy;
2333 if (mempolicy && mempolicy->mode == MPOL_BIND)
2334 ret = nodes_intersects(mempolicy->nodes, *mask);
2335 task_unlock(tsk);
2336
2337 return ret;
2338 }
2339
2340 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2341 int nid, nodemask_t *nodemask)
2342 {
2343 struct page *page;
2344 gfp_t preferred_gfp;
2345
2346 /*
2347 * This is a two pass approach. The first pass will only try the
2348 * preferred nodes but skip the direct reclaim and allow the
2349 * allocation to fail, while the second pass will try all the
2350 * nodes in system.
2351 */
2352 preferred_gfp = gfp | __GFP_NOWARN;
2353 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2354 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2355 if (!page)
2356 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2357
2358 return page;
2359 }
2360
2361 /**
2362 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2363 * @gfp: GFP flags.
2364 * @order: Order of the page allocation.
2365 * @pol: Pointer to the NUMA mempolicy.
2366 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2367  * @nid: Preferred node (usually numa_node_id() but @pol may override it).
2368 *
2369 * Return: The page on success or NULL if allocation fails.
2370 */
2371 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2372 struct mempolicy *pol, pgoff_t ilx, int nid)
2373 {
2374 nodemask_t *nodemask;
2375 struct page *page;
2376
2377 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2378
2379 if (pol->mode == MPOL_PREFERRED_MANY)
2380 return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2381
2382 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2383 /* filter "hugepage" allocation, unless from alloc_pages() */
2384 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2385 /*
2386 * For hugepage allocation and non-interleave policy which
2387 * allows the current node (or other explicitly preferred
2388 * node) we only try to allocate from the current/preferred
2389 * node and don't fall back to other nodes, as the cost of
2390 * remote accesses would likely offset THP benefits.
2391 *
2392 * If the policy is interleave or does not allow the current
2393 * node in its nodemask, we allocate the standard way.
2394 */
2395 if (pol->mode != MPOL_INTERLEAVE &&
2396 pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2397 (!nodemask || node_isset(nid, *nodemask))) {
2398 /*
2399 * First, try to allocate THP only on local node, but
2400 * don't reclaim unnecessarily, just compact.
2401 */
2402 page = __alloc_frozen_pages_noprof(
2403 gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2404 nid, NULL);
2405 if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2406 return page;
2407 /*
2408 * If hugepage allocations are configured to always
2409 * synchronous compact or the vma has been madvised
2410 * to prefer hugepage backing, retry allowing remote
2411 * memory with both reclaim and compact as well.
2412 */
2413 }
2414 }
2415
2416 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2417
2418 if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2419 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2420 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2421 if (static_branch_likely(&vm_numa_stat_key) &&
2422 page_to_nid(page) == nid) {
2423 preempt_disable();
2424 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2425 preempt_enable();
2426 }
2427 }
2428
2429 return page;
2430 }
2431
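/* Folio variant of alloc_pages_mpol(): adds __GFP_COMP and a refcount. */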
2432 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2433 struct mempolicy *pol, pgoff_t ilx, int nid)
2434 {
2435 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2436 ilx, nid);
2437 if (!page)
2438 return NULL;
2439
2440 set_page_refcounted(page);
2441 return page_rmappable_folio(page);
2442 }
2443
2444 /**
2445 * vma_alloc_folio - Allocate a folio for a VMA.
2446 * @gfp: GFP flags.
2447 * @order: Order of the folio.
2448 * @vma: Pointer to VMA.
2449 * @addr: Virtual address of the allocation. Must be inside @vma.
2450 *
2451 * Allocate a folio for a specific address in @vma, using the appropriate
2452 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
2453 * VMA to prevent it from going away. Should be used for all allocations
2454 * for folios that will be mapped into user space, excepting hugetlbfs, and
2455 * excepting where direct use of folio_alloc_mpol() is more appropriate.
2456 *
2457 * Return: The folio on success or NULL if allocation fails.
2458 */
2459 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2460 unsigned long addr)
2461 {
2462 struct mempolicy *pol;
2463 pgoff_t ilx;
2464 struct folio *folio;
2465
2466 if (vma->vm_flags & VM_DROPPABLE)
2467 gfp |= __GFP_NOWARN;
2468
2469 pol = get_vma_policy(vma, addr, order, &ilx);
2470 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2471 mpol_cond_put(pol);
2472 return folio;
2473 }
2474 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2475
2476 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2477 {
2478 struct mempolicy *pol = &default_policy;
2479
2480 /*
2481 * No reference counting needed for current->mempolicy
2482 * nor system default_policy
2483 */
2484 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2485 pol = get_task_policy(current);
2486
2487 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2488 numa_node_id());
2489 }
2490
2491 /**
2492 * alloc_pages - Allocate pages.
2493 * @gfp: GFP flags.
2494 * @order: Power of two of number of pages to allocate.
2495 *
2496 * Allocate 1 << @order contiguous pages. The physical address of the
2497 * first page is naturally aligned (eg an order-3 allocation will be aligned
2498 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2499 * process is honoured when in process context.
2500 *
2501 * Context: Can be called from any context, providing the appropriate GFP
2502 * flags are used.
2503 * Return: The page on success or NULL if allocation fails.
2504 */
2505 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2506 {
2507 struct page *page = alloc_frozen_pages_noprof(gfp, order);
2508
2509 if (page)
2510 set_page_refcounted(page);
2511 return page;
2512 }
2513 EXPORT_SYMBOL(alloc_pages_noprof);
2514
2515 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2516 {
2517 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2518 }
2519 EXPORT_SYMBOL(folio_alloc_noprof);
2520
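/*
 * Bulk-allocate @nr_pages for MPOL_INTERLEAVE by splitting the request
 * evenly across the policy nodes; the first (@nr_pages % nodes) nodes
 * visited get one extra page, e.g. 10 pages over 3 nodes become 4 + 3 + 3.
 */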
2521 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2522 struct mempolicy *pol, unsigned long nr_pages,
2523 struct page **page_array)
2524 {
2525 int nodes;
2526 unsigned long nr_pages_per_node;
2527 int delta;
2528 int i;
2529 unsigned long nr_allocated;
2530 unsigned long total_allocated = 0;
2531
2532 nodes = nodes_weight(pol->nodes);
2533 nr_pages_per_node = nr_pages / nodes;
2534 delta = nr_pages - nodes * nr_pages_per_node;
2535
2536 for (i = 0; i < nodes; i++) {
2537 if (delta) {
2538 nr_allocated = alloc_pages_bulk_noprof(gfp,
2539 interleave_nodes(pol), NULL,
2540 nr_pages_per_node + 1,
2541 page_array);
2542 delta--;
2543 } else {
2544 nr_allocated = alloc_pages_bulk_noprof(gfp,
2545 interleave_nodes(pol), NULL,
2546 nr_pages_per_node, page_array);
2547 }
2548
2549 page_array += nr_allocated;
2550 total_allocated += nr_allocated;
2551 }
2552
2553 return total_allocated;
2554 }
2555
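/*
 * Bulk-allocate @nr_pages for MPOL_WEIGHTED_INTERLEAVE. First drain the
 * task's in-progress node/weight (il_prev/il_weight), then distribute the
 * remainder as full weighted rounds plus one partial round, and finally
 * record where ordinary weighted interleave should resume.
 */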
2556 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2557 struct mempolicy *pol, unsigned long nr_pages,
2558 struct page **page_array)
2559 {
2560 struct weighted_interleave_state *state;
2561 struct task_struct *me = current;
2562 unsigned int cpuset_mems_cookie;
2563 unsigned long total_allocated = 0;
2564 unsigned long nr_allocated = 0;
2565 unsigned long rounds;
2566 unsigned long node_pages, delta;
2567 u8 *weights, weight;
2568 unsigned int weight_total = 0;
2569 unsigned long rem_pages = nr_pages;
2570 nodemask_t nodes;
2571 int nnodes, node;
2572 int resume_node = MAX_NUMNODES - 1;
2573 u8 resume_weight = 0;
2574 int prev_node;
2575 int i;
2576
2577 if (!nr_pages)
2578 return 0;
2579
2580 /* read the nodes onto the stack, retry if done during rebind */
2581 do {
2582 cpuset_mems_cookie = read_mems_allowed_begin();
2583 nnodes = read_once_policy_nodemask(pol, &nodes);
2584 } while (read_mems_allowed_retry(cpuset_mems_cookie));
2585
2586 /* if the nodemask has become invalid, we cannot do anything */
2587 if (!nnodes)
2588 return 0;
2589
2590 /* Continue allocating from most recent node and adjust the nr_pages */
2591 node = me->il_prev;
2592 weight = me->il_weight;
2593 if (weight && node_isset(node, nodes)) {
2594 node_pages = min(rem_pages, weight);
2595 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2596 page_array);
2597 page_array += nr_allocated;
2598 total_allocated += nr_allocated;
2599 /* if that's all the pages, no need to interleave */
2600 if (rem_pages <= weight) {
2601 me->il_weight -= rem_pages;
2602 return total_allocated;
2603 }
2604 /* Otherwise we adjust remaining pages, continue from there */
2605 rem_pages -= weight;
2606 }
2607 /* clear active weight in case of an allocation failure */
2608 me->il_weight = 0;
2609 prev_node = node;
2610
2611 /* create a local copy of node weights to operate on outside rcu */
2612 weights = kzalloc(nr_node_ids, GFP_KERNEL);
2613 if (!weights)
2614 return total_allocated;
2615
2616 rcu_read_lock();
2617 state = rcu_dereference(wi_state);
2618 if (state) {
2619 memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
2620 rcu_read_unlock();
2621 } else {
2622 rcu_read_unlock();
2623 for (i = 0; i < nr_node_ids; i++)
2624 weights[i] = 1;
2625 }
2626
2627 /* calculate total, detect system default usage */
2628 for_each_node_mask(node, nodes)
2629 weight_total += weights[node];
2630
2631 /*
2632 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2633 * Track which node weighted interleave should resume from.
2634 *
2635 * if (rounds > 0) and (delta == 0), resume_node will always be
2636 * the node following prev_node and its weight.
2637 */
2638 rounds = rem_pages / weight_total;
2639 delta = rem_pages % weight_total;
2640 resume_node = next_node_in(prev_node, nodes);
2641 resume_weight = weights[resume_node];
2642 for (i = 0; i < nnodes; i++) {
2643 node = next_node_in(prev_node, nodes);
2644 weight = weights[node];
2645 node_pages = weight * rounds;
2646 /* If a delta exists, add this node's portion of the delta */
2647 if (delta > weight) {
2648 node_pages += weight;
2649 delta -= weight;
2650 } else if (delta) {
2651 /* when delta is depleted, resume from that node */
2652 node_pages += delta;
2653 resume_node = node;
2654 resume_weight = weight - delta;
2655 delta = 0;
2656 }
2657 /* node_pages can be 0 if an allocation fails and rounds == 0 */
2658 if (!node_pages)
2659 break;
2660 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2661 page_array);
2662 page_array += nr_allocated;
2663 total_allocated += nr_allocated;
2664 if (total_allocated == nr_pages)
2665 break;
2666 prev_node = node;
2667 }
2668 me->il_prev = resume_node;
2669 me->il_weight = resume_weight;
2670 kfree(weights);
2671 return total_allocated;
2672 }
2673
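/*
 * Bulk allocation for MPOL_PREFERRED_MANY: try the preferred nodemask
 * first without direct reclaim, then fall back to all nodes for whatever
 * is still missing, mirroring alloc_pages_preferred_many().
 */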
2674 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2675 struct mempolicy *pol, unsigned long nr_pages,
2676 struct page **page_array)
2677 {
2678 gfp_t preferred_gfp;
2679 unsigned long nr_allocated = 0;
2680
2681 preferred_gfp = gfp | __GFP_NOWARN;
2682 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2683
2684 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2685 nr_pages, page_array);
2686
2687 if (nr_allocated < nr_pages)
2688 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2689 nr_pages - nr_allocated,
2690 page_array + nr_allocated);
2691 return nr_allocated;
2692 }
2693
2694 /* Bulk page allocation and the memory policy should be considered
2695  * together in some situations, such as vmalloc.
2696  *
2697  * Doing so can accelerate memory allocation, especially for
2698  * interleaved allocations.
2699 */
2700 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2701 unsigned long nr_pages, struct page **page_array)
2702 {
2703 struct mempolicy *pol = &default_policy;
2704 nodemask_t *nodemask;
2705 int nid;
2706
2707 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2708 pol = get_task_policy(current);
2709
2710 if (pol->mode == MPOL_INTERLEAVE)
2711 return alloc_pages_bulk_interleave(gfp, pol,
2712 nr_pages, page_array);
2713
2714 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2715 return alloc_pages_bulk_weighted_interleave(
2716 gfp, pol, nr_pages, page_array);
2717
2718 if (pol->mode == MPOL_PREFERRED_MANY)
2719 return alloc_pages_bulk_preferred_many(gfp,
2720 numa_node_id(), pol, nr_pages, page_array);
2721
2722 nid = numa_node_id();
2723 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2724 return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2725 nr_pages, page_array);
2726 }
2727
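/* Give @dst its own reference-counted copy of @src's policy, if any. */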
2728 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2729 {
2730 struct mempolicy *pol = mpol_dup(src->vm_policy);
2731
2732 if (IS_ERR(pol))
2733 return PTR_ERR(pol);
2734 dst->vm_policy = pol;
2735 return 0;
2736 }
2737
2738 /*
2739 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2740  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2741 * with the mems_allowed returned by cpuset_mems_allowed(). This
2742 * keeps mempolicies cpuset relative after its cpuset moves. See
2743 * further kernel/cpuset.c update_nodemask().
2744 *
2745  * current's mempolicy may be rebound by another task (the task that changes
2746  * the cpuset's mems), so we needn't do rebind work for the current task.
2747 */
2748
2749 /* Slow path of a mempolicy duplicate */
2750 struct mempolicy *__mpol_dup(struct mempolicy *old)
2751 {
2752 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2753
2754 if (!new)
2755 return ERR_PTR(-ENOMEM);
2756
2757 /* task's mempolicy is protected by alloc_lock */
2758 if (old == current->mempolicy) {
2759 task_lock(current);
2760 *new = *old;
2761 task_unlock(current);
2762 } else
2763 *new = *old;
2764
2765 if (current_cpuset_is_being_rebound()) {
2766 nodemask_t mems = cpuset_mems_allowed(current);
2767 mpol_rebind_policy(new, &mems);
2768 }
2769 atomic_set(&new->refcnt, 1);
2770 return new;
2771 }
2772
2773 /* Slow path of a mempolicy comparison */
2774 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2775 {
2776 if (!a || !b)
2777 return false;
2778 if (a->mode != b->mode)
2779 return false;
2780 if (a->flags != b->flags)
2781 return false;
2782 if (a->home_node != b->home_node)
2783 return false;
2784 if (mpol_store_user_nodemask(a))
2785 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2786 return false;
2787
2788 switch (a->mode) {
2789 case MPOL_BIND:
2790 case MPOL_INTERLEAVE:
2791 case MPOL_PREFERRED:
2792 case MPOL_PREFERRED_MANY:
2793 case MPOL_WEIGHTED_INTERLEAVE:
2794 return !!nodes_equal(a->nodes, b->nodes);
2795 case MPOL_LOCAL:
2796 return true;
2797 default:
2798 BUG();
2799 return false;
2800 }
2801 }
2802
2803 /*
2804 * Shared memory backing store policy support.
2805 *
2806 * Remember policies even when nobody has shared memory mapped.
2807 * The policies are kept in Red-Black tree linked from the inode.
2808 * They are protected by the sp->lock rwlock, which should be held
2809 * for any accesses to the tree.
2810 */
2811
2812 /*
2813 * lookup first element intersecting start-end. Caller holds sp->lock for
2814 * reading or for writing
2815 */
2816 static struct sp_node *sp_lookup(struct shared_policy *sp,
2817 pgoff_t start, pgoff_t end)
2818 {
2819 struct rb_node *n = sp->root.rb_node;
2820
2821 while (n) {
2822 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2823
2824 if (start >= p->end)
2825 n = n->rb_right;
2826 else if (end <= p->start)
2827 n = n->rb_left;
2828 else
2829 break;
2830 }
2831 if (!n)
2832 return NULL;
2833 for (;;) {
2834 struct sp_node *w = NULL;
2835 struct rb_node *prev = rb_prev(n);
2836 if (!prev)
2837 break;
2838 w = rb_entry(prev, struct sp_node, nd);
2839 if (w->end <= start)
2840 break;
2841 n = prev;
2842 }
2843 return rb_entry(n, struct sp_node, nd);
2844 }
2845
2846 /*
2847 * Insert a new shared policy into the list. Caller holds sp->lock for
2848 * writing.
2849 */
2850 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2851 {
2852 struct rb_node **p = &sp->root.rb_node;
2853 struct rb_node *parent = NULL;
2854 struct sp_node *nd;
2855
2856 while (*p) {
2857 parent = *p;
2858 nd = rb_entry(parent, struct sp_node, nd);
2859 if (new->start < nd->start)
2860 p = &(*p)->rb_left;
2861 else if (new->end > nd->end)
2862 p = &(*p)->rb_right;
2863 else
2864 BUG();
2865 }
2866 rb_link_node(&new->nd, parent, p);
2867 rb_insert_color(&new->nd, &sp->root);
2868 }
2869
2870 /* Find shared policy intersecting idx */
2871 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2872 pgoff_t idx)
2873 {
2874 struct mempolicy *pol = NULL;
2875 struct sp_node *sn;
2876
2877 if (!sp->root.rb_node)
2878 return NULL;
2879 read_lock(&sp->lock);
2880 sn = sp_lookup(sp, idx, idx+1);
2881 if (sn) {
2882 mpol_get(sn->policy);
2883 pol = sn->policy;
2884 }
2885 read_unlock(&sp->lock);
2886 return pol;
2887 }
2888
2889 static void sp_free(struct sp_node *n)
2890 {
2891 mpol_put(n->policy);
2892 kmem_cache_free(sn_cache, n);
2893 }
2894
2895 /**
2896 * mpol_misplaced - check whether current folio node is valid in policy
2897 *
2898 * @folio: folio to be checked
2899 * @vmf: structure describing the fault
2900 * @addr: virtual address in @vma for shared policy lookup and interleave policy
2901 *
2902  * Lookup current policy node id for vma, addr and "compare to" folio's
2903 * node id. Policy determination "mimics" alloc_page_vma().
2904 * Called from fault path where we know the vma and faulting address.
2905 *
2906 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2907 * policy, or a suitable node ID to allocate a replacement folio from.
2908 */
2909 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2910 unsigned long addr)
2911 {
2912 struct mempolicy *pol;
2913 pgoff_t ilx;
2914 struct zoneref *z;
2915 int curnid = folio_nid(folio);
2916 struct vm_area_struct *vma = vmf->vma;
2917 int thiscpu = raw_smp_processor_id();
2918 int thisnid = numa_node_id();
2919 int polnid = NUMA_NO_NODE;
2920 int ret = NUMA_NO_NODE;
2921
2922 /*
2923 * Make sure ptl is held so that we don't preempt and we
2924 * have a stable smp processor id
2925 */
2926 lockdep_assert_held(vmf->ptl);
2927 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2928 if (!(pol->flags & MPOL_F_MOF))
2929 goto out;
2930
2931 switch (pol->mode) {
2932 case MPOL_INTERLEAVE:
2933 polnid = interleave_nid(pol, ilx);
2934 break;
2935
2936 case MPOL_WEIGHTED_INTERLEAVE:
2937 polnid = weighted_interleave_nid(pol, ilx);
2938 break;
2939
2940 case MPOL_PREFERRED:
2941 if (node_isset(curnid, pol->nodes))
2942 goto out;
2943 polnid = first_node(pol->nodes);
2944 break;
2945
2946 case MPOL_LOCAL:
2947 polnid = numa_node_id();
2948 break;
2949
2950 case MPOL_BIND:
2951 case MPOL_PREFERRED_MANY:
2952 /*
2953 * Even though MPOL_PREFERRED_MANY can allocate pages outside
2954 * policy nodemask we don't allow numa migration to nodes
2955 * outside policy nodemask for now. This is done so that if we
2956 * want demotion to slow memory to happen, before allocating
2957 * from some DRAM node say 'x', we will end up using a
2958 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
2959 * we should not promote to node 'x' from slow memory node.
2960 */
2961 if (pol->flags & MPOL_F_MORON) {
2962 /*
2963 * Optimize placement among multiple nodes
2964 * via NUMA balancing
2965 */
2966 if (node_isset(thisnid, pol->nodes))
2967 break;
2968 goto out;
2969 }
2970
2971 /*
2972 * use current page if in policy nodemask,
2973 * else select nearest allowed node, if any.
2974 * If no allowed nodes, use current [!misplaced].
2975 */
2976 if (node_isset(curnid, pol->nodes))
2977 goto out;
2978 z = first_zones_zonelist(
2979 node_zonelist(thisnid, GFP_HIGHUSER),
2980 gfp_zone(GFP_HIGHUSER),
2981 &pol->nodes);
2982 polnid = zonelist_node_idx(z);
2983 break;
2984
2985 default:
2986 BUG();
2987 }
2988
2989 /* Migrate the folio towards the node whose CPU is referencing it */
2990 if (pol->flags & MPOL_F_MORON) {
2991 polnid = thisnid;
2992
2993 if (!should_numa_migrate_memory(current, folio, curnid,
2994 thiscpu))
2995 goto out;
2996 }
2997
2998 if (curnid != polnid)
2999 ret = polnid;
3000 out:
3001 mpol_cond_put(pol);
3002
3003 return ret;
3004 }
3005
3006 /*
3007 * Drop the (possibly final) reference to task->mempolicy. It needs to be
3008 * dropped after task->mempolicy is set to NULL so that any allocation done as
3009 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3010 * policy.
3011 */
3012 void mpol_put_task_policy(struct task_struct *task)
3013 {
3014 struct mempolicy *pol;
3015
3016 task_lock(task);
3017 pol = task->mempolicy;
3018 task->mempolicy = NULL;
3019 task_unlock(task);
3020 mpol_put(pol);
3021 }
3022
3023 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
3024 {
3025 rb_erase(&n->nd, &sp->root);
3026 sp_free(n);
3027 }
3028
3029 static void sp_node_init(struct sp_node *node, unsigned long start,
3030 unsigned long end, struct mempolicy *pol)
3031 {
3032 node->start = start;
3033 node->end = end;
3034 node->policy = pol;
3035 }
3036
3037 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3038 struct mempolicy *pol)
3039 {
3040 struct sp_node *n;
3041 struct mempolicy *newpol;
3042
3043 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3044 if (!n)
3045 return NULL;
3046
3047 newpol = mpol_dup(pol);
3048 if (IS_ERR(newpol)) {
3049 kmem_cache_free(sn_cache, n);
3050 return NULL;
3051 }
3052 newpol->flags |= MPOL_F_SHARED;
3053 sp_node_init(n, start, end, newpol);
3054
3055 return n;
3056 }
3057
3058 /* Replace a policy range. */
3059 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
3060 pgoff_t end, struct sp_node *new)
3061 {
3062 struct sp_node *n;
3063 struct sp_node *n_new = NULL;
3064 struct mempolicy *mpol_new = NULL;
3065 int ret = 0;
3066
3067 restart:
3068 write_lock(&sp->lock);
3069 n = sp_lookup(sp, start, end);
3070 /* Take care of old policies in the same range. */
3071 while (n && n->start < end) {
3072 struct rb_node *next = rb_next(&n->nd);
3073 if (n->start >= start) {
3074 if (n->end <= end)
3075 sp_delete(sp, n);
3076 else
3077 n->start = end;
3078 } else {
3079 /* Old policy spanning whole new range. */
3080 if (n->end > end) {
3081 if (!n_new)
3082 goto alloc_new;
3083
3084 *mpol_new = *n->policy;
3085 atomic_set(&mpol_new->refcnt, 1);
3086 sp_node_init(n_new, end, n->end, mpol_new);
3087 n->end = start;
3088 sp_insert(sp, n_new);
3089 n_new = NULL;
3090 mpol_new = NULL;
3091 break;
3092 } else
3093 n->end = start;
3094 }
3095 if (!next)
3096 break;
3097 n = rb_entry(next, struct sp_node, nd);
3098 }
3099 if (new)
3100 sp_insert(sp, new);
3101 write_unlock(&sp->lock);
3102 ret = 0;
3103
3104 err_out:
3105 if (mpol_new)
3106 mpol_put(mpol_new);
3107 if (n_new)
3108 kmem_cache_free(sn_cache, n_new);
3109
3110 return ret;
3111
3112 alloc_new:
3113 write_unlock(&sp->lock);
3114 ret = -ENOMEM;
3115 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3116 if (!n_new)
3117 goto err_out;
3118 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3119 if (!mpol_new)
3120 goto err_out;
3121 atomic_set(&mpol_new->refcnt, 1);
3122 goto restart;
3123 }
3124
3125 /**
3126 * mpol_shared_policy_init - initialize shared policy for inode
3127 * @sp: pointer to inode shared policy
3128 * @mpol: struct mempolicy to install
3129 *
3130 * Install non-NULL @mpol in inode's shared policy rb-tree.
3131 * On entry, the current task has a reference on a non-NULL @mpol.
3132 * This must be released on exit.
3133  * This is called during get_inode() calls, so we can use GFP_KERNEL.
3134 */
3135 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3136 {
3137 int ret;
3138
3139 sp->root = RB_ROOT; /* empty tree == default mempolicy */
3140 rwlock_init(&sp->lock);
3141
3142 if (mpol) {
3143 struct sp_node *sn;
3144 struct mempolicy *npol;
3145 NODEMASK_SCRATCH(scratch);
3146
3147 if (!scratch)
3148 goto put_mpol;
3149
3150 /* contextualize the tmpfs mount point mempolicy to this file */
3151 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3152 if (IS_ERR(npol))
3153 goto free_scratch; /* no valid nodemask intersection */
3154
3155 task_lock(current);
3156 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3157 task_unlock(current);
3158 if (ret)
3159 goto put_npol;
3160
3161 /* alloc node covering entire file; adds ref to file's npol */
3162 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3163 if (sn)
3164 sp_insert(sp, sn);
3165 put_npol:
3166 mpol_put(npol); /* drop initial ref on file's npol */
3167 free_scratch:
3168 NODEMASK_SCRATCH_FREE(scratch);
3169 put_mpol:
3170 mpol_put(mpol); /* drop our incoming ref on sb mpol */
3171 }
3172 }
3173
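/*
 * Install @pol (NULL to clear) as the shared policy for the file range
 * covered by @vma in the shared policy tree @sp.
 */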
3174 int mpol_set_shared_policy(struct shared_policy *sp,
3175 struct vm_area_struct *vma, struct mempolicy *pol)
3176 {
3177 int err;
3178 struct sp_node *new = NULL;
3179 unsigned long sz = vma_pages(vma);
3180
3181 if (pol) {
3182 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3183 if (!new)
3184 return -ENOMEM;
3185 }
3186 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3187 if (err && new)
3188 sp_free(new);
3189 return err;
3190 }
3191
3192 /* Free a backing policy store on inode delete. */
3193 void mpol_free_shared_policy(struct shared_policy *sp)
3194 {
3195 struct sp_node *n;
3196 struct rb_node *next;
3197
3198 if (!sp->root.rb_node)
3199 return;
3200 write_lock(&sp->lock);
3201 next = rb_first(&sp->root);
3202 while (next) {
3203 n = rb_entry(next, struct sp_node, nd);
3204 next = rb_next(&n->nd);
3205 sp_delete(sp, n);
3206 }
3207 write_unlock(&sp->lock);
3208 }
3209
3210 #ifdef CONFIG_NUMA_BALANCING
3211 static int __initdata numabalancing_override;
3212
3213 static void __init check_numabalancing_enable(void)
3214 {
3215 bool numabalancing_default = false;
3216
3217 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3218 numabalancing_default = true;
3219
3220 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3221 if (numabalancing_override)
3222 set_numabalancing_state(numabalancing_override == 1);
3223
3224 if (num_online_nodes() > 1 && !numabalancing_override) {
3225 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3226 numabalancing_default ? "Enabling" : "Disabling");
3227 set_numabalancing_state(numabalancing_default);
3228 }
3229 }
3230
3231 static int __init setup_numabalancing(char *str)
3232 {
3233 int ret = 0;
3234 if (!str)
3235 goto out;
3236
3237 if (!strcmp(str, "enable")) {
3238 numabalancing_override = 1;
3239 ret = 1;
3240 } else if (!strcmp(str, "disable")) {
3241 numabalancing_override = -1;
3242 ret = 1;
3243 }
3244 out:
3245 if (!ret)
3246 pr_warn("Unable to parse numa_balancing=\n");
3247
3248 return ret;
3249 }
3250 __setup("numa_balancing=", setup_numabalancing);
3251 #else
3252 static inline void __init check_numabalancing_enable(void)
3253 {
3254 }
3255 #endif /* CONFIG_NUMA_BALANCING */
3256
3257 void __init numa_policy_init(void)
3258 {
3259 nodemask_t interleave_nodes;
3260 unsigned long largest = 0;
3261 int nid, prefer = 0;
3262
3263 policy_cache = kmem_cache_create("numa_policy",
3264 sizeof(struct mempolicy),
3265 0, SLAB_PANIC, NULL);
3266
3267 sn_cache = kmem_cache_create("shared_policy_node",
3268 sizeof(struct sp_node),
3269 0, SLAB_PANIC, NULL);
3270
3271 for_each_node(nid) {
3272 preferred_node_policy[nid] = (struct mempolicy) {
3273 .refcnt = ATOMIC_INIT(1),
3274 .mode = MPOL_PREFERRED,
3275 .flags = MPOL_F_MOF | MPOL_F_MORON,
3276 .nodes = nodemask_of_node(nid),
3277 };
3278 }
3279
3280 /*
3281 * Set interleaving policy for system init. Interleaving is only
3282 * enabled across suitably sized nodes (default is >= 16MB), or
3283 * fall back to the largest node if they're all smaller.
3284 */
3285 nodes_clear(interleave_nodes);
3286 for_each_node_state(nid, N_MEMORY) {
3287 unsigned long total_pages = node_present_pages(nid);
3288
3289 /* Preserve the largest node */
3290 if (largest < total_pages) {
3291 largest = total_pages;
3292 prefer = nid;
3293 }
3294
3295 /* Interleave this node? */
3296 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3297 node_set(nid, interleave_nodes);
3298 }
3299
3300 /* All too small, use the largest */
3301 if (unlikely(nodes_empty(interleave_nodes)))
3302 node_set(prefer, interleave_nodes);
3303
3304 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3305 pr_err("%s: interleaving failed\n", __func__);
3306
3307 check_numabalancing_enable();
3308 }
3309
3310 /* Reset policy of current process to default */
3311 void numa_default_policy(void)
3312 {
3313 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3314 }
3315
3316 /*
3317 * Parse and format mempolicy from/to strings
3318 */
3319 static const char * const policy_modes[] =
3320 {
3321 [MPOL_DEFAULT] = "default",
3322 [MPOL_PREFERRED] = "prefer",
3323 [MPOL_BIND] = "bind",
3324 [MPOL_INTERLEAVE] = "interleave",
3325 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3326 [MPOL_LOCAL] = "local",
3327 [MPOL_PREFERRED_MANY] = "prefer (many)",
3328 };
3329
3330 #ifdef CONFIG_TMPFS
3331 /**
3332 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3333 * @str: string containing mempolicy to parse
3334 * @mpol: pointer to struct mempolicy pointer, returned on success.
3335 *
3336 * Format of input:
3337  * <mode>[=<flags>][:<nodelist>], e.g. "interleave:0-3" or "bind=static:0,2"
3338 *
3339 * Return: %0 on success, else %1
3340 */
3341 int mpol_parse_str(char *str, struct mempolicy **mpol)
3342 {
3343 struct mempolicy *new = NULL;
3344 unsigned short mode_flags;
3345 nodemask_t nodes;
3346 char *nodelist = strchr(str, ':');
3347 char *flags = strchr(str, '=');
3348 int err = 1, mode;
3349
3350 if (flags)
3351 *flags++ = '\0'; /* terminate mode string */
3352
3353 if (nodelist) {
3354 /* NUL-terminate mode or flags string */
3355 *nodelist++ = '\0';
3356 if (nodelist_parse(nodelist, nodes))
3357 goto out;
3358 if (!nodes_subset(nodes, node_states[N_MEMORY]))
3359 goto out;
3360 } else
3361 nodes_clear(nodes);
3362
3363 mode = match_string(policy_modes, MPOL_MAX, str);
3364 if (mode < 0)
3365 goto out;
3366
3367 switch (mode) {
3368 case MPOL_PREFERRED:
3369 /*
3370 		 * Insist on a nodelist of one node only: later we use
3371 		 * first_node(nodes) to grab a single node, so the nodelist
3372 		 * (or nodes) cannot be empty here.
3373 */
3374 if (nodelist) {
3375 char *rest = nodelist;
3376 while (isdigit(*rest))
3377 rest++;
3378 if (*rest)
3379 goto out;
3380 if (nodes_empty(nodes))
3381 goto out;
3382 }
3383 break;
3384 case MPOL_INTERLEAVE:
3385 case MPOL_WEIGHTED_INTERLEAVE:
3386 /*
3387 * Default to online nodes with memory if no nodelist
3388 */
3389 if (!nodelist)
3390 nodes = node_states[N_MEMORY];
3391 break;
3392 case MPOL_LOCAL:
3393 /*
3394 * Don't allow a nodelist; mpol_new() checks flags
3395 */
3396 if (nodelist)
3397 goto out;
3398 break;
3399 case MPOL_DEFAULT:
3400 /*
3401 		 * Insist on an empty nodelist
3402 */
3403 if (!nodelist)
3404 err = 0;
3405 goto out;
3406 case MPOL_PREFERRED_MANY:
3407 case MPOL_BIND:
3408 /*
3409 * Insist on a nodelist
3410 */
3411 if (!nodelist)
3412 goto out;
3413 }
3414
3415 mode_flags = 0;
3416 if (flags) {
3417 /*
3418 * Currently, we only support two mutually exclusive
3419 * mode flags.
3420 */
3421 if (!strcmp(flags, "static"))
3422 mode_flags |= MPOL_F_STATIC_NODES;
3423 else if (!strcmp(flags, "relative"))
3424 mode_flags |= MPOL_F_RELATIVE_NODES;
3425 else
3426 goto out;
3427 }
3428
3429 new = mpol_new(mode, mode_flags, &nodes);
3430 if (IS_ERR(new))
3431 goto out;
3432
3433 /*
3434 * Save nodes for mpol_to_str() to show the tmpfs mount options
3435 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3436 */
3437 if (mode != MPOL_PREFERRED) {
3438 new->nodes = nodes;
3439 } else if (nodelist) {
3440 nodes_clear(new->nodes);
3441 node_set(first_node(nodes), new->nodes);
3442 } else {
3443 new->mode = MPOL_LOCAL;
3444 }
3445
3446 /*
3447 * Save nodes for contextualization: this will be used to "clone"
3448 * the mempolicy in a specific context [cpuset] at a later time.
3449 */
3450 new->w.user_nodemask = nodes;
3451
3452 err = 0;
3453
3454 out:
3455 /* Restore string for error message */
3456 if (nodelist)
3457 *--nodelist = ':';
3458 if (flags)
3459 *--flags = '=';
3460 if (!err)
3461 *mpol = new;
3462 return err;
3463 }
3464 #endif /* CONFIG_TMPFS */
3465
3466 /**
3467 * mpol_to_str - format a mempolicy structure for printing
3468 * @buffer: to contain formatted mempolicy string
3469 * @maxlen: length of @buffer
3470 * @pol: pointer to mempolicy to be formatted
3471 *
3472 * Convert @pol into a string. If @buffer is too short, truncate the string.
3473 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3474  * interleave", plus the longest flags, "relative|balancing", and to
3475 * display at least a few node ids.
3476 */
3477 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3478 {
3479 char *p = buffer;
3480 nodemask_t nodes = NODE_MASK_NONE;
3481 unsigned short mode = MPOL_DEFAULT;
3482 unsigned short flags = 0;
3483
3484 if (pol &&
3485 pol != &default_policy &&
3486 !(pol >= &preferred_node_policy[0] &&
3487 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3488 mode = pol->mode;
3489 flags = pol->flags;
3490 }
3491
3492 switch (mode) {
3493 case MPOL_DEFAULT:
3494 case MPOL_LOCAL:
3495 break;
3496 case MPOL_PREFERRED:
3497 case MPOL_PREFERRED_MANY:
3498 case MPOL_BIND:
3499 case MPOL_INTERLEAVE:
3500 case MPOL_WEIGHTED_INTERLEAVE:
3501 nodes = pol->nodes;
3502 break;
3503 default:
3504 WARN_ON_ONCE(1);
3505 snprintf(p, maxlen, "unknown");
3506 return;
3507 }
3508
3509 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3510
3511 if (flags & MPOL_MODE_FLAGS) {
3512 p += snprintf(p, buffer + maxlen - p, "=");
3513
3514 /*
3515 * Static and relative are mutually exclusive.
3516 */
3517 if (flags & MPOL_F_STATIC_NODES)
3518 p += snprintf(p, buffer + maxlen - p, "static");
3519 else if (flags & MPOL_F_RELATIVE_NODES)
3520 p += snprintf(p, buffer + maxlen - p, "relative");
3521
3522 if (flags & MPOL_F_NUMA_BALANCING) {
3523 if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3524 p += snprintf(p, buffer + maxlen - p, "|");
3525 p += snprintf(p, buffer + maxlen - p, "balancing");
3526 }
3527 }
3528
3529 if (!nodes_empty(nodes))
3530 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3531 nodemask_pr_args(&nodes));
3532 }
3533
3534 #ifdef CONFIG_SYSFS
3535 struct iw_node_attr {
3536 struct kobj_attribute kobj_attr;
3537 int nid;
3538 };
3539
3540 struct sysfs_wi_group {
3541 struct kobject wi_kobj;
3542 struct mutex kobj_lock;
3543 struct iw_node_attr *nattrs[];
3544 };
3545
3546 static struct sysfs_wi_group *wi_group;
3547
3548 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3549 char *buf)
3550 {
3551 struct iw_node_attr *node_attr;
3552 u8 weight;
3553
3554 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3555 weight = get_il_weight(node_attr->nid);
3556 return sysfs_emit(buf, "%d\n", weight);
3557 }
3558
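/*
 * Set one node's interleave weight: build a new wi_state (copying the old
 * table, or defaulting every weight to 1), publish it via RCU and free the
 * old state after a grace period. A manual write also clears "auto" mode.
 */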
3559 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3560 const char *buf, size_t count)
3561 {
3562 struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3563 struct iw_node_attr *node_attr;
3564 u8 weight = 0;
3565 int i;
3566
3567 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3568 if (count == 0 || sysfs_streq(buf, "") ||
3569 kstrtou8(buf, 0, &weight) || weight == 0)
3570 return -EINVAL;
3571
3572 new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3573 GFP_KERNEL);
3574 if (!new_wi_state)
3575 return -ENOMEM;
3576
3577 mutex_lock(&wi_state_lock);
3578 old_wi_state = rcu_dereference_protected(wi_state,
3579 lockdep_is_held(&wi_state_lock));
3580 if (old_wi_state) {
3581 memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3582 nr_node_ids * sizeof(u8));
3583 } else {
3584 for (i = 0; i < nr_node_ids; i++)
3585 new_wi_state->iw_table[i] = 1;
3586 }
3587 new_wi_state->iw_table[node_attr->nid] = weight;
3588 new_wi_state->mode_auto = false;
3589
3590 rcu_assign_pointer(wi_state, new_wi_state);
3591 mutex_unlock(&wi_state_lock);
3592 if (old_wi_state) {
3593 synchronize_rcu();
3594 kfree(old_wi_state);
3595 }
3596 return count;
3597 }
3598
3599 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3600 struct kobj_attribute *attr, char *buf)
3601 {
3602 struct weighted_interleave_state *state;
3603 bool wi_auto = true;
3604
3605 rcu_read_lock();
3606 state = rcu_dereference(wi_state);
3607 if (state)
3608 wi_auto = state->mode_auto;
3609 rcu_read_unlock();
3610
3611 return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3612 }
3613
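/*
 * Switch between automatic and manual weight management.  Enabling auto
 * mode rebuilds the weight table from the per-node bandwidth table
 * (node_bw_table) via reduce_interleave_weights(); disabling it keeps the
 * current weights but stops further automatic updates.  The same
 * copy-then-publish RCU scheme as node_store() is used.
 */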
static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
	unsigned int *bw;
	bool input;
	int i;

	if (kstrtobool(buf, &input))
		return -EINVAL;

	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
			       GFP_KERNEL);
	if (!new_wi_state)
		return -ENOMEM;
	for (i = 0; i < nr_node_ids; i++)
		new_wi_state->iw_table[i] = 1;

	mutex_lock(&wi_state_lock);
	if (!input) {
		old_wi_state = rcu_dereference_protected(wi_state,
					lockdep_is_held(&wi_state_lock));
		if (!old_wi_state)
			goto update_wi_state;
		if (input == old_wi_state->mode_auto) {
			/* Already in manual mode: nothing to publish. */
			mutex_unlock(&wi_state_lock);
			kfree(new_wi_state);
			return count;
		}

		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
		       nr_node_ids * sizeof(u8));
		goto update_wi_state;
	}

	bw = node_bw_table;
	if (!bw) {
		mutex_unlock(&wi_state_lock);
		kfree(new_wi_state);
		return -ENODEV;
	}

	new_wi_state->mode_auto = true;
	reduce_interleave_weights(bw, new_wi_state->iw_table);

update_wi_state:
	rcu_assign_pointer(wi_state, new_wi_state);
	mutex_unlock(&wi_state_lock);
	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
	return count;
}

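/*
 * Remove the "nodeN" attribute for @nid, if one exists.  The nattrs[]
 * slot is cleared under kobj_lock so concurrent adds and removes see a
 * consistent array; the sysfs file itself is removed outside the lock.
 */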
static void sysfs_wi_node_delete(int nid)
{
	struct iw_node_attr *attr;

	if (nid < 0 || nid >= nr_node_ids)
		return;

	mutex_lock(&wi_group->kobj_lock);
	attr = wi_group->nattrs[nid];
	if (!attr) {
		mutex_unlock(&wi_group->kobj_lock);
		return;
	}

	wi_group->nattrs[nid] = NULL;
	mutex_unlock(&wi_group->kobj_lock);

	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
	kfree(attr->kobj_attr.attr.name);
	kfree(attr);
}

static void sysfs_wi_node_delete_all(void)
{
	int nid;

	for (nid = 0; nid < nr_node_ids; nid++)
		sysfs_wi_node_delete(nid);
}

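/* Unpublish the global weighted interleave state and free it after a grace period. */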
static void wi_state_free(void)
{
	struct weighted_interleave_state *old_wi_state;

	mutex_lock(&wi_state_lock);
	old_wi_state = rcu_dereference_protected(wi_state,
			lockdep_is_held(&wi_state_lock));
	rcu_assign_pointer(wi_state, NULL);
	mutex_unlock(&wi_state_lock);

	if (old_wi_state) {
		synchronize_rcu();
		kfree(old_wi_state);
	}
}

static struct kobj_attribute wi_auto_attr =
	__ATTR(auto, 0664, weighted_interleave_auto_show,
	       weighted_interleave_auto_store);

static void wi_cleanup(void)
{
	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	sysfs_wi_node_delete_all();
	wi_state_free();
}

static void wi_kobj_release(struct kobject *wi_kobj)
{
	kfree(wi_group);
}

static const struct kobj_type wi_ktype = {
	.sysfs_ops = &kobj_sysfs_ops,
	.release = wi_kobj_release,
};

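/*
 * Create the "nodeN" attribute for @nid under the weighted_interleave
 * kobject.  Returns -EEXIST if an attribute for the node already exists.
 */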
static int sysfs_wi_node_add(int nid)
{
	int ret;
	char *name;
	struct iw_node_attr *new_attr;

	if (nid < 0 || nid >= nr_node_ids) {
		pr_err("invalid node id: %d\n", nid);
		return -EINVAL;
	}

	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
	if (!new_attr)
		return -ENOMEM;

	name = kasprintf(GFP_KERNEL, "node%d", nid);
	if (!name) {
		kfree(new_attr);
		return -ENOMEM;
	}

	sysfs_attr_init(&new_attr->kobj_attr.attr);
	new_attr->kobj_attr.attr.name = name;
	new_attr->kobj_attr.attr.mode = 0644;
	new_attr->kobj_attr.show = node_show;
	new_attr->kobj_attr.store = node_store;
	new_attr->nid = nid;

	mutex_lock(&wi_group->kobj_lock);
	if (wi_group->nattrs[nid]) {
		mutex_unlock(&wi_group->kobj_lock);
		ret = -EEXIST;
		goto out;
	}

	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
	if (ret) {
		mutex_unlock(&wi_group->kobj_lock);
		goto out;
	}
	wi_group->nattrs[nid] = new_attr;
	mutex_unlock(&wi_group->kobj_lock);
	return 0;

out:
	kfree(new_attr->kobj_attr.attr.name);
	kfree(new_attr);
	return ret;
}

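/*
 * Memory hotplug callback: create the node's weight attribute when its
 * first memory comes online and remove it when the last memory goes away,
 * so the sysfs directory always mirrors the set of N_MEMORY nodes.
 */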
static int wi_node_notifier(struct notifier_block *nb,
			    unsigned long action, void *data)
{
	int err;
	struct node_notify *nn = data;
	int nid = nn->nid;

	switch (action) {
	case NODE_ADDED_FIRST_MEMORY:
		err = sysfs_wi_node_add(nid);
		if (err)
			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
			       nid, err);
		break;
	case NODE_REMOVED_LAST_MEMORY:
		sysfs_wi_node_delete(nid);
		break;
	}

	return NOTIFY_OK;
}

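/*
 * Build the weighted_interleave directory: the "auto" control file plus
 * one "nodeN" file per node that currently has memory, then register the
 * hotplug notifier that keeps the node files up to date.
 */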
static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
	int nid, err;

	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
			   GFP_KERNEL);
	if (!wi_group)
		return -ENOMEM;
	mutex_init(&wi_group->kobj_lock);

	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
				   "weighted_interleave");
	if (err)
		goto err_put_kobj;

	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
	if (err)
		goto err_put_kobj;

	for_each_online_node(nid) {
		if (!node_state(nid, N_MEMORY))
			continue;

		err = sysfs_wi_node_add(nid);
		if (err) {
			pr_err("failed to add sysfs for node%d during init: %d\n",
			       nid, err);
			goto err_cleanup_kobj;
		}
	}

	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;

err_cleanup_kobj:
	wi_cleanup();
	kobject_del(&wi_group->wi_kobj);
err_put_kobj:
	kobject_put(&wi_group->wi_kobj);
	return err;
}

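/* Create /sys/kernel/mm/mempolicy and populate it at late_initcall time. */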
static int __init mempolicy_sysfs_init(void)
{
	int err;
	static struct kobject *mempolicy_kobj;

	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
	if (!mempolicy_kobj)
		return -ENOMEM;

	err = add_weighted_interleave_group(mempolicy_kobj);
	if (err)
		goto err_kobj;

	return 0;

err_kobj:
	kobject_del(mempolicy_kobj);
	kobject_put(mempolicy_kobj);
	return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */