1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support six policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a per-process counter
20 * is used.
21 *
22 * weighted interleave
23 * Allocate memory interleaved over a set of nodes based on
24 * a set of weights (per-node), with normal fallback if it
25 * fails. Otherwise operates the same as interleave.
26 * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27 * on node 0 for every 1 page allocated on node 1.
28 *
29 * bind Only allocate memory on a specific set of nodes,
30 * no fallback.
31 * FIXME: memory is allocated starting with the first node
32 * to the last. It would be better if bind would truly restrict
33 * the allocation to memory nodes instead
34 *
35 * preferred Try a specific node first before normal fallback.
36 * As a special case NUMA_NO_NODE here means do the allocation
37 * on the local CPU. This is normally identical to default,
38 * but useful to set in a VMA when you have a non default
39 * process policy.
40 *
41 * preferred many Try a set of nodes first before normal fallback. This is
42 * similar to preferred without the special case.
43 *
44 * default Allocate on the local node first, or when on a VMA
45 * use the process policy. This is what Linux always did
46 * in a NUMA aware kernel and still does by, ahem, default.
47 *
48 * The process policy is applied for most non-interrupt memory allocations
49 * in that process' context. Interrupts ignore the policies and always
50 * try to allocate on the local CPU. The VMA policy is only applied for memory
51 * allocations for a VMA in the VM.
52 *
53 * Currently there are a few corner cases in swapping where the policy
54 * is not applied, but the majority should be handled. When process policy
55 * is used it is not remembered over swap outs/swap ins.
56 *
57 * Only the highest zone in the zone hierarchy gets policied. Allocations
58 * requesting a lower zone just use default policy. This implies that
59 * on systems with highmem, kernel lowmem allocations don't get policied.
60 * Same with GFP_DMA allocations.
61 *
62 * For shmem/tmpfs shared memory the policy is shared between
63 * all users and remembered even when nobody has memory mapped.
64 */
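/*
 * Illustrative userspace sketch (not part of this file) of how the policies
 * described above are requested, assuming the <numaif.h> syscall wrappers
 * shipped with libnuma and a nodemask that is a plain bitmap of node ids:
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	(nodes 0 and 1)
 *
 *	Process policy - interleave future allocations over nodes 0 and 1:
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	VMA policy - bind an existing mapping to nodes 0 and 1, migrating
 *	pages already resident elsewhere:
 *	mbind(addr, len, MPOL_BIND, &nodes, 8 * sizeof(nodes), MPOL_MF_MOVE);
 */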
65
66 /* Notebook:
67 fix mmap readahead to honour policy and enable policy for any page cache
68 object
69 statistics for bigpages
70 global policy for page cache? currently it uses process policy. Requires
71 first item above.
72 handle mremap for shared memory (currently ignored for the policy)
73 grows down?
74 make bind policy root only? It can trigger oom much faster and the
75 kernel is not always graceful about that.
76 */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/task.h>
89 #include <linux/nodemask.h>
90 #include <linux/cpuset.h>
91 #include <linux/slab.h>
92 #include <linux/string.h>
93 #include <linux/export.h>
94 #include <linux/nsproxy.h>
95 #include <linux/interrupt.h>
96 #include <linux/init.h>
97 #include <linux/compat.h>
98 #include <linux/ptrace.h>
99 #include <linux/swap.h>
100 #include <linux/seq_file.h>
101 #include <linux/proc_fs.h>
102 #include <linux/migrate.h>
103 #include <linux/ksm.h>
104 #include <linux/rmap.h>
105 #include <linux/security.h>
106 #include <linux/syscalls.h>
107 #include <linux/ctype.h>
108 #include <linux/mm_inline.h>
109 #include <linux/mmu_notifier.h>
110 #include <linux/printk.h>
111 #include <linux/swapops.h>
112
113 #include <asm/tlbflush.h>
114 #include <asm/tlb.h>
115 #include <linux/uaccess.h>
116
117 #include "internal.h"
118
119 /* Internal flags */
120 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
121 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
122 #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
123
124 static struct kmem_cache *policy_cache;
125 static struct kmem_cache *sn_cache;
126
127 /* Highest zone. A specific allocation for a zone below that is not
128 policied. */
129 enum zone_type policy_zone = 0;
130
131 /*
132 * run-time system-wide default policy => local allocation
133 */
134 static struct mempolicy default_policy = {
135 .refcnt = ATOMIC_INIT(1), /* never free it */
136 .mode = MPOL_LOCAL,
137 };
138
139 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
140
141 /*
142 * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
143 * that the system-default value should be used. A NULL iw_table also denotes
144 * that system-default values should be used. Until the system-default table
145 * is implemented, the system-default is always 1.
146 *
147 * iw_table is RCU protected
148 */
149 static u8 __rcu *iw_table;
150 static DEFINE_MUTEX(iw_table_lock);
151
152 static u8 get_il_weight(int node)
153 {
154 u8 *table;
155 u8 weight;
156
157 rcu_read_lock();
158 table = rcu_dereference(iw_table);
159 /* if no iw_table, use system default */
160 weight = table ? table[node] : 1;
161 /* if value in iw_table is 0, use system default */
162 weight = weight ? weight : 1;
163 rcu_read_unlock();
164 return weight;
165 }
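/*
 * The weights consumed above come from sysfs (assumption based on the
 * weighted interleave ABI): writing 2 to
 * /sys/kernel/mm/mempolicy/weighted_interleave/node0 and 1 to .../node1
 * makes an MPOL_WEIGHTED_INTERLEAVE task over nodes {0,1} receive pages in
 * the pattern 0, 0, 1, 0, 0, 1, ... as described in the header comment.
 */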
166
167 /**
168 * numa_nearest_node - Find nearest node by state
169 * @node: Node id to start the search
170 * @state: State to filter the search
171 *
172 * Look up the closest node by distance if @node is not in the given state.
173 *
174 * Return: this @node if it is in state, otherwise the closest node by distance
175 */
176 int numa_nearest_node(int node, unsigned int state)
177 {
178 int min_dist = INT_MAX, dist, n, min_node;
179
180 if (state >= NR_NODE_STATES)
181 return -EINVAL;
182
183 if (node == NUMA_NO_NODE || node_state(node, state))
184 return node;
185
186 min_node = node;
187 for_each_node_state(n, state) {
188 dist = node_distance(node, n);
189 if (dist < min_dist) {
190 min_dist = dist;
191 min_node = n;
192 }
193 }
194
195 return min_node;
196 }
197 EXPORT_SYMBOL_GPL(numa_nearest_node);
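/*
 * Minimal usage sketch (hypothetical caller, not taken from this file): a
 * driver wanting the closest node that actually has memory could do
 *
 *	int nid = numa_nearest_node(dev_to_node(dev), N_MEMORY);
 *
 * which returns dev_to_node(dev) itself when that node has memory, or the
 * node with the smallest node_distance() from it otherwise.
 */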
198
199 /**
200 * nearest_node_nodemask - Find the node in @mask at the nearest distance
201 * from @node.
202 *
203 * @node: a valid node ID to start the search from.
204 * @mask: a pointer to a nodemask representing the allowed nodes.
205 *
206 * This function iterates over all nodes in @mask and calculates the
207 * distance from the starting @node, then it returns the node ID that is
208 * the closest to @node, or MAX_NUMNODES if no node is found.
209 *
210 * Note that @node must be a valid node ID usable with node_distance(),
211 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
212 * or unexpected behavior.
213 */
214 int nearest_node_nodemask(int node, nodemask_t *mask)
215 {
216 int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
217
218 for_each_node_mask(n, *mask) {
219 dist = node_distance(node, n);
220 if (dist < min_dist) {
221 min_dist = dist;
222 min_node = n;
223 }
224 }
225
226 return min_node;
227 }
228 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
229
230 struct mempolicy *get_task_policy(struct task_struct *p)
231 {
232 struct mempolicy *pol = p->mempolicy;
233 int node;
234
235 if (pol)
236 return pol;
237
238 node = numa_node_id();
239 if (node != NUMA_NO_NODE) {
240 pol = &preferred_node_policy[node];
241 /* preferred_node_policy is not initialised early in boot */
242 if (pol->mode)
243 return pol;
244 }
245
246 return &default_policy;
247 }
248
249 static const struct mempolicy_operations {
250 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
251 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
252 } mpol_ops[MPOL_MAX];
253
254 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
255 {
256 return pol->flags & MPOL_MODE_FLAGS;
257 }
258
259 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
260 const nodemask_t *rel)
261 {
262 nodemask_t tmp;
263 nodes_fold(tmp, *orig, nodes_weight(*rel));
264 nodes_onto(*ret, tmp, *rel);
265 }
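/*
 * Worked example of the MPOL_F_RELATIVE_NODES remap performed above: with
 * *rel = {4,5,6} as the allowed nodes and *orig = {0,2} as the user's
 * relative mask, the mask is folded onto the 3 allowed positions and then
 * mapped onto them, so *ret = {4,6} - relative bit 0 becomes node 4 and
 * relative bit 2 becomes node 6.
 */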
266
267 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
268 {
269 if (nodes_empty(*nodes))
270 return -EINVAL;
271 pol->nodes = *nodes;
272 return 0;
273 }
274
275 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
276 {
277 if (nodes_empty(*nodes))
278 return -EINVAL;
279
280 nodes_clear(pol->nodes);
281 node_set(first_node(*nodes), pol->nodes);
282 return 0;
283 }
284
285 /*
286 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
287 * any, for the new policy. mpol_new() has already validated the nodes
288 * parameter with respect to the policy mode and flags.
289 *
290 * Must be called holding task's alloc_lock to protect task's mems_allowed
291 * and mempolicy. May also be called holding the mmap_lock for write.
292 */
293 static int mpol_set_nodemask(struct mempolicy *pol,
294 const nodemask_t *nodes, struct nodemask_scratch *nsc)
295 {
296 int ret;
297
298 /*
299 * Default (pol==NULL) and local memory policies are not
300 * subject to any remapping. They also do not need any special
301 * constructor.
302 */
303 if (!pol || pol->mode == MPOL_LOCAL)
304 return 0;
305
306 /* Check N_MEMORY */
307 nodes_and(nsc->mask1,
308 cpuset_current_mems_allowed, node_states[N_MEMORY]);
309
310 VM_BUG_ON(!nodes);
311
312 if (pol->flags & MPOL_F_RELATIVE_NODES)
313 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
314 else
315 nodes_and(nsc->mask2, *nodes, nsc->mask1);
316
317 if (mpol_store_user_nodemask(pol))
318 pol->w.user_nodemask = *nodes;
319 else
320 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
321
322 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
323 return ret;
324 }
325
326 /*
327 * This function just creates a new policy, does some checks and simple
328 * initialization. You must invoke mpol_set_nodemask() to set nodes.
329 */
330 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
331 nodemask_t *nodes)
332 {
333 struct mempolicy *policy;
334
335 if (mode == MPOL_DEFAULT) {
336 if (nodes && !nodes_empty(*nodes))
337 return ERR_PTR(-EINVAL);
338 return NULL;
339 }
340 VM_BUG_ON(!nodes);
341
342 /*
343 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
344 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
345 * All other modes require a valid pointer to a non-empty nodemask.
346 */
347 if (mode == MPOL_PREFERRED) {
348 if (nodes_empty(*nodes)) {
349 if (((flags & MPOL_F_STATIC_NODES) ||
350 (flags & MPOL_F_RELATIVE_NODES)))
351 return ERR_PTR(-EINVAL);
352
353 mode = MPOL_LOCAL;
354 }
355 } else if (mode == MPOL_LOCAL) {
356 if (!nodes_empty(*nodes) ||
357 (flags & MPOL_F_STATIC_NODES) ||
358 (flags & MPOL_F_RELATIVE_NODES))
359 return ERR_PTR(-EINVAL);
360 } else if (nodes_empty(*nodes))
361 return ERR_PTR(-EINVAL);
362
363 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
364 if (!policy)
365 return ERR_PTR(-ENOMEM);
366 atomic_set(&policy->refcnt, 1);
367 policy->mode = mode;
368 policy->flags = flags;
369 policy->home_node = NUMA_NO_NODE;
370
371 return policy;
372 }
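/*
 * Sketch of the intended construction sequence (mirroring do_set_mempolicy()
 * below; error handling elided):
 *
 *	NODEMASK_SCRATCH(scratch);
 *	struct mempolicy *new = mpol_new(mode, flags, nodes);
 *
 *	task_lock(current);
 *	mpol_set_nodemask(new, nodes, scratch);
 *	task_unlock(current);
 *	NODEMASK_SCRATCH_FREE(scratch);
 */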
373
374 /* Slow path of a mpol destructor. */
375 void __mpol_put(struct mempolicy *pol)
376 {
377 if (!atomic_dec_and_test(&pol->refcnt))
378 return;
379 kmem_cache_free(policy_cache, pol);
380 }
381
382 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
383 {
384 }
385
386 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
387 {
388 nodemask_t tmp;
389
390 if (pol->flags & MPOL_F_STATIC_NODES)
391 nodes_and(tmp, pol->w.user_nodemask, *nodes);
392 else if (pol->flags & MPOL_F_RELATIVE_NODES)
393 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
394 else {
395 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
396 *nodes);
397 pol->w.cpuset_mems_allowed = *nodes;
398 }
399
400 if (nodes_empty(tmp))
401 tmp = *nodes;
402
403 pol->nodes = tmp;
404 }
405
406 static void mpol_rebind_preferred(struct mempolicy *pol,
407 const nodemask_t *nodes)
408 {
409 pol->w.cpuset_mems_allowed = *nodes;
410 }
411
412 /*
413 * mpol_rebind_policy - Migrate a policy to a different set of nodes
414 *
415 * Per-vma policies are protected by mmap_lock. Allocations using per-task
416 * policies are protected by task->mems_allowed_seq to prevent a premature
417 * OOM/allocation failure due to parallel nodemask modification.
418 */
419 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
420 {
421 if (!pol || pol->mode == MPOL_LOCAL)
422 return;
423 if (!mpol_store_user_nodemask(pol) &&
424 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
425 return;
426
427 mpol_ops[pol->mode].rebind(pol, newmask);
428 }
429
430 /*
431 * Wrapper for mpol_rebind_policy() that just requires a task
432 * pointer, and updates the task's mempolicy.
433 *
434 * Called with task's alloc_lock held.
435 */
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
437 {
438 mpol_rebind_policy(tsk->mempolicy, new);
439 }
440
441 /*
442 * Rebind each vma in mm to new nodemask.
443 *
444 * Call holding a reference to mm. Takes mm->mmap_lock during call.
445 */
446 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
447 {
448 struct vm_area_struct *vma;
449 VMA_ITERATOR(vmi, mm, 0);
450
451 mmap_write_lock(mm);
452 for_each_vma(vmi, vma) {
453 vma_start_write(vma);
454 mpol_rebind_policy(vma->vm_policy, new);
455 }
456 mmap_write_unlock(mm);
457 }
458
459 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
460 [MPOL_DEFAULT] = {
461 .rebind = mpol_rebind_default,
462 },
463 [MPOL_INTERLEAVE] = {
464 .create = mpol_new_nodemask,
465 .rebind = mpol_rebind_nodemask,
466 },
467 [MPOL_PREFERRED] = {
468 .create = mpol_new_preferred,
469 .rebind = mpol_rebind_preferred,
470 },
471 [MPOL_BIND] = {
472 .create = mpol_new_nodemask,
473 .rebind = mpol_rebind_nodemask,
474 },
475 [MPOL_LOCAL] = {
476 .rebind = mpol_rebind_default,
477 },
478 [MPOL_PREFERRED_MANY] = {
479 .create = mpol_new_nodemask,
480 .rebind = mpol_rebind_preferred,
481 },
482 [MPOL_WEIGHTED_INTERLEAVE] = {
483 .create = mpol_new_nodemask,
484 .rebind = mpol_rebind_nodemask,
485 },
486 };
487
488 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
489 unsigned long flags);
490 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
491 pgoff_t ilx, int *nid);
492
493 static bool strictly_unmovable(unsigned long flags)
494 {
495 /*
496 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
497 * if any misplaced page is found.
498 */
499 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
500 MPOL_MF_STRICT;
501 }
502
503 struct migration_mpol { /* for alloc_migration_target_by_mpol() */
504 struct mempolicy *pol;
505 pgoff_t ilx;
506 };
507
508 struct queue_pages {
509 struct list_head *pagelist;
510 unsigned long flags;
511 nodemask_t *nmask;
512 unsigned long start;
513 unsigned long end;
514 struct vm_area_struct *first;
515 struct folio *large; /* note last large folio encountered */
516 long nr_failed; /* could not be isolated at this time */
517 };
518
519 /*
520 * Check if the folio's nid is in qp->nmask.
521 *
522 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
523 * in the inverse of qp->nmask.
524 */
525 static inline bool queue_folio_required(struct folio *folio,
526 struct queue_pages *qp)
527 {
528 int nid = folio_nid(folio);
529 unsigned long flags = qp->flags;
530
531 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
532 }
533
534 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
535 {
536 struct folio *folio;
537 struct queue_pages *qp = walk->private;
538
539 if (unlikely(is_pmd_migration_entry(*pmd))) {
540 qp->nr_failed++;
541 return;
542 }
543 folio = pmd_folio(*pmd);
544 if (is_huge_zero_folio(folio)) {
545 walk->action = ACTION_CONTINUE;
546 return;
547 }
548 if (!queue_folio_required(folio, qp))
549 return;
550 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
551 !vma_migratable(walk->vma) ||
552 !migrate_folio_add(folio, qp->pagelist, qp->flags))
553 qp->nr_failed++;
554 }
555
556 /*
557 * Scan through folios, checking if they satisfy the required conditions,
558 * moving them from the LRU to a local pagelist for migration if they do (or do not, when MPOL_MF_INVERT is set).
559 *
560 * queue_folios_pte_range() has two possible return values:
561 * 0 - continue walking to scan for more, even if an existing folio on the
562 * wrong node could not be isolated and queued for migration.
563 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
564 * and an existing folio was on a node that does not follow the policy.
565 */
566 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
567 unsigned long end, struct mm_walk *walk)
568 {
569 struct vm_area_struct *vma = walk->vma;
570 struct folio *folio;
571 struct queue_pages *qp = walk->private;
572 unsigned long flags = qp->flags;
573 pte_t *pte, *mapped_pte;
574 pte_t ptent;
575 spinlock_t *ptl;
576
577 ptl = pmd_trans_huge_lock(pmd, vma);
578 if (ptl) {
579 queue_folios_pmd(pmd, walk);
580 spin_unlock(ptl);
581 goto out;
582 }
583
584 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
585 if (!pte) {
586 walk->action = ACTION_AGAIN;
587 return 0;
588 }
589 for (; addr != end; pte++, addr += PAGE_SIZE) {
590 ptent = ptep_get(pte);
591 if (pte_none(ptent))
592 continue;
593 if (!pte_present(ptent)) {
594 if (is_migration_entry(pte_to_swp_entry(ptent)))
595 qp->nr_failed++;
596 continue;
597 }
598 folio = vm_normal_folio(vma, addr, ptent);
599 if (!folio || folio_is_zone_device(folio))
600 continue;
601 /*
602 * vm_normal_folio() filters out zero pages, but there might
603 * still be reserved folios to skip, perhaps in a VDSO.
604 */
605 if (folio_test_reserved(folio))
606 continue;
607 if (!queue_folio_required(folio, qp))
608 continue;
609 if (folio_test_large(folio)) {
610 /*
611 * A large folio can only be isolated from LRU once,
612 * but may be mapped by many PTEs (and Copy-On-Write may
613 * intersperse PTEs of other, order 0, folios). This is
614 * a common case, so don't mistake it for failure (but
615 * there can be other cases of multi-mapped pages which
616 * this quick check does not help to filter out - and a
617 * search of the pagelist might grow to be prohibitive).
618 *
619 * migrate_pages(&pagelist) returns nr_failed folios, so
620 * check "large" now so that queue_pages_range() returns
621 * a comparable nr_failed folios. This does imply that
622 * if folio could not be isolated for some racy reason
623 * at its first PTE, later PTEs will not give it another
624 * chance of isolation; but keeps the accounting simple.
625 */
626 if (folio == qp->large)
627 continue;
628 qp->large = folio;
629 }
630 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
631 !vma_migratable(vma) ||
632 !migrate_folio_add(folio, qp->pagelist, flags)) {
633 qp->nr_failed++;
634 if (strictly_unmovable(flags))
635 break;
636 }
637 }
638 pte_unmap_unlock(mapped_pte, ptl);
639 cond_resched();
640 out:
641 if (qp->nr_failed && strictly_unmovable(flags))
642 return -EIO;
643 return 0;
644 }
645
646 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
647 unsigned long addr, unsigned long end,
648 struct mm_walk *walk)
649 {
650 #ifdef CONFIG_HUGETLB_PAGE
651 struct queue_pages *qp = walk->private;
652 unsigned long flags = qp->flags;
653 struct folio *folio;
654 spinlock_t *ptl;
655 pte_t entry;
656
657 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
658 entry = huge_ptep_get(walk->mm, addr, pte);
659 if (!pte_present(entry)) {
660 if (unlikely(is_hugetlb_entry_migration(entry)))
661 qp->nr_failed++;
662 goto unlock;
663 }
664 folio = pfn_folio(pte_pfn(entry));
665 if (!queue_folio_required(folio, qp))
666 goto unlock;
667 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
668 !vma_migratable(walk->vma)) {
669 qp->nr_failed++;
670 goto unlock;
671 }
672 /*
673 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
674 * Choosing not to migrate a shared folio is not counted as a failure.
675 *
676 * See folio_maybe_mapped_shared() on possible imprecision when we
677 * cannot easily detect if a folio is shared.
678 */
679 if ((flags & MPOL_MF_MOVE_ALL) ||
680 (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
681 if (!folio_isolate_hugetlb(folio, qp->pagelist))
682 qp->nr_failed++;
683 unlock:
684 spin_unlock(ptl);
685 if (qp->nr_failed && strictly_unmovable(flags))
686 return -EIO;
687 #endif
688 return 0;
689 }
690
691 #ifdef CONFIG_NUMA_BALANCING
692 /*
693 * This is used to mark a range of virtual addresses as inaccessible.
694 * These are later cleared by a NUMA hinting fault. Depending on these
695 * faults, pages may be migrated for better NUMA placement.
696 *
697 * This is assuming that NUMA faults are handled using PROT_NONE. If
698 * an architecture makes a different choice, it will need further
699 * changes to the core.
700 */
701 unsigned long change_prot_numa(struct vm_area_struct *vma,
702 unsigned long addr, unsigned long end)
703 {
704 struct mmu_gather tlb;
705 long nr_updated;
706
707 tlb_gather_mmu(&tlb, vma->vm_mm);
708
709 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
710 if (nr_updated > 0) {
711 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
712 count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
713 }
714
715 tlb_finish_mmu(&tlb);
716
717 return nr_updated;
718 }
719 #endif /* CONFIG_NUMA_BALANCING */
720
721 static int queue_pages_test_walk(unsigned long start, unsigned long end,
722 struct mm_walk *walk)
723 {
724 struct vm_area_struct *next, *vma = walk->vma;
725 struct queue_pages *qp = walk->private;
726 unsigned long flags = qp->flags;
727
728 /* range check first */
729 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
730
731 if (!qp->first) {
732 qp->first = vma;
733 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
734 (qp->start < vma->vm_start))
735 /* hole at head side of range */
736 return -EFAULT;
737 }
738 next = find_vma(vma->vm_mm, vma->vm_end);
739 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
740 ((vma->vm_end < qp->end) &&
741 (!next || vma->vm_end < next->vm_start)))
742 /* hole at middle or tail of range */
743 return -EFAULT;
744
745 /*
746 * We need to check MPOL_MF_STRICT so that -EIO can be returned when
747 * required, regardless of vma_migratable().
748 */
749 if (!vma_migratable(vma) &&
750 !(flags & MPOL_MF_STRICT))
751 return 1;
752
753 /*
754 * Check page nodes, and queue pages to move, in the current vma.
755 * But if neither moving nor strict checking is requested, the scan can be skipped.
756 */
757 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
758 return 0;
759 return 1;
760 }
761
762 static const struct mm_walk_ops queue_pages_walk_ops = {
763 .hugetlb_entry = queue_folios_hugetlb,
764 .pmd_entry = queue_folios_pte_range,
765 .test_walk = queue_pages_test_walk,
766 .walk_lock = PGWALK_RDLOCK,
767 };
768
769 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
770 .hugetlb_entry = queue_folios_hugetlb,
771 .pmd_entry = queue_folios_pte_range,
772 .test_walk = queue_pages_test_walk,
773 .walk_lock = PGWALK_WRLOCK,
774 };
775
776 /*
777 * Walk through page tables and collect pages to be migrated.
778 *
779 * If pages found in a given range are not on the required set of @nodes,
780 * and migration is allowed, they are isolated and queued to @pagelist.
781 *
782 * queue_pages_range() may return:
783 * 0 - all pages already on the right node, or successfully queued for moving
784 * (or neither strict checking nor moving requested: only range checking).
785 * >0 - this number of misplaced folios could not be queued for moving
786 * (a hugetlbfs page or a transparent huge page being counted as 1).
787 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
788 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
789 */
790 static long
791 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
792 nodemask_t *nodes, unsigned long flags,
793 struct list_head *pagelist)
794 {
795 int err;
796 struct queue_pages qp = {
797 .pagelist = pagelist,
798 .flags = flags,
799 .nmask = nodes,
800 .start = start,
801 .end = end,
802 .first = NULL,
803 };
804 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
805 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
806
807 err = walk_page_range(mm, start, end, ops, &qp);
808
809 if (!qp.first)
810 /* whole range in hole */
811 err = -EFAULT;
812
813 return err ? : qp.nr_failed;
814 }
815
816 /*
817 * Apply policy to a single VMA
818 * This must be called with the mmap_lock held for writing.
819 */
820 static int vma_replace_policy(struct vm_area_struct *vma,
821 struct mempolicy *pol)
822 {
823 int err;
824 struct mempolicy *old;
825 struct mempolicy *new;
826
827 vma_assert_write_locked(vma);
828
829 new = mpol_dup(pol);
830 if (IS_ERR(new))
831 return PTR_ERR(new);
832
833 if (vma->vm_ops && vma->vm_ops->set_policy) {
834 err = vma->vm_ops->set_policy(vma, new);
835 if (err)
836 goto err_out;
837 }
838
839 old = vma->vm_policy;
840 vma->vm_policy = new; /* protected by mmap_lock */
841 mpol_put(old);
842
843 return 0;
844 err_out:
845 mpol_put(new);
846 return err;
847 }
848
849 /* Split or merge the VMA (if required) and apply the new policy */
850 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
851 struct vm_area_struct **prev, unsigned long start,
852 unsigned long end, struct mempolicy *new_pol)
853 {
854 unsigned long vmstart, vmend;
855
856 vmend = min(end, vma->vm_end);
857 if (start > vma->vm_start) {
858 *prev = vma;
859 vmstart = start;
860 } else {
861 vmstart = vma->vm_start;
862 }
863
864 if (mpol_equal(vma->vm_policy, new_pol)) {
865 *prev = vma;
866 return 0;
867 }
868
869 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
870 if (IS_ERR(vma))
871 return PTR_ERR(vma);
872
873 *prev = vma;
874 return vma_replace_policy(vma, new_pol);
875 }
876
877 /* Set the process memory policy */
878 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
879 nodemask_t *nodes)
880 {
881 struct mempolicy *new, *old;
882 NODEMASK_SCRATCH(scratch);
883 int ret;
884
885 if (!scratch)
886 return -ENOMEM;
887
888 new = mpol_new(mode, flags, nodes);
889 if (IS_ERR(new)) {
890 ret = PTR_ERR(new);
891 goto out;
892 }
893
894 task_lock(current);
895 ret = mpol_set_nodemask(new, nodes, scratch);
896 if (ret) {
897 task_unlock(current);
898 mpol_put(new);
899 goto out;
900 }
901
902 old = current->mempolicy;
903 current->mempolicy = new;
904 if (new && (new->mode == MPOL_INTERLEAVE ||
905 new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
906 current->il_prev = MAX_NUMNODES-1;
907 current->il_weight = 0;
908 }
909 task_unlock(current);
910 mpol_put(old);
911 ret = 0;
912 out:
913 NODEMASK_SCRATCH_FREE(scratch);
914 return ret;
915 }
916
917 /*
918 * Return the nodemask of the policy for a get_mempolicy() query
919 *
920 * Called with task's alloc_lock held
921 */
922 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
923 {
924 nodes_clear(*nodes);
925 if (pol == &default_policy)
926 return;
927
928 switch (pol->mode) {
929 case MPOL_BIND:
930 case MPOL_INTERLEAVE:
931 case MPOL_PREFERRED:
932 case MPOL_PREFERRED_MANY:
933 case MPOL_WEIGHTED_INTERLEAVE:
934 *nodes = pol->nodes;
935 break;
936 case MPOL_LOCAL:
937 /* return empty node mask for local allocation */
938 break;
939 default:
940 BUG();
941 }
942 }
943
944 static int lookup_node(struct mm_struct *mm, unsigned long addr)
945 {
946 struct page *p = NULL;
947 int ret;
948
949 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
950 if (ret > 0) {
951 ret = page_to_nid(p);
952 put_page(p);
953 }
954 return ret;
955 }
956
957 /* Retrieve NUMA policy */
958 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
959 unsigned long addr, unsigned long flags)
960 {
961 int err;
962 struct mm_struct *mm = current->mm;
963 struct vm_area_struct *vma = NULL;
964 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
965
966 if (flags &
967 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
968 return -EINVAL;
969
970 if (flags & MPOL_F_MEMS_ALLOWED) {
971 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
972 return -EINVAL;
973 *policy = 0; /* just so it's initialized */
974 task_lock(current);
975 *nmask = cpuset_current_mems_allowed;
976 task_unlock(current);
977 return 0;
978 }
979
980 if (flags & MPOL_F_ADDR) {
981 pgoff_t ilx; /* ignored here */
982 /*
983 * Do NOT fall back to task policy if the
984 * vma/shared policy at addr is NULL. We
985 * want to return MPOL_DEFAULT in this case.
986 */
987 mmap_read_lock(mm);
988 vma = vma_lookup(mm, addr);
989 if (!vma) {
990 mmap_read_unlock(mm);
991 return -EFAULT;
992 }
993 pol = __get_vma_policy(vma, addr, &ilx);
994 } else if (addr)
995 return -EINVAL;
996
997 if (!pol)
998 pol = &default_policy; /* indicates default behavior */
999
1000 if (flags & MPOL_F_NODE) {
1001 if (flags & MPOL_F_ADDR) {
1002 /*
1003 * Take a refcount on the mpol, because we are about to
1004 * drop the mmap_lock, after which only "pol" remains
1005 * valid, "vma" is stale.
1006 */
1007 pol_refcount = pol;
1008 vma = NULL;
1009 mpol_get(pol);
1010 mmap_read_unlock(mm);
1011 err = lookup_node(mm, addr);
1012 if (err < 0)
1013 goto out;
1014 *policy = err;
1015 } else if (pol == current->mempolicy &&
1016 pol->mode == MPOL_INTERLEAVE) {
1017 *policy = next_node_in(current->il_prev, pol->nodes);
1018 } else if (pol == current->mempolicy &&
1019 pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1020 if (current->il_weight)
1021 *policy = current->il_prev;
1022 else
1023 *policy = next_node_in(current->il_prev,
1024 pol->nodes);
1025 } else {
1026 err = -EINVAL;
1027 goto out;
1028 }
1029 } else {
1030 *policy = pol == &default_policy ? MPOL_DEFAULT :
1031 pol->mode;
1032 /*
1033 * Internal mempolicy flags must be masked off before exposing
1034 * the policy to userspace.
1035 */
1036 *policy |= (pol->flags & MPOL_MODE_FLAGS);
1037 }
1038
1039 err = 0;
1040 if (nmask) {
1041 if (mpol_store_user_nodemask(pol)) {
1042 *nmask = pol->w.user_nodemask;
1043 } else {
1044 task_lock(current);
1045 get_policy_nodemask(pol, nmask);
1046 task_unlock(current);
1047 }
1048 }
1049
1050 out:
1051 mpol_cond_put(pol);
1052 if (vma)
1053 mmap_read_unlock(mm);
1054 if (pol_refcount)
1055 mpol_put(pol_refcount);
1056 return err;
1057 }
1058
1059 #ifdef CONFIG_MIGRATION
1060 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1061 unsigned long flags)
1062 {
1063 /*
1064 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1065 * Choosing not to migrate a shared folio is not counted as a failure.
1066 *
1067 * See folio_maybe_mapped_shared() on possible imprecision when we
1068 * cannot easily detect if a folio is shared.
1069 */
1070 if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1071 if (folio_isolate_lru(folio)) {
1072 list_add_tail(&folio->lru, foliolist);
1073 node_stat_mod_folio(folio,
1074 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1075 folio_nr_pages(folio));
1076 } else {
1077 /*
1078 * A non-movable folio may reach here. There may also be
1079 * folios that are temporarily off the LRU, or non-LRU
1080 * movable folios. Treat them as unmovable: they can't be
1081 * isolated, so they can't be moved at the moment.
1082 */
1083 return false;
1084 }
1085 }
1086 return true;
1087 }
1088
1089 /*
1090 * Migrate pages from one node to a target node.
1091 * Returns error or the number of pages not migrated.
1092 */
1093 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1094 int flags)
1095 {
1096 nodemask_t nmask;
1097 struct vm_area_struct *vma;
1098 LIST_HEAD(pagelist);
1099 long nr_failed;
1100 long err = 0;
1101 struct migration_target_control mtc = {
1102 .nid = dest,
1103 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1104 .reason = MR_SYSCALL,
1105 };
1106
1107 nodes_clear(nmask);
1108 node_set(source, nmask);
1109
1110 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1111
1112 mmap_read_lock(mm);
1113 vma = find_vma(mm, 0);
1114 if (unlikely(!vma)) {
1115 mmap_read_unlock(mm);
1116 return 0;
1117 }
1118
1119 /*
1120 * This does not migrate the range, but isolates all pages that
1121 * need migration. Between passing in the full user address
1122 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1123 * but passes back the count of pages which could not be isolated.
1124 */
1125 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1126 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1127 mmap_read_unlock(mm);
1128
1129 if (!list_empty(&pagelist)) {
1130 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1131 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1132 if (err)
1133 putback_movable_pages(&pagelist);
1134 }
1135
1136 if (err >= 0)
1137 err += nr_failed;
1138 return err;
1139 }
1140
1141 /*
1142 * Move pages between the two nodesets so as to preserve the physical
1143 * layout as much as possible.
1144 *
1145 * Returns the number of pages that could not be moved, or a negative errno.
1146 */
1147 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1148 const nodemask_t *to, int flags)
1149 {
1150 long nr_failed = 0;
1151 long err = 0;
1152 nodemask_t tmp;
1153
1154 lru_cache_disable();
1155
1156 /*
1157 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1158 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1159 * bit in 'tmp', and return that <source, dest> pair for migration.
1160 * The pair of nodemasks 'to' and 'from' define the map.
1161 *
1162 * If no pair of bits is found that way, fall back to picking some
1163 * pair of 'source' and 'dest' bits that are not the same. If the
1164 * 'source' and 'dest' bits are the same, this represents a node
1165 * that will be migrating to itself, so no pages need move.
1166 *
1167 * If no bits are left in 'tmp', or if all remaining bits left
1168 * in 'tmp' correspond to the same bit in 'to', return false
1169 * (nothing left to migrate).
1170 *
1171 * This lets us pick a pair of nodes to migrate between, such that
1172 * if possible the dest node is not already occupied by some other
1173 * source node, minimizing the risk of overloading the memory on a
1174 * node that would happen if we migrated incoming memory to a node
1175 * before migrating outgoing memory source that same node.
1176 *
1177 * A single scan of tmp is sufficient. As we go, we remember the
1178 * most recent <s, d> pair that moved (s != d). If we find a pair
1179 * that not only moved, but what's better, moved to an empty slot
1180 * (d is not set in tmp), then we break out then, with that pair.
1181 * Otherwise when we finish scanning from_tmp, we at least have the
1182 * most recent <s, d> pair that moved. If we get all the way through
1183 * the scan of tmp without finding any node that moved, much less
1184 * moved to an empty node, then there is nothing left worth migrating.
1185 */
1186
1187 tmp = *from;
1188 while (!nodes_empty(tmp)) {
1189 int s, d;
1190 int source = NUMA_NO_NODE;
1191 int dest = 0;
1192
1193 for_each_node_mask(s, tmp) {
1194
1195 /*
1196 * do_migrate_pages() tries to maintain the relative
1197 * node relationship of the pages established between
1198 * threads and memory areas.
1199 *
1200 * However if the number of source nodes is not equal to
1201 * the number of destination nodes we can not preserve
1202 * this node relative relationship. In that case, skip
1203 * copying memory from a node that is in the destination
1204 * mask.
1205 *
1206 * Example: [2,3,4] -> [3,4,5] moves everything.
1207 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1208 */
1209
1210 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1211 (node_isset(s, *to)))
1212 continue;
1213
1214 d = node_remap(s, *from, *to);
1215 if (s == d)
1216 continue;
1217
1218 source = s; /* Node moved. Memorize */
1219 dest = d;
1220
1221 /* dest not in remaining from nodes? */
1222 if (!node_isset(dest, tmp))
1223 break;
1224 }
1225 if (source == NUMA_NO_NODE)
1226 break;
1227
1228 node_clear(source, tmp);
1229 err = migrate_to_node(mm, source, dest, flags);
1230 if (err > 0)
1231 nr_failed += err;
1232 if (err < 0)
1233 break;
1234 }
1235
1236 lru_cache_enable();
1237 if (err < 0)
1238 return err;
1239 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1240 }
1241
1242 /*
1243 * Allocate a new folio for page migration, according to NUMA mempolicy.
1244 */
1245 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1246 unsigned long private)
1247 {
1248 struct migration_mpol *mmpol = (struct migration_mpol *)private;
1249 struct mempolicy *pol = mmpol->pol;
1250 pgoff_t ilx = mmpol->ilx;
1251 unsigned int order;
1252 int nid = numa_node_id();
1253 gfp_t gfp;
1254
1255 order = folio_order(src);
1256 ilx += src->index >> order;
1257
1258 if (folio_test_hugetlb(src)) {
1259 nodemask_t *nodemask;
1260 struct hstate *h;
1261
1262 h = folio_hstate(src);
1263 gfp = htlb_alloc_mask(h);
1264 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1265 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1266 htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1267 }
1268
1269 if (folio_test_large(src))
1270 gfp = GFP_TRANSHUGE;
1271 else
1272 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1273
1274 return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1275 }
1276 #else
1277
1278 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1279 unsigned long flags)
1280 {
1281 return false;
1282 }
1283
1284 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1285 const nodemask_t *to, int flags)
1286 {
1287 return -ENOSYS;
1288 }
1289
1290 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1291 unsigned long private)
1292 {
1293 return NULL;
1294 }
1295 #endif
1296
1297 static long do_mbind(unsigned long start, unsigned long len,
1298 unsigned short mode, unsigned short mode_flags,
1299 nodemask_t *nmask, unsigned long flags)
1300 {
1301 struct mm_struct *mm = current->mm;
1302 struct vm_area_struct *vma, *prev;
1303 struct vma_iterator vmi;
1304 struct migration_mpol mmpol;
1305 struct mempolicy *new;
1306 unsigned long end;
1307 long err;
1308 long nr_failed;
1309 LIST_HEAD(pagelist);
1310
1311 if (flags & ~(unsigned long)MPOL_MF_VALID)
1312 return -EINVAL;
1313 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1314 return -EPERM;
1315
1316 if (start & ~PAGE_MASK)
1317 return -EINVAL;
1318
1319 if (mode == MPOL_DEFAULT)
1320 flags &= ~MPOL_MF_STRICT;
1321
1322 len = PAGE_ALIGN(len);
1323 end = start + len;
1324
1325 if (end < start)
1326 return -EINVAL;
1327 if (end == start)
1328 return 0;
1329
1330 new = mpol_new(mode, mode_flags, nmask);
1331 if (IS_ERR(new))
1332 return PTR_ERR(new);
1333
1334 /*
1335 * If we are using the default policy then operation
1336 * on discontinuous address spaces is okay after all
1337 */
1338 if (!new)
1339 flags |= MPOL_MF_DISCONTIG_OK;
1340
1341 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1342 lru_cache_disable();
1343 {
1344 NODEMASK_SCRATCH(scratch);
1345 if (scratch) {
1346 mmap_write_lock(mm);
1347 err = mpol_set_nodemask(new, nmask, scratch);
1348 if (err)
1349 mmap_write_unlock(mm);
1350 } else
1351 err = -ENOMEM;
1352 NODEMASK_SCRATCH_FREE(scratch);
1353 }
1354 if (err)
1355 goto mpol_out;
1356
1357 /*
1358 * Lock the VMAs before scanning for pages to migrate,
1359 * to ensure we don't miss a concurrently inserted page.
1360 */
1361 nr_failed = queue_pages_range(mm, start, end, nmask,
1362 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1363
1364 if (nr_failed < 0) {
1365 err = nr_failed;
1366 nr_failed = 0;
1367 } else {
1368 vma_iter_init(&vmi, mm, start);
1369 prev = vma_prev(&vmi);
1370 for_each_vma_range(vmi, vma, end) {
1371 err = mbind_range(&vmi, vma, &prev, start, end, new);
1372 if (err)
1373 break;
1374 }
1375 }
1376
1377 if (!err && !list_empty(&pagelist)) {
1378 /* Convert MPOL_DEFAULT's NULL to task or default policy */
1379 if (!new) {
1380 new = get_task_policy(current);
1381 mpol_get(new);
1382 }
1383 mmpol.pol = new;
1384 mmpol.ilx = 0;
1385
1386 /*
1387 * In the interleaved case, attempt to allocate on exactly the
1388 * targeted nodes, for the first VMA to be migrated; for later
1389 * VMAs, the nodes will still be interleaved from the targeted
1390 * nodemask, but one by one may be selected differently.
1391 */
1392 if (new->mode == MPOL_INTERLEAVE ||
1393 new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1394 struct folio *folio;
1395 unsigned int order;
1396 unsigned long addr = -EFAULT;
1397
1398 list_for_each_entry(folio, &pagelist, lru) {
1399 if (!folio_test_ksm(folio))
1400 break;
1401 }
1402 if (!list_entry_is_head(folio, &pagelist, lru)) {
1403 vma_iter_init(&vmi, mm, start);
1404 for_each_vma_range(vmi, vma, end) {
1405 addr = page_address_in_vma(folio,
1406 folio_page(folio, 0), vma);
1407 if (addr != -EFAULT)
1408 break;
1409 }
1410 }
1411 if (addr != -EFAULT) {
1412 order = folio_order(folio);
1413 /* We already know the pol, but not the ilx */
1414 mpol_cond_put(get_vma_policy(vma, addr, order,
1415 &mmpol.ilx));
1416 /* Set base from which to increment by index */
1417 mmpol.ilx -= folio->index >> order;
1418 }
1419 }
1420 }
1421
1422 mmap_write_unlock(mm);
1423
1424 if (!err && !list_empty(&pagelist)) {
1425 nr_failed |= migrate_pages(&pagelist,
1426 alloc_migration_target_by_mpol, NULL,
1427 (unsigned long)&mmpol, MIGRATE_SYNC,
1428 MR_MEMPOLICY_MBIND, NULL);
1429 }
1430
1431 if (nr_failed && (flags & MPOL_MF_STRICT))
1432 err = -EIO;
1433 if (!list_empty(&pagelist))
1434 putback_movable_pages(&pagelist);
1435 mpol_out:
1436 mpol_put(new);
1437 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1438 lru_cache_enable();
1439 return err;
1440 }
1441
1442 /*
1443 * User space interface with variable sized bitmaps for nodelists.
1444 */
1445 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1446 unsigned long maxnode)
1447 {
1448 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1449 int ret;
1450
1451 if (in_compat_syscall())
1452 ret = compat_get_bitmap(mask,
1453 (const compat_ulong_t __user *)nmask,
1454 maxnode);
1455 else
1456 ret = copy_from_user(mask, nmask,
1457 nlongs * sizeof(unsigned long));
1458
1459 if (ret)
1460 return -EFAULT;
1461
1462 if (maxnode % BITS_PER_LONG)
1463 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1464
1465 return 0;
1466 }
1467
1468 /* Copy a node mask from user space. */
1469 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1470 unsigned long maxnode)
1471 {
1472 --maxnode;
1473 nodes_clear(*nodes);
1474 if (maxnode == 0 || !nmask)
1475 return 0;
1476 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1477 return -EINVAL;
1478
1479 /*
1480 * When the user specifies more nodes than supported, just check
1481 * that the unsupported part is all zero, one word at a time,
1482 * starting at the end.
1483 */
1484 while (maxnode > MAX_NUMNODES) {
1485 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1486 unsigned long t;
1487
1488 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1489 return -EFAULT;
1490
1491 if (maxnode - bits >= MAX_NUMNODES) {
1492 maxnode -= bits;
1493 } else {
1494 maxnode = MAX_NUMNODES;
1495 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1496 }
1497 if (t)
1498 return -EINVAL;
1499 }
1500
1501 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1502 }
1503
1504 /* Copy a kernel node mask to user space */
1505 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1506 nodemask_t *nodes)
1507 {
1508 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1509 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1510 bool compat = in_compat_syscall();
1511
1512 if (compat)
1513 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1514
1515 if (copy > nbytes) {
1516 if (copy > PAGE_SIZE)
1517 return -EINVAL;
1518 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1519 return -EFAULT;
1520 copy = nbytes;
1521 maxnode = nr_node_ids;
1522 }
1523
1524 if (compat)
1525 return compat_put_bitmap((compat_ulong_t __user *)mask,
1526 nodes_addr(*nodes), maxnode);
1527
1528 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1529 }
1530
1531 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1532 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1533 {
1534 *flags = *mode & MPOL_MODE_FLAGS;
1535 *mode &= ~MPOL_MODE_FLAGS;
1536
1537 if ((unsigned int)(*mode) >= MPOL_MAX)
1538 return -EINVAL;
1539 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1540 return -EINVAL;
1541 if (*flags & MPOL_F_NUMA_BALANCING) {
1542 if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1543 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1544 else
1545 return -EINVAL;
1546 }
1547 return 0;
1548 }
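/*
 * Example of what userspace hands in (illustrative, using the uapi names):
 * the optional mode flags arrive OR-ed into the mode argument, e.g.
 *
 *	set_mempolicy(MPOL_BIND | MPOL_F_STATIC_NODES, &nodes, maxnode);
 *
 * sanitize_mpol_flags() splits this into *mode = MPOL_BIND and
 * *flags = MPOL_F_STATIC_NODES before either value is used further.
 */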
1549
1550 static long kernel_mbind(unsigned long start, unsigned long len,
1551 unsigned long mode, const unsigned long __user *nmask,
1552 unsigned long maxnode, unsigned int flags)
1553 {
1554 unsigned short mode_flags;
1555 nodemask_t nodes;
1556 int lmode = mode;
1557 int err;
1558
1559 start = untagged_addr(start);
1560 err = sanitize_mpol_flags(&lmode, &mode_flags);
1561 if (err)
1562 return err;
1563
1564 err = get_nodes(&nodes, nmask, maxnode);
1565 if (err)
1566 return err;
1567
1568 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1569 }
1570
1571 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1572 unsigned long, home_node, unsigned long, flags)
1573 {
1574 struct mm_struct *mm = current->mm;
1575 struct vm_area_struct *vma, *prev;
1576 struct mempolicy *new, *old;
1577 unsigned long end;
1578 int err = -ENOENT;
1579 VMA_ITERATOR(vmi, mm, start);
1580
1581 start = untagged_addr(start);
1582 if (start & ~PAGE_MASK)
1583 return -EINVAL;
1584 /*
1585 * flags is reserved for future extensions.
1586 */
1587 if (flags != 0)
1588 return -EINVAL;
1589
1590 /*
1591 * Check home_node is online to avoid accessing uninitialized
1592 * NODE_DATA.
1593 */
1594 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1595 return -EINVAL;
1596
1597 len = PAGE_ALIGN(len);
1598 end = start + len;
1599
1600 if (end < start)
1601 return -EINVAL;
1602 if (end == start)
1603 return 0;
1604 mmap_write_lock(mm);
1605 prev = vma_prev(&vmi);
1606 for_each_vma_range(vmi, vma, end) {
1607 /*
1608 * If any vma in the range has a policy other than MPOL_BIND
1609 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
1610 * the home node for vmas we already updated before.
1611 */
1612 old = vma_policy(vma);
1613 if (!old) {
1614 prev = vma;
1615 continue;
1616 }
1617 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1618 err = -EOPNOTSUPP;
1619 break;
1620 }
1621 new = mpol_dup(old);
1622 if (IS_ERR(new)) {
1623 err = PTR_ERR(new);
1624 break;
1625 }
1626
1627 vma_start_write(vma);
1628 new->home_node = home_node;
1629 err = mbind_range(&vmi, vma, &prev, start, end, new);
1630 mpol_put(new);
1631 if (err)
1632 break;
1633 }
1634 mmap_write_unlock(mm);
1635 return err;
1636 }
1637
1638 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1639 unsigned long, mode, const unsigned long __user *, nmask,
1640 unsigned long, maxnode, unsigned int, flags)
1641 {
1642 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1643 }
1644
1645 /* Set the process memory policy */
1646 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1647 unsigned long maxnode)
1648 {
1649 unsigned short mode_flags;
1650 nodemask_t nodes;
1651 int lmode = mode;
1652 int err;
1653
1654 err = sanitize_mpol_flags(&lmode, &mode_flags);
1655 if (err)
1656 return err;
1657
1658 err = get_nodes(&nodes, nmask, maxnode);
1659 if (err)
1660 return err;
1661
1662 return do_set_mempolicy(lmode, mode_flags, &nodes);
1663 }
1664
1665 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1666 unsigned long, maxnode)
1667 {
1668 return kernel_set_mempolicy(mode, nmask, maxnode);
1669 }
1670
1671 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1672 const unsigned long __user *old_nodes,
1673 const unsigned long __user *new_nodes)
1674 {
1675 struct mm_struct *mm = NULL;
1676 struct task_struct *task;
1677 nodemask_t task_nodes;
1678 int err;
1679 nodemask_t *old;
1680 nodemask_t *new;
1681 NODEMASK_SCRATCH(scratch);
1682
1683 if (!scratch)
1684 return -ENOMEM;
1685
1686 old = &scratch->mask1;
1687 new = &scratch->mask2;
1688
1689 err = get_nodes(old, old_nodes, maxnode);
1690 if (err)
1691 goto out;
1692
1693 err = get_nodes(new, new_nodes, maxnode);
1694 if (err)
1695 goto out;
1696
1697 /* Find the mm_struct */
1698 rcu_read_lock();
1699 task = pid ? find_task_by_vpid(pid) : current;
1700 if (!task) {
1701 rcu_read_unlock();
1702 err = -ESRCH;
1703 goto out;
1704 }
1705 get_task_struct(task);
1706
1707 err = -EINVAL;
1708
1709 /*
1710 * Check if this process has the right to modify the specified process.
1711 * Use the regular "ptrace_may_access()" checks.
1712 */
1713 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1714 rcu_read_unlock();
1715 err = -EPERM;
1716 goto out_put;
1717 }
1718 rcu_read_unlock();
1719
1720 task_nodes = cpuset_mems_allowed(task);
1721 /* Is the user allowed to access the target nodes? */
1722 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1723 err = -EPERM;
1724 goto out_put;
1725 }
1726
1727 task_nodes = cpuset_mems_allowed(current);
1728 nodes_and(*new, *new, task_nodes);
1729 if (nodes_empty(*new))
1730 goto out_put;
1731
1732 err = security_task_movememory(task);
1733 if (err)
1734 goto out_put;
1735
1736 mm = get_task_mm(task);
1737 put_task_struct(task);
1738
1739 if (!mm) {
1740 err = -EINVAL;
1741 goto out;
1742 }
1743
1744 err = do_migrate_pages(mm, old, new,
1745 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1746
1747 mmput(mm);
1748 out:
1749 NODEMASK_SCRATCH_FREE(scratch);
1750
1751 return err;
1752
1753 out_put:
1754 put_task_struct(task);
1755 goto out;
1756 }
1757
1758 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1759 const unsigned long __user *, old_nodes,
1760 const unsigned long __user *, new_nodes)
1761 {
1762 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1763 }
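/*
 * Userspace sketch (illustrative, using the libnuma <numaif.h> wrapper):
 * move every page of process @pid currently on node 0 over to node 1:
 *
 *	unsigned long old = 1UL << 0, new = 1UL << 1;
 *
 *	migrate_pages(pid, 8 * sizeof(old), &old, &new);
 */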
1764
1765 /* Retrieve NUMA policy */
1766 static int kernel_get_mempolicy(int __user *policy,
1767 unsigned long __user *nmask,
1768 unsigned long maxnode,
1769 unsigned long addr,
1770 unsigned long flags)
1771 {
1772 int err;
1773 int pval;
1774 nodemask_t nodes;
1775
1776 if (nmask != NULL && maxnode < nr_node_ids)
1777 return -EINVAL;
1778
1779 addr = untagged_addr(addr);
1780
1781 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1782
1783 if (err)
1784 return err;
1785
1786 if (policy && put_user(pval, policy))
1787 return -EFAULT;
1788
1789 if (nmask)
1790 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1791
1792 return err;
1793 }
1794
1795 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1796 unsigned long __user *, nmask, unsigned long, maxnode,
1797 unsigned long, addr, unsigned long, flags)
1798 {
1799 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1800 }
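/*
 * Userspace sketch (illustrative): MPOL_F_NODE | MPOL_F_ADDR turns
 * get_mempolicy() into a "which node is this page on?" query:
 *
 *	int node;
 *
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */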
1801
1802 bool vma_migratable(struct vm_area_struct *vma)
1803 {
1804 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1805 return false;
1806
1807 /*
1808 * DAX device mappings require predictable access latency, so avoid
1809 * incurring periodic faults.
1810 */
1811 if (vma_is_dax(vma))
1812 return false;
1813
1814 if (is_vm_hugetlb_page(vma) &&
1815 !hugepage_migration_supported(hstate_vma(vma)))
1816 return false;
1817
1818 /*
1819 * Migration allocates pages in the highest zone. If we cannot
1820 * do so then migration (at least from node to node) is not
1821 * possible.
1822 */
1823 if (vma->vm_file &&
1824 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1825 < policy_zone)
1826 return false;
1827 return true;
1828 }
1829
1830 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1831 unsigned long addr, pgoff_t *ilx)
1832 {
1833 *ilx = 0;
1834 return (vma->vm_ops && vma->vm_ops->get_policy) ?
1835 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1836 }
1837
1838 /*
1839 * get_vma_policy(@vma, @addr, @order, @ilx)
1840 * @vma: virtual memory area whose policy is sought
1841 * @addr: address in @vma for shared policy lookup
1842 * @order: 0, or appropriate huge_page_order for interleaving
1843 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1844 * MPOL_WEIGHTED_INTERLEAVE
1845 *
1846 * Returns effective policy for a VMA at specified address.
1847 * Falls back to current->mempolicy or system default policy, as necessary.
1848 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1849 * count--added by the get_policy() vm_op, as appropriate--to protect against
1850 * freeing by another task. It is the caller's responsibility to free the
1851 * extra reference for shared policies.
1852 */
1853 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1854 unsigned long addr, int order, pgoff_t *ilx)
1855 {
1856 struct mempolicy *pol;
1857
1858 pol = __get_vma_policy(vma, addr, ilx);
1859 if (!pol)
1860 pol = get_task_policy(current);
1861 if (pol->mode == MPOL_INTERLEAVE ||
1862 pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1863 *ilx += vma->vm_pgoff >> order;
1864 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1865 }
1866 return pol;
1867 }
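/*
 * Worked example (illustrative): for an anonymous mapping with vm_pgoff == 0,
 * an order-0 fault at the third page of the VMA yields ilx = 0 + 2 = 2, so the
 * interleave index simply walks the pages of the mapping in order. For a
 * PMD-order fault the same arithmetic counts in PMD-sized steps instead.
 */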
1868
1869 bool vma_policy_mof(struct vm_area_struct *vma)
1870 {
1871 struct mempolicy *pol;
1872
1873 if (vma->vm_ops && vma->vm_ops->get_policy) {
1874 bool ret = false;
1875 pgoff_t ilx; /* ignored here */
1876
1877 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1878 if (pol && (pol->flags & MPOL_F_MOF))
1879 ret = true;
1880 mpol_cond_put(pol);
1881
1882 return ret;
1883 }
1884
1885 pol = vma->vm_policy;
1886 if (!pol)
1887 pol = get_task_policy(current);
1888
1889 return pol->flags & MPOL_F_MOF;
1890 }
1891
1892 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1893 {
1894 enum zone_type dynamic_policy_zone = policy_zone;
1895
1896 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1897
1898 /*
1899 * If policy->nodes has movable memory only,
1900 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1901 *
1902 * policy->nodes is intersected with node_states[N_MEMORY],
1903 * so if the following test fails, it implies that
1904 * policy->nodes has movable memory only.
1905 */
1906 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1907 dynamic_policy_zone = ZONE_MOVABLE;
1908
1909 return zone >= dynamic_policy_zone;
1910 }
1911
1912 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
1913 {
1914 unsigned int node;
1915 unsigned int cpuset_mems_cookie;
1916
1917 retry:
1918 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1919 cpuset_mems_cookie = read_mems_allowed_begin();
1920 node = current->il_prev;
1921 if (!current->il_weight || !node_isset(node, policy->nodes)) {
1922 node = next_node_in(node, policy->nodes);
1923 if (read_mems_allowed_retry(cpuset_mems_cookie))
1924 goto retry;
1925 if (node == MAX_NUMNODES)
1926 return node;
1927 current->il_prev = node;
1928 current->il_weight = get_il_weight(node);
1929 }
1930 current->il_weight--;
1931 return node;
1932 }
1933
1934 /* Do dynamic interleaving for a process */
1935 static unsigned int interleave_nodes(struct mempolicy *policy)
1936 {
1937 unsigned int nid;
1938 unsigned int cpuset_mems_cookie;
1939
1940 /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1941 do {
1942 cpuset_mems_cookie = read_mems_allowed_begin();
1943 nid = next_node_in(current->il_prev, policy->nodes);
1944 } while (read_mems_allowed_retry(cpuset_mems_cookie));
1945
1946 if (nid < MAX_NUMNODES)
1947 current->il_prev = nid;
1948 return nid;
1949 }
1950
1951 /*
1952 * Depending on the memory policy, provide a node from which to allocate the
1953 * next slab entry.
1954 */
1955 unsigned int mempolicy_slab_node(void)
1956 {
1957 struct mempolicy *policy;
1958 int node = numa_mem_id();
1959
1960 if (!in_task())
1961 return node;
1962
1963 policy = current->mempolicy;
1964 if (!policy)
1965 return node;
1966
1967 switch (policy->mode) {
1968 case MPOL_PREFERRED:
1969 return first_node(policy->nodes);
1970
1971 case MPOL_INTERLEAVE:
1972 return interleave_nodes(policy);
1973
1974 case MPOL_WEIGHTED_INTERLEAVE:
1975 return weighted_interleave_nodes(policy);
1976
1977 case MPOL_BIND:
1978 case MPOL_PREFERRED_MANY:
1979 {
1980 struct zoneref *z;
1981
1982 /*
1983 * Follow bind policy behavior and start allocation at the
1984 * first node.
1985 */
1986 struct zonelist *zonelist;
1987 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1988 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1989 z = first_zones_zonelist(zonelist, highest_zoneidx,
1990 &policy->nodes);
1991 return zonelist_zone(z) ? zonelist_node_idx(z) : node;
1992 }
1993 case MPOL_LOCAL:
1994 return node;
1995
1996 default:
1997 BUG();
1998 }
1999 }
2000
2001 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2002 nodemask_t *mask)
2003 {
2004 /*
2005 * barrier stabilizes the nodemask locally so that it can be iterated
2006 * over safely without concern for changes. Allocators validate node
2007 * selection does not violate mems_allowed, so this is safe.
2008 */
2009 barrier();
2010 memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2011 barrier();
2012 return nodes_weight(*mask);
2013 }
2014
2015 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2016 {
2017 nodemask_t nodemask;
2018 unsigned int target, nr_nodes;
2019 u8 *table;
2020 unsigned int weight_total = 0;
2021 u8 weight;
2022 int nid;
2023
2024 nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2025 if (!nr_nodes)
2026 return numa_node_id();
2027
2028 rcu_read_lock();
2029 table = rcu_dereference(iw_table);
2030 /* calculate the total weight */
2031 for_each_node_mask(nid, nodemask) {
2032 /* detect system default usage */
2033 weight = table ? table[nid] : 1;
2034 weight = weight ? weight : 1;
2035 weight_total += weight;
2036 }
2037
2038 /* Calculate the node offset based on totals */
2039 target = ilx % weight_total;
2040 nid = first_node(nodemask);
2041 while (target) {
2042 /* detect system default usage */
2043 weight = table ? table[nid] : 1;
2044 weight = weight ? weight : 1;
2045 if (target < weight)
2046 break;
2047 target -= weight;
2048 nid = next_node_in(nid, nodemask);
2049 }
2050 rcu_read_unlock();
2051 return nid;
2052 }
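/*
 * Worked example (illustrative): with nodemask {0,1} and weights {2,1},
 * weight_total is 3, so ilx values 0 and 1 map to node 0 (both targets fall
 * inside node 0's weight of 2), ilx 2 maps to node 1, and the pattern repeats
 * every 3 indices: two pages on node 0 for each page on node 1.
 */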
2053
2054 /*
2055 * Do static interleaving for interleave index @ilx. Returns the ilx'th
2056 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2057 * exceeds the number of present nodes.
2058 */
2059 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2060 {
2061 nodemask_t nodemask;
2062 unsigned int target, nnodes;
2063 int i;
2064 int nid;
2065
2066 nnodes = read_once_policy_nodemask(pol, &nodemask);
2067 if (!nnodes)
2068 return numa_node_id();
2069 target = ilx % nnodes;
2070 nid = first_node(nodemask);
2071 for (i = 0; i < target; i++)
2072 nid = next_node(nid, nodemask);
2073 return nid;
2074 }
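/*
 * Worked example (illustrative): with pol->nodes = {0,2,5}, ilx values 0, 1
 * and 2 map to nodes 0, 2 and 5 respectively, and ilx 3 wraps back to node 0
 * (3 % 3 == 0).
 */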
2075
2076 /*
2077 * Return a nodemask representing a mempolicy for filtering nodes for
2078 * page allocation, together with preferred node id (or the input node id).
2079 */
2080 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2081 pgoff_t ilx, int *nid)
2082 {
2083 nodemask_t *nodemask = NULL;
2084
2085 switch (pol->mode) {
2086 case MPOL_PREFERRED:
2087 /* Override input node id */
2088 *nid = first_node(pol->nodes);
2089 break;
2090 case MPOL_PREFERRED_MANY:
2091 nodemask = &pol->nodes;
2092 if (pol->home_node != NUMA_NO_NODE)
2093 *nid = pol->home_node;
2094 break;
2095 case MPOL_BIND:
2096 /* Restrict to nodemask (but not on lower zones) */
2097 if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2098 cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2099 nodemask = &pol->nodes;
2100 if (pol->home_node != NUMA_NO_NODE)
2101 *nid = pol->home_node;
2102 /*
2103 * __GFP_THISNODE shouldn't even be used with the bind policy
2104 * because we might easily break the expectation to stay on the
2105 * requested node while still honouring the policy.
2106 */
2107 WARN_ON_ONCE(gfp & __GFP_THISNODE);
2108 break;
2109 case MPOL_INTERLEAVE:
2110 /* Override input node id */
2111 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2112 interleave_nodes(pol) : interleave_nid(pol, ilx);
2113 break;
2114 case MPOL_WEIGHTED_INTERLEAVE:
2115 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2116 weighted_interleave_nodes(pol) :
2117 weighted_interleave_nid(pol, ilx);
2118 break;
2119 }
2120
2121 return nodemask;
2122 }
2123
2124 #ifdef CONFIG_HUGETLBFS
2125 /*
2126 * huge_node(@vma, @addr, @gfp_flags, @mpol)
2127 * @vma: virtual memory area whose policy is sought
2128 * @addr: address in @vma for shared policy lookup and interleave policy
2129 * @gfp_flags: for requested zone
2130 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2131 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2132 *
2133 * Returns a nid suitable for a huge page allocation and a pointer
2134 * to the struct mempolicy for conditional unref after allocation.
2135 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2136 * to the mempolicy's @nodemask for filtering the zonelist.
2137 */
2138 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2139 struct mempolicy **mpol, nodemask_t **nodemask)
2140 {
2141 pgoff_t ilx;
2142 int nid;
2143
2144 nid = numa_node_id();
2145 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2146 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2147 return nid;
2148 }
2149
2150 /*
2151 * init_nodemask_of_mempolicy
2152 *
2153 * If the current task's mempolicy is "default" [NULL], return 'false'
2154 * to indicate default policy. Otherwise, extract the policy nodemask
2155 * for 'bind' or 'interleave' policy into the argument nodemask, or
2156 * initialize the argument nodemask to contain the single node for
2157 * 'preferred' or 'local' policy and return 'true' to indicate presence
2158 * of non-default mempolicy.
2159 *
2160 * We don't bother with reference counting the mempolicy [mpol_get/put]
2161 * because the current task is examining its own mempolicy and a task's
2162 * mempolicy is only ever changed by the task itself.
2163 *
2164 * N.B., it is the caller's responsibility to free a returned nodemask.
2165 */
2166 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2167 {
2168 struct mempolicy *mempolicy;
2169
2170 if (!(mask && current->mempolicy))
2171 return false;
2172
2173 task_lock(current);
2174 mempolicy = current->mempolicy;
2175 switch (mempolicy->mode) {
2176 case MPOL_PREFERRED:
2177 case MPOL_PREFERRED_MANY:
2178 case MPOL_BIND:
2179 case MPOL_INTERLEAVE:
2180 case MPOL_WEIGHTED_INTERLEAVE:
2181 *mask = mempolicy->nodes;
2182 break;
2183
2184 case MPOL_LOCAL:
2185 init_nodemask_of_node(mask, numa_node_id());
2186 break;
2187
2188 default:
2189 BUG();
2190 }
2191 task_unlock(current);
2192
2193 return true;
2194 }
2195 #endif
2196
2197 /*
2198 * mempolicy_in_oom_domain
2199 *
2200 * If tsk's mempolicy is "bind", check for intersection between mask and
2201 * the policy nodemask. Otherwise, return true for all other policies
2202 * including "interleave", as a tsk with "interleave" policy may have
2203 * memory allocated from all nodes in the system.
2204 *
2205 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2206 */
2207 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2208 const nodemask_t *mask)
2209 {
2210 struct mempolicy *mempolicy;
2211 bool ret = true;
2212
2213 if (!mask)
2214 return ret;
2215
2216 task_lock(tsk);
2217 mempolicy = tsk->mempolicy;
2218 if (mempolicy && mempolicy->mode == MPOL_BIND)
2219 ret = nodes_intersects(mempolicy->nodes, *mask);
2220 task_unlock(tsk);
2221
2222 return ret;
2223 }
2224
2225 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2226 int nid, nodemask_t *nodemask)
2227 {
2228 struct page *page;
2229 gfp_t preferred_gfp;
2230
2231 /*
2232 * This is a two-pass approach. The first pass only tries the
2233 * preferred nodes, but skips direct reclaim and allows the
2234 * allocation to fail, while the second pass tries all the
2235 * nodes in the system.
2236 */
2237 preferred_gfp = gfp | __GFP_NOWARN;
2238 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2239 page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2240 if (!page)
2241 page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2242
2243 return page;
2244 }
2245
2246 /**
2247 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2248 * @gfp: GFP flags.
2249 * @order: Order of the page allocation.
2250 * @pol: Pointer to the NUMA mempolicy.
2251 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2252 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2253 *
2254 * Return: The page on success or NULL if allocation fails.
2255 */
2256 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2257 struct mempolicy *pol, pgoff_t ilx, int nid)
2258 {
2259 nodemask_t *nodemask;
2260 struct page *page;
2261
2262 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2263
2264 if (pol->mode == MPOL_PREFERRED_MANY)
2265 return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2266
2267 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2268 /* filter "hugepage" allocation, unless from alloc_pages() */
2269 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2270 /*
2271 * For hugepage allocation and non-interleave policy which
2272 * allows the current node (or other explicitly preferred
2273 * node) we only try to allocate from the current/preferred
2274 * node and don't fall back to other nodes, as the cost of
2275 * remote accesses would likely offset THP benefits.
2276 *
2277 * If the policy is interleave or does not allow the current
2278 * node in its nodemask, we allocate the standard way.
2279 */
2280 if (pol->mode != MPOL_INTERLEAVE &&
2281 pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2282 (!nodemask || node_isset(nid, *nodemask))) {
2283 /*
2284 * First, try to allocate THP only on local node, but
2285 * don't reclaim unnecessarily, just compact.
2286 */
2287 page = __alloc_frozen_pages_noprof(
2288 gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2289 nid, NULL);
2290 if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2291 return page;
2292 /*
2293 * If hugepage allocations are configured to always
2294 * synchronous compact or the vma has been madvised
2295 * to prefer hugepage backing, retry allowing remote
2296 * memory with both reclaim and compact as well.
2297 */
2298 }
2299 }
2300
2301 page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2302
2303 if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2304 pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2305 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2306 if (static_branch_likely(&vm_numa_stat_key) &&
2307 page_to_nid(page) == nid) {
2308 preempt_disable();
2309 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2310 preempt_enable();
2311 }
2312 }
2313
2314 return page;
2315 }
2316
2317 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2318 struct mempolicy *pol, pgoff_t ilx, int nid)
2319 {
2320 struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2321 ilx, nid);
2322 if (!page)
2323 return NULL;
2324
2325 set_page_refcounted(page);
2326 return page_rmappable_folio(page);
2327 }
2328
2329 /**
2330 * vma_alloc_folio - Allocate a folio for a VMA.
2331 * @gfp: GFP flags.
2332 * @order: Order of the folio.
2333 * @vma: Pointer to VMA.
2334 * @addr: Virtual address of the allocation. Must be inside @vma.
2335 *
2336 * Allocate a folio for a specific address in @vma, using the appropriate
2337 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
2338 * VMA to prevent it from going away. Should be used for all allocations
2339 * for folios that will be mapped into user space, excepting hugetlbfs, and
2340 * excepting where direct use of folio_alloc_mpol() is more appropriate.
2341 *
2342 * Return: The folio on success or NULL if allocation fails.
2343 */
2344 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2345 unsigned long addr)
2346 {
2347 struct mempolicy *pol;
2348 pgoff_t ilx;
2349 struct folio *folio;
2350
2351 if (vma->vm_flags & VM_DROPPABLE)
2352 gfp |= __GFP_NOWARN;
2353
2354 pol = get_vma_policy(vma, addr, order, &ilx);
2355 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2356 mpol_cond_put(pol);
2357 return folio;
2358 }
2359 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2360
2361 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2362 {
2363 struct mempolicy *pol = &default_policy;
2364
2365 /*
2366 * No reference counting needed for current->mempolicy
2367 * nor system default_policy
2368 */
2369 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2370 pol = get_task_policy(current);
2371
2372 return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2373 numa_node_id());
2374 }
2375
2376 /**
2377 * alloc_pages - Allocate pages.
2378 * @gfp: GFP flags.
2379 * @order: Power of two of number of pages to allocate.
2380 *
2381 * Allocate 1 << @order contiguous pages. The physical address of the
2382 * first page is naturally aligned (eg an order-3 allocation will be aligned
2383 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2384 * process is honoured when in process context.
2385 *
2386 * Context: Can be called from any context, providing the appropriate GFP
2387 * flags are used.
2388 * Return: The page on success or NULL if allocation fails.
2389 */
2390 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2391 {
2392 struct page *page = alloc_frozen_pages_noprof(gfp, order);
2393
2394 if (page)
2395 set_page_refcounted(page);
2396 return page;
2397 }
2398 EXPORT_SYMBOL(alloc_pages_noprof);
2399
2400 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2401 {
2402 return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2403 }
2404 EXPORT_SYMBOL(folio_alloc_noprof);
2405
2406 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2407 struct mempolicy *pol, unsigned long nr_pages,
2408 struct page **page_array)
2409 {
2410 int nodes;
2411 unsigned long nr_pages_per_node;
2412 int delta;
2413 int i;
2414 unsigned long nr_allocated;
2415 unsigned long total_allocated = 0;
2416
2417 nodes = nodes_weight(pol->nodes);
2418 nr_pages_per_node = nr_pages / nodes;
2419 delta = nr_pages - nodes * nr_pages_per_node;
2420
2421 for (i = 0; i < nodes; i++) {
2422 if (delta) {
2423 nr_allocated = alloc_pages_bulk_noprof(gfp,
2424 interleave_nodes(pol), NULL,
2425 nr_pages_per_node + 1,
2426 page_array);
2427 delta--;
2428 } else {
2429 nr_allocated = alloc_pages_bulk_noprof(gfp,
2430 interleave_nodes(pol), NULL,
2431 nr_pages_per_node, page_array);
2432 }
2433
2434 page_array += nr_allocated;
2435 total_allocated += nr_allocated;
2436 }
2437
2438 return total_allocated;
2439 }
2440
2441 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2442 struct mempolicy *pol, unsigned long nr_pages,
2443 struct page **page_array)
2444 {
2445 struct task_struct *me = current;
2446 unsigned int cpuset_mems_cookie;
2447 unsigned long total_allocated = 0;
2448 unsigned long nr_allocated = 0;
2449 unsigned long rounds;
2450 unsigned long node_pages, delta;
2451 u8 *table, *weights, weight;
2452 unsigned int weight_total = 0;
2453 unsigned long rem_pages = nr_pages;
2454 nodemask_t nodes;
2455 int nnodes, node;
2456 int resume_node = MAX_NUMNODES - 1;
2457 u8 resume_weight = 0;
2458 int prev_node;
2459 int i;
2460
2461 if (!nr_pages)
2462 return 0;
2463
2464 /* read the nodes onto the stack, retry if done during rebind */
2465 do {
2466 cpuset_mems_cookie = read_mems_allowed_begin();
2467 nnodes = read_once_policy_nodemask(pol, &nodes);
2468 } while (read_mems_allowed_retry(cpuset_mems_cookie));
2469
2470 /* if the nodemask has become invalid, we cannot do anything */
2471 if (!nnodes)
2472 return 0;
2473
2474 /* Continue allocating from most recent node and adjust the nr_pages */
2475 node = me->il_prev;
2476 weight = me->il_weight;
2477 if (weight && node_isset(node, nodes)) {
2478 node_pages = min(rem_pages, weight);
2479 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2480 page_array);
2481 page_array += nr_allocated;
2482 total_allocated += nr_allocated;
2483 /* if that's all the pages, no need to interleave */
2484 if (rem_pages <= weight) {
2485 me->il_weight -= rem_pages;
2486 return total_allocated;
2487 }
2488 /* Otherwise we adjust remaining pages, continue from there */
2489 rem_pages -= weight;
2490 }
2491 /* clear active weight in case of an allocation failure */
2492 me->il_weight = 0;
2493 prev_node = node;
2494
2495 /* create a local copy of node weights to operate on outside rcu */
2496 weights = kzalloc(nr_node_ids, GFP_KERNEL);
2497 if (!weights)
2498 return total_allocated;
2499
2500 rcu_read_lock();
2501 table = rcu_dereference(iw_table);
2502 if (table)
2503 memcpy(weights, table, nr_node_ids);
2504 rcu_read_unlock();
2505
2506 /* calculate total, detect system default usage */
2507 for_each_node_mask(node, nodes) {
2508 if (!weights[node])
2509 weights[node] = 1;
2510 weight_total += weights[node];
2511 }
2512
2513 /*
2514 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2515 * Track which node weighted interleave should resume from.
2516 *
2517 * if (rounds > 0) and (delta == 0), resume_node will always be
2518 * the node following prev_node and its weight.
2519 */
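	/*
	 * Worked example (illustrative, assuming the round-robin walk starts
	 * at node 0): nodes {0,1} with weights {3,1} (weight_total == 4) and
	 * rem_pages == 10 give rounds == 2 and delta == 2; node 0 then
	 * receives 3*2 + 2 = 8 pages, node 1 receives 1*2 = 2, and the next
	 * weighted-interleave allocation resumes from node 0 with a remaining
	 * weight of 1.
	 */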
2520 rounds = rem_pages / weight_total;
2521 delta = rem_pages % weight_total;
2522 resume_node = next_node_in(prev_node, nodes);
2523 resume_weight = weights[resume_node];
2524 for (i = 0; i < nnodes; i++) {
2525 node = next_node_in(prev_node, nodes);
2526 weight = weights[node];
2527 node_pages = weight * rounds;
2528 /* If a delta exists, add this node's portion of the delta */
2529 if (delta > weight) {
2530 node_pages += weight;
2531 delta -= weight;
2532 } else if (delta) {
2533 /* when delta is depleted, resume from that node */
2534 node_pages += delta;
2535 resume_node = node;
2536 resume_weight = weight - delta;
2537 delta = 0;
2538 }
2539 /* node_pages can be 0 if an allocation fails and rounds == 0 */
2540 if (!node_pages)
2541 break;
2542 nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2543 page_array);
2544 page_array += nr_allocated;
2545 total_allocated += nr_allocated;
2546 if (total_allocated == nr_pages)
2547 break;
2548 prev_node = node;
2549 }
2550 me->il_prev = resume_node;
2551 me->il_weight = resume_weight;
2552 kfree(weights);
2553 return total_allocated;
2554 }
2555
2556 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2557 struct mempolicy *pol, unsigned long nr_pages,
2558 struct page **page_array)
2559 {
2560 gfp_t preferred_gfp;
2561 unsigned long nr_allocated = 0;
2562
2563 preferred_gfp = gfp | __GFP_NOWARN;
2564 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2565
2566 nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2567 nr_pages, page_array);
2568
2569 if (nr_allocated < nr_pages)
2570 nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2571 nr_pages - nr_allocated,
2572 page_array + nr_allocated);
2573 return nr_allocated;
2574 }
2575
2576 /* Bulk page allocation and the mempolicy should be considered at the
2577 * same time in some situations, such as vmalloc.
2578 *
2579 * Doing so can accelerate memory allocation, especially for
2580 * interleaved allocations.
2581 */
2582 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2583 unsigned long nr_pages, struct page **page_array)
2584 {
2585 struct mempolicy *pol = &default_policy;
2586 nodemask_t *nodemask;
2587 int nid;
2588
2589 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2590 pol = get_task_policy(current);
2591
2592 if (pol->mode == MPOL_INTERLEAVE)
2593 return alloc_pages_bulk_interleave(gfp, pol,
2594 nr_pages, page_array);
2595
2596 if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2597 return alloc_pages_bulk_weighted_interleave(
2598 gfp, pol, nr_pages, page_array);
2599
2600 if (pol->mode == MPOL_PREFERRED_MANY)
2601 return alloc_pages_bulk_preferred_many(gfp,
2602 numa_node_id(), pol, nr_pages, page_array);
2603
2604 nid = numa_node_id();
2605 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2606 return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2607 nr_pages, page_array);
2608 }
2609
2610 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2611 {
2612 struct mempolicy *pol = mpol_dup(src->vm_policy);
2613
2614 if (IS_ERR(pol))
2615 return PTR_ERR(pol);
2616 dst->vm_policy = pol;
2617 return 0;
2618 }
2619
2620 /*
2621 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2622 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
2623 * with the mems_allowed returned by cpuset_mems_allowed(). This
2624 * keeps mempolicies cpuset relative after its cpuset moves. See
2625 * further kernel/cpuset.c update_nodemask().
2626 *
2627 * current's mempolicy may be rebound by another task (the task that changes
2628 * the cpuset's mems), so we needn't do the rebind work for the current task.
2629 */
2630
2631 /* Slow path of a mempolicy duplicate */
2632 struct mempolicy *__mpol_dup(struct mempolicy *old)
2633 {
2634 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2635
2636 if (!new)
2637 return ERR_PTR(-ENOMEM);
2638
2639 /* task's mempolicy is protected by alloc_lock */
2640 if (old == current->mempolicy) {
2641 task_lock(current);
2642 *new = *old;
2643 task_unlock(current);
2644 } else
2645 *new = *old;
2646
2647 if (current_cpuset_is_being_rebound()) {
2648 nodemask_t mems = cpuset_mems_allowed(current);
2649 mpol_rebind_policy(new, &mems);
2650 }
2651 atomic_set(&new->refcnt, 1);
2652 return new;
2653 }
2654
2655 /* Slow path of a mempolicy comparison */
2656 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2657 {
2658 if (!a || !b)
2659 return false;
2660 if (a->mode != b->mode)
2661 return false;
2662 if (a->flags != b->flags)
2663 return false;
2664 if (a->home_node != b->home_node)
2665 return false;
2666 if (mpol_store_user_nodemask(a))
2667 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2668 return false;
2669
2670 switch (a->mode) {
2671 case MPOL_BIND:
2672 case MPOL_INTERLEAVE:
2673 case MPOL_PREFERRED:
2674 case MPOL_PREFERRED_MANY:
2675 case MPOL_WEIGHTED_INTERLEAVE:
2676 return !!nodes_equal(a->nodes, b->nodes);
2677 case MPOL_LOCAL:
2678 return true;
2679 default:
2680 BUG();
2681 return false;
2682 }
2683 }
2684
2685 /*
2686 * Shared memory backing store policy support.
2687 *
2688 * Remember policies even when nobody has shared memory mapped.
2689 * The policies are kept in Red-Black tree linked from the inode.
2690 * They are protected by the sp->lock rwlock, which should be held
2691 * for any accesses to the tree.
2692 */
2693
2694 /*
2695 * Look up the first element intersecting start-end. Caller holds sp->lock for
2696 * reading or for writing
2697 */
2698 static struct sp_node *sp_lookup(struct shared_policy *sp,
2699 pgoff_t start, pgoff_t end)
2700 {
2701 struct rb_node *n = sp->root.rb_node;
2702
2703 while (n) {
2704 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2705
2706 if (start >= p->end)
2707 n = n->rb_right;
2708 else if (end <= p->start)
2709 n = n->rb_left;
2710 else
2711 break;
2712 }
2713 if (!n)
2714 return NULL;
2715 for (;;) {
2716 struct sp_node *w = NULL;
2717 struct rb_node *prev = rb_prev(n);
2718 if (!prev)
2719 break;
2720 w = rb_entry(prev, struct sp_node, nd);
2721 if (w->end <= start)
2722 break;
2723 n = prev;
2724 }
2725 return rb_entry(n, struct sp_node, nd);
2726 }
2727
2728 /*
2729 * Insert a new shared policy into the list. Caller holds sp->lock for
2730 * writing.
2731 */
2732 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2733 {
2734 struct rb_node **p = &sp->root.rb_node;
2735 struct rb_node *parent = NULL;
2736 struct sp_node *nd;
2737
2738 while (*p) {
2739 parent = *p;
2740 nd = rb_entry(parent, struct sp_node, nd);
2741 if (new->start < nd->start)
2742 p = &(*p)->rb_left;
2743 else if (new->end > nd->end)
2744 p = &(*p)->rb_right;
2745 else
2746 BUG();
2747 }
2748 rb_link_node(&new->nd, parent, p);
2749 rb_insert_color(&new->nd, &sp->root);
2750 }
2751
2752 /* Find shared policy intersecting idx */
2753 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2754 pgoff_t idx)
2755 {
2756 struct mempolicy *pol = NULL;
2757 struct sp_node *sn;
2758
2759 if (!sp->root.rb_node)
2760 return NULL;
2761 read_lock(&sp->lock);
2762 sn = sp_lookup(sp, idx, idx+1);
2763 if (sn) {
2764 mpol_get(sn->policy);
2765 pol = sn->policy;
2766 }
2767 read_unlock(&sp->lock);
2768 return pol;
2769 }
2770
2771 static void sp_free(struct sp_node *n)
2772 {
2773 mpol_put(n->policy);
2774 kmem_cache_free(sn_cache, n);
2775 }
2776
2777 /**
2778 * mpol_misplaced - check whether current folio node is valid in policy
2779 *
2780 * @folio: folio to be checked
2781 * @vmf: structure describing the fault
2782 * @addr: virtual address in @vma for shared policy lookup and interleave policy
2783 *
2784 * Look up the current policy node id for vma,addr and "compare to" the folio's
2785 * node id. Policy determination "mimics" alloc_page_vma().
2786 * Called from fault path where we know the vma and faulting address.
2787 *
2788 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2789 * policy, or a suitable node ID to allocate a replacement folio from.
2790 */
2791 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2792 unsigned long addr)
2793 {
2794 struct mempolicy *pol;
2795 pgoff_t ilx;
2796 struct zoneref *z;
2797 int curnid = folio_nid(folio);
2798 struct vm_area_struct *vma = vmf->vma;
2799 int thiscpu = raw_smp_processor_id();
2800 int thisnid = numa_node_id();
2801 int polnid = NUMA_NO_NODE;
2802 int ret = NUMA_NO_NODE;
2803
2804 /*
2805 * Make sure ptl is held so that we don't preempt and we
2806 * have a stable smp processor id
2807 */
2808 lockdep_assert_held(vmf->ptl);
2809 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2810 if (!(pol->flags & MPOL_F_MOF))
2811 goto out;
2812
2813 switch (pol->mode) {
2814 case MPOL_INTERLEAVE:
2815 polnid = interleave_nid(pol, ilx);
2816 break;
2817
2818 case MPOL_WEIGHTED_INTERLEAVE:
2819 polnid = weighted_interleave_nid(pol, ilx);
2820 break;
2821
2822 case MPOL_PREFERRED:
2823 if (node_isset(curnid, pol->nodes))
2824 goto out;
2825 polnid = first_node(pol->nodes);
2826 break;
2827
2828 case MPOL_LOCAL:
2829 polnid = numa_node_id();
2830 break;
2831
2832 case MPOL_BIND:
2833 case MPOL_PREFERRED_MANY:
2834 /*
2835 * Even though MPOL_PREFERRED_MANY can allocate pages outside the
2836 * policy nodemask, we don't allow NUMA migration to nodes outside
2837 * the policy nodemask for now. This is done so that if we want
2838 * demotion to slow memory to happen before allocating from some
2839 * DRAM node, say 'x', we will end up using an MPOL_PREFERRED_MANY
2840 * mask excluding node 'x'. In such a scenario we should not
2841 * promote to node 'x' from a slow memory node.
2842 */
2843 if (pol->flags & MPOL_F_MORON) {
2844 /*
2845 * Optimize placement among multiple nodes
2846 * via NUMA balancing
2847 */
2848 if (node_isset(thisnid, pol->nodes))
2849 break;
2850 goto out;
2851 }
2852
2853 /*
2854 * use current page if in policy nodemask,
2855 * else select nearest allowed node, if any.
2856 * If no allowed nodes, use current [!misplaced].
2857 */
2858 if (node_isset(curnid, pol->nodes))
2859 goto out;
2860 z = first_zones_zonelist(
2861 node_zonelist(thisnid, GFP_HIGHUSER),
2862 gfp_zone(GFP_HIGHUSER),
2863 &pol->nodes);
2864 polnid = zonelist_node_idx(z);
2865 break;
2866
2867 default:
2868 BUG();
2869 }
2870
2871 /* Migrate the folio towards the node whose CPU is referencing it */
2872 if (pol->flags & MPOL_F_MORON) {
2873 polnid = thisnid;
2874
2875 if (!should_numa_migrate_memory(current, folio, curnid,
2876 thiscpu))
2877 goto out;
2878 }
2879
2880 if (curnid != polnid)
2881 ret = polnid;
2882 out:
2883 mpol_cond_put(pol);
2884
2885 return ret;
2886 }
2887
2888 /*
2889 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2890 * dropped after task->mempolicy is set to NULL so that any allocation done as
2891 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2892 * policy.
2893 */
2894 void mpol_put_task_policy(struct task_struct *task)
2895 {
2896 struct mempolicy *pol;
2897
2898 task_lock(task);
2899 pol = task->mempolicy;
2900 task->mempolicy = NULL;
2901 task_unlock(task);
2902 mpol_put(pol);
2903 }
2904
2905 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2906 {
2907 rb_erase(&n->nd, &sp->root);
2908 sp_free(n);
2909 }
2910
2911 static void sp_node_init(struct sp_node *node, unsigned long start,
2912 unsigned long end, struct mempolicy *pol)
2913 {
2914 node->start = start;
2915 node->end = end;
2916 node->policy = pol;
2917 }
2918
2919 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2920 struct mempolicy *pol)
2921 {
2922 struct sp_node *n;
2923 struct mempolicy *newpol;
2924
2925 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2926 if (!n)
2927 return NULL;
2928
2929 newpol = mpol_dup(pol);
2930 if (IS_ERR(newpol)) {
2931 kmem_cache_free(sn_cache, n);
2932 return NULL;
2933 }
2934 newpol->flags |= MPOL_F_SHARED;
2935 sp_node_init(n, start, end, newpol);
2936
2937 return n;
2938 }
2939
2940 /* Replace a policy range. */
2941 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
2942 pgoff_t end, struct sp_node *new)
2943 {
2944 struct sp_node *n;
2945 struct sp_node *n_new = NULL;
2946 struct mempolicy *mpol_new = NULL;
2947 int ret = 0;
2948
2949 restart:
2950 write_lock(&sp->lock);
2951 n = sp_lookup(sp, start, end);
2952 /* Take care of old policies in the same range. */
2953 while (n && n->start < end) {
2954 struct rb_node *next = rb_next(&n->nd);
2955 if (n->start >= start) {
2956 if (n->end <= end)
2957 sp_delete(sp, n);
2958 else
2959 n->start = end;
2960 } else {
2961 /* Old policy spanning whole new range. */
2962 if (n->end > end) {
2963 if (!n_new)
2964 goto alloc_new;
2965
2966 *mpol_new = *n->policy;
2967 atomic_set(&mpol_new->refcnt, 1);
2968 sp_node_init(n_new, end, n->end, mpol_new);
2969 n->end = start;
2970 sp_insert(sp, n_new);
2971 n_new = NULL;
2972 mpol_new = NULL;
2973 break;
2974 } else
2975 n->end = start;
2976 }
2977 if (!next)
2978 break;
2979 n = rb_entry(next, struct sp_node, nd);
2980 }
2981 if (new)
2982 sp_insert(sp, new);
2983 write_unlock(&sp->lock);
2984 ret = 0;
2985
2986 err_out:
2987 if (mpol_new)
2988 mpol_put(mpol_new);
2989 if (n_new)
2990 kmem_cache_free(sn_cache, n_new);
2991
2992 return ret;
2993
2994 alloc_new:
2995 write_unlock(&sp->lock);
2996 ret = -ENOMEM;
2997 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2998 if (!n_new)
2999 goto err_out;
3000 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3001 if (!mpol_new)
3002 goto err_out;
3003 atomic_set(&mpol_new->refcnt, 1);
3004 goto restart;
3005 }
3006
3007 /**
3008 * mpol_shared_policy_init - initialize shared policy for inode
3009 * @sp: pointer to inode shared policy
3010 * @mpol: struct mempolicy to install
3011 *
3012 * Install non-NULL @mpol in inode's shared policy rb-tree.
3013 * On entry, the current task has a reference on a non-NULL @mpol.
3014 * This must be released on exit.
3015 * This is called from get_inode() calls, so we can use GFP_KERNEL.
3016 */
3017 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3018 {
3019 int ret;
3020
3021 sp->root = RB_ROOT; /* empty tree == default mempolicy */
3022 rwlock_init(&sp->lock);
3023
3024 if (mpol) {
3025 struct sp_node *sn;
3026 struct mempolicy *npol;
3027 NODEMASK_SCRATCH(scratch);
3028
3029 if (!scratch)
3030 goto put_mpol;
3031
3032 /* contextualize the tmpfs mount point mempolicy to this file */
3033 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3034 if (IS_ERR(npol))
3035 goto free_scratch; /* no valid nodemask intersection */
3036
3037 task_lock(current);
3038 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3039 task_unlock(current);
3040 if (ret)
3041 goto put_npol;
3042
3043 /* alloc node covering entire file; adds ref to file's npol */
3044 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3045 if (sn)
3046 sp_insert(sp, sn);
3047 put_npol:
3048 mpol_put(npol); /* drop initial ref on file's npol */
3049 free_scratch:
3050 NODEMASK_SCRATCH_FREE(scratch);
3051 put_mpol:
3052 mpol_put(mpol); /* drop our incoming ref on sb mpol */
3053 }
3054 }
3055
3056 int mpol_set_shared_policy(struct shared_policy *sp,
3057 struct vm_area_struct *vma, struct mempolicy *pol)
3058 {
3059 int err;
3060 struct sp_node *new = NULL;
3061 unsigned long sz = vma_pages(vma);
3062
3063 if (pol) {
3064 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3065 if (!new)
3066 return -ENOMEM;
3067 }
3068 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3069 if (err && new)
3070 sp_free(new);
3071 return err;
3072 }
3073
3074 /* Free a backing policy store on inode delete. */
3075 void mpol_free_shared_policy(struct shared_policy *sp)
3076 {
3077 struct sp_node *n;
3078 struct rb_node *next;
3079
3080 if (!sp->root.rb_node)
3081 return;
3082 write_lock(&sp->lock);
3083 next = rb_first(&sp->root);
3084 while (next) {
3085 n = rb_entry(next, struct sp_node, nd);
3086 next = rb_next(&n->nd);
3087 sp_delete(sp, n);
3088 }
3089 write_unlock(&sp->lock);
3090 }
3091
3092 #ifdef CONFIG_NUMA_BALANCING
3093 static int __initdata numabalancing_override;
3094
3095 static void __init check_numabalancing_enable(void)
3096 {
3097 bool numabalancing_default = false;
3098
3099 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3100 numabalancing_default = true;
3101
3102 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3103 if (numabalancing_override)
3104 set_numabalancing_state(numabalancing_override == 1);
3105
3106 if (num_online_nodes() > 1 && !numabalancing_override) {
3107 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3108 numabalancing_default ? "Enabling" : "Disabling");
3109 set_numabalancing_state(numabalancing_default);
3110 }
3111 }
3112
3113 static int __init setup_numabalancing(char *str)
3114 {
3115 int ret = 0;
3116 if (!str)
3117 goto out;
3118
3119 if (!strcmp(str, "enable")) {
3120 numabalancing_override = 1;
3121 ret = 1;
3122 } else if (!strcmp(str, "disable")) {
3123 numabalancing_override = -1;
3124 ret = 1;
3125 }
3126 out:
3127 if (!ret)
3128 pr_warn("Unable to parse numa_balancing=\n");
3129
3130 return ret;
3131 }
3132 __setup("numa_balancing=", setup_numabalancing);
3133 #else
3134 static inline void __init check_numabalancing_enable(void)
3135 {
3136 }
3137 #endif /* CONFIG_NUMA_BALANCING */
3138
3139 void __init numa_policy_init(void)
3140 {
3141 nodemask_t interleave_nodes;
3142 unsigned long largest = 0;
3143 int nid, prefer = 0;
3144
3145 policy_cache = kmem_cache_create("numa_policy",
3146 sizeof(struct mempolicy),
3147 0, SLAB_PANIC, NULL);
3148
3149 sn_cache = kmem_cache_create("shared_policy_node",
3150 sizeof(struct sp_node),
3151 0, SLAB_PANIC, NULL);
3152
3153 for_each_node(nid) {
3154 preferred_node_policy[nid] = (struct mempolicy) {
3155 .refcnt = ATOMIC_INIT(1),
3156 .mode = MPOL_PREFERRED,
3157 .flags = MPOL_F_MOF | MPOL_F_MORON,
3158 .nodes = nodemask_of_node(nid),
3159 };
3160 }
3161
3162 /*
3163 * Set interleaving policy for system init. Interleaving is only
3164 * enabled across suitably sized nodes (default is >= 16MB); we
3165 * fall back to the largest node if they're all smaller.
3166 */
3167 nodes_clear(interleave_nodes);
3168 for_each_node_state(nid, N_MEMORY) {
3169 unsigned long total_pages = node_present_pages(nid);
3170
3171 /* Preserve the largest node */
3172 if (largest < total_pages) {
3173 largest = total_pages;
3174 prefer = nid;
3175 }
3176
3177 /* Interleave this node? */
3178 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3179 node_set(nid, interleave_nodes);
3180 }
3181
3182 /* All too small, use the largest */
3183 if (unlikely(nodes_empty(interleave_nodes)))
3184 node_set(prefer, interleave_nodes);
3185
3186 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3187 pr_err("%s: interleaving failed\n", __func__);
3188
3189 check_numabalancing_enable();
3190 }
3191
3192 /* Reset policy of current process to default */
3193 void numa_default_policy(void)
3194 {
3195 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3196 }
3197
3198 /*
3199 * Parse and format mempolicy from/to strings
3200 */
3201 static const char * const policy_modes[] =
3202 {
3203 [MPOL_DEFAULT] = "default",
3204 [MPOL_PREFERRED] = "prefer",
3205 [MPOL_BIND] = "bind",
3206 [MPOL_INTERLEAVE] = "interleave",
3207 [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3208 [MPOL_LOCAL] = "local",
3209 [MPOL_PREFERRED_MANY] = "prefer (many)",
3210 };
3211
3212 #ifdef CONFIG_TMPFS
3213 /**
3214 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3215 * @str: string containing mempolicy to parse
3216 * @mpol: pointer to struct mempolicy pointer, returned on success.
3217 *
3218 * Format of input:
3219 * <mode>[=<flags>][:<nodelist>]
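 *
 * Illustrative examples of accepted strings (assuming the listed nodes are
 * online with memory): "interleave:0-3", "bind=static:0,2" and "prefer:1".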
3220 *
3221 * Return: %0 on success, else %1
3222 */
3223 int mpol_parse_str(char *str, struct mempolicy **mpol)
3224 {
3225 struct mempolicy *new = NULL;
3226 unsigned short mode_flags;
3227 nodemask_t nodes;
3228 char *nodelist = strchr(str, ':');
3229 char *flags = strchr(str, '=');
3230 int err = 1, mode;
3231
3232 if (flags)
3233 *flags++ = '\0'; /* terminate mode string */
3234
3235 if (nodelist) {
3236 /* NUL-terminate mode or flags string */
3237 *nodelist++ = '\0';
3238 if (nodelist_parse(nodelist, nodes))
3239 goto out;
3240 if (!nodes_subset(nodes, node_states[N_MEMORY]))
3241 goto out;
3242 } else
3243 nodes_clear(nodes);
3244
3245 mode = match_string(policy_modes, MPOL_MAX, str);
3246 if (mode < 0)
3247 goto out;
3248
3249 switch (mode) {
3250 case MPOL_PREFERRED:
3251 /*
3252 * Insist on a nodelist of one node only, although later
3253 * we use first_node(nodes) to grab a single node, so here
3254 * nodelist (or nodes) cannot be empty.
3255 */
3256 if (nodelist) {
3257 char *rest = nodelist;
3258 while (isdigit(*rest))
3259 rest++;
3260 if (*rest)
3261 goto out;
3262 if (nodes_empty(nodes))
3263 goto out;
3264 }
3265 break;
3266 case MPOL_INTERLEAVE:
3267 case MPOL_WEIGHTED_INTERLEAVE:
3268 /*
3269 * Default to online nodes with memory if no nodelist
3270 */
3271 if (!nodelist)
3272 nodes = node_states[N_MEMORY];
3273 break;
3274 case MPOL_LOCAL:
3275 /*
3276 * Don't allow a nodelist; mpol_new() checks flags
3277 */
3278 if (nodelist)
3279 goto out;
3280 break;
3281 case MPOL_DEFAULT:
3282 /*
3283 * Insist on an empty nodelist
3284 */
3285 if (!nodelist)
3286 err = 0;
3287 goto out;
3288 case MPOL_PREFERRED_MANY:
3289 case MPOL_BIND:
3290 /*
3291 * Insist on a nodelist
3292 */
3293 if (!nodelist)
3294 goto out;
3295 }
3296
3297 mode_flags = 0;
3298 if (flags) {
3299 /*
3300 * Currently, we only support two mutually exclusive
3301 * mode flags.
3302 */
3303 if (!strcmp(flags, "static"))
3304 mode_flags |= MPOL_F_STATIC_NODES;
3305 else if (!strcmp(flags, "relative"))
3306 mode_flags |= MPOL_F_RELATIVE_NODES;
3307 else
3308 goto out;
3309 }
3310
3311 new = mpol_new(mode, mode_flags, &nodes);
3312 if (IS_ERR(new))
3313 goto out;
3314
3315 /*
3316 * Save nodes for mpol_to_str() to show the tmpfs mount options
3317 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3318 */
3319 if (mode != MPOL_PREFERRED) {
3320 new->nodes = nodes;
3321 } else if (nodelist) {
3322 nodes_clear(new->nodes);
3323 node_set(first_node(nodes), new->nodes);
3324 } else {
3325 new->mode = MPOL_LOCAL;
3326 }
3327
3328 /*
3329 * Save nodes for contextualization: this will be used to "clone"
3330 * the mempolicy in a specific context [cpuset] at a later time.
3331 */
3332 new->w.user_nodemask = nodes;
3333
3334 err = 0;
3335
3336 out:
3337 /* Restore string for error message */
3338 if (nodelist)
3339 *--nodelist = ':';
3340 if (flags)
3341 *--flags = '=';
3342 if (!err)
3343 *mpol = new;
3344 return err;
3345 }
3346 #endif /* CONFIG_TMPFS */
3347
3348 /**
3349 * mpol_to_str - format a mempolicy structure for printing
3350 * @buffer: to contain formatted mempolicy string
3351 * @maxlen: length of @buffer
3352 * @pol: pointer to mempolicy to be formatted
3353 *
3354 * Convert @pol into a string. If @buffer is too short, truncate the string.
3355 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3356 * interleave", plus the longest flag flags, "relative|balancing", and to
3357 * display at least a few node ids.
3358 */
3359 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3360 {
3361 char *p = buffer;
3362 nodemask_t nodes = NODE_MASK_NONE;
3363 unsigned short mode = MPOL_DEFAULT;
3364 unsigned short flags = 0;
3365
3366 if (pol &&
3367 pol != &default_policy &&
3368 !(pol >= &preferred_node_policy[0] &&
3369 pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3370 mode = pol->mode;
3371 flags = pol->flags;
3372 }
3373
3374 switch (mode) {
3375 case MPOL_DEFAULT:
3376 case MPOL_LOCAL:
3377 break;
3378 case MPOL_PREFERRED:
3379 case MPOL_PREFERRED_MANY:
3380 case MPOL_BIND:
3381 case MPOL_INTERLEAVE:
3382 case MPOL_WEIGHTED_INTERLEAVE:
3383 nodes = pol->nodes;
3384 break;
3385 default:
3386 WARN_ON_ONCE(1);
3387 snprintf(p, maxlen, "unknown");
3388 return;
3389 }
3390
3391 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3392
3393 if (flags & MPOL_MODE_FLAGS) {
3394 p += snprintf(p, buffer + maxlen - p, "=");
3395
3396 /*
3397 * Static and relative are mutually exclusive.
3398 */
3399 if (flags & MPOL_F_STATIC_NODES)
3400 p += snprintf(p, buffer + maxlen - p, "static");
3401 else if (flags & MPOL_F_RELATIVE_NODES)
3402 p += snprintf(p, buffer + maxlen - p, "relative");
3403
3404 if (flags & MPOL_F_NUMA_BALANCING) {
3405 if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3406 p += snprintf(p, buffer + maxlen - p, "|");
3407 p += snprintf(p, buffer + maxlen - p, "balancing");
3408 }
3409 }
3410
3411 if (!nodes_empty(nodes))
3412 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3413 nodemask_pr_args(&nodes));
3414 }
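/*
 * Illustrative examples of the strings produced above: an interleave policy
 * over nodes 0-3 is rendered as "interleave:0-3", and a bind policy created
 * with MPOL_F_STATIC_NODES on nodes 0 and 2 as "bind=static:0,2".
 */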
3415
3416 #ifdef CONFIG_SYSFS
3417 struct iw_node_attr {
3418 struct kobj_attribute kobj_attr;
3419 int nid;
3420 };
3421
3422 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3423 char *buf)
3424 {
3425 struct iw_node_attr *node_attr;
3426 u8 weight;
3427
3428 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3429 weight = get_il_weight(node_attr->nid);
3430 return sysfs_emit(buf, "%d\n", weight);
3431 }
3432
3433 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3434 const char *buf, size_t count)
3435 {
3436 struct iw_node_attr *node_attr;
3437 u8 *new;
3438 u8 *old;
3439 u8 weight = 0;
3440
3441 node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3442 if (count == 0 || sysfs_streq(buf, ""))
3443 weight = 0;
3444 else if (kstrtou8(buf, 0, &weight))
3445 return -EINVAL;
3446
3447 new = kzalloc(nr_node_ids, GFP_KERNEL);
3448 if (!new)
3449 return -ENOMEM;
3450
3451 mutex_lock(&iw_table_lock);
3452 old = rcu_dereference_protected(iw_table,
3453 lockdep_is_held(&iw_table_lock));
3454 if (old)
3455 memcpy(new, old, nr_node_ids);
3456 new[node_attr->nid] = weight;
3457 rcu_assign_pointer(iw_table, new);
3458 mutex_unlock(&iw_table_lock);
3459 synchronize_rcu();
3460 kfree(old);
3461 return count;
3462 }
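/*
 * Illustrative usage (assuming CONFIG_SYSFS and the usual sysfs mount):
 * writing to /sys/kernel/mm/mempolicy/weighted_interleave/nodeN updates that
 * node's interleave weight, e.g. "echo 4 > .../weighted_interleave/node0";
 * writing an empty string stores 0, which the interleave code treats as the
 * system default weight of 1.
 */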
3463
3464 static struct iw_node_attr **node_attrs;
3465
3466 static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
3467 struct kobject *parent)
3468 {
3469 if (!node_attr)
3470 return;
3471 sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
3472 kfree(node_attr->kobj_attr.attr.name);
3473 kfree(node_attr);
3474 }
3475
3476 static void sysfs_wi_release(struct kobject *wi_kobj)
3477 {
3478 int i;
3479
3480 for (i = 0; i < nr_node_ids; i++)
3481 sysfs_wi_node_release(node_attrs[i], wi_kobj);
3482 kobject_put(wi_kobj);
3483 }
3484
3485 static const struct kobj_type wi_ktype = {
3486 .sysfs_ops = &kobj_sysfs_ops,
3487 .release = sysfs_wi_release,
3488 };
3489
3490 static int add_weight_node(int nid, struct kobject *wi_kobj)
3491 {
3492 struct iw_node_attr *node_attr;
3493 char *name;
3494
3495 node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
3496 if (!node_attr)
3497 return -ENOMEM;
3498
3499 name = kasprintf(GFP_KERNEL, "node%d", nid);
3500 if (!name) {
3501 kfree(node_attr);
3502 return -ENOMEM;
3503 }
3504
3505 sysfs_attr_init(&node_attr->kobj_attr.attr);
3506 node_attr->kobj_attr.attr.name = name;
3507 node_attr->kobj_attr.attr.mode = 0644;
3508 node_attr->kobj_attr.show = node_show;
3509 node_attr->kobj_attr.store = node_store;
3510 node_attr->nid = nid;
3511
3512 if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
3513 kfree(node_attr->kobj_attr.attr.name);
3514 kfree(node_attr);
3515 pr_err("failed to add attribute to weighted_interleave\n");
3516 return -ENOMEM;
3517 }
3518
3519 node_attrs[nid] = node_attr;
3520 return 0;
3521 }
3522
3523 static int add_weighted_interleave_group(struct kobject *root_kobj)
3524 {
3525 struct kobject *wi_kobj;
3526 int nid, err;
3527
3528 wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
3529 if (!wi_kobj)
3530 return -ENOMEM;
3531
3532 err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
3533 "weighted_interleave");
3534 if (err) {
3535 kfree(wi_kobj);
3536 return err;
3537 }
3538
3539 for_each_node_state(nid, N_POSSIBLE) {
3540 err = add_weight_node(nid, wi_kobj);
3541 if (err) {
3542 pr_err("failed to add sysfs [node%d]\n", nid);
3543 break;
3544 }
3545 }
3546 if (err)
3547 kobject_put(wi_kobj);
3548 return 0;
3549 }
3550
3551 static void mempolicy_kobj_release(struct kobject *kobj)
3552 {
3553 u8 *old;
3554
3555 mutex_lock(&iw_table_lock);
3556 old = rcu_dereference_protected(iw_table,
3557 lockdep_is_held(&iw_table_lock));
3558 rcu_assign_pointer(iw_table, NULL);
3559 mutex_unlock(&iw_table_lock);
3560 synchronize_rcu();
3561 kfree(old);
3562 kfree(node_attrs);
3563 kfree(kobj);
3564 }
3565
3566 static const struct kobj_type mempolicy_ktype = {
3567 .release = mempolicy_kobj_release
3568 };
3569
3570 static int __init mempolicy_sysfs_init(void)
3571 {
3572 int err;
3573 static struct kobject *mempolicy_kobj;
3574
3575 mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
3576 if (!mempolicy_kobj) {
3577 err = -ENOMEM;
3578 goto err_out;
3579 }
3580
3581 node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
3582 GFP_KERNEL);
3583 if (!node_attrs) {
3584 err = -ENOMEM;
3585 goto mempol_out;
3586 }
3587
3588 err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
3589 "mempolicy");
3590 if (err)
3591 goto node_out;
3592
3593 err = add_weighted_interleave_group(mempolicy_kobj);
3594 if (err) {
3595 pr_err("mempolicy sysfs structure failed to initialize\n");
3596 kobject_put(mempolicy_kobj);
3597 return err;
3598 }
3599
3600 return err;
3601 node_out:
3602 kfree(node_attrs);
3603 mempol_out:
3604 kfree(mempolicy_kobj);
3605 err_out:
3606 pr_err("failed to add mempolicy kobject to the system\n");
3607 return err;
3608 }
3609
3610 late_initcall(mempolicy_sysfs_init);
3611 #endif /* CONFIG_SYSFS */
3612