1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory
9  * should be allocated on.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind would truly restrict
33  *                the allocation to memory nodes instead
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *                in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
59  * on systems with highmem, kernel lowmem allocations don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
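
/*
 * Illustrative only (not part of the kernel): a minimal userspace sketch of
 * how these policies are typically requested, assuming the raw syscall
 * wrappers declared in libnuma's <numaif.h>:
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	// Interleave this task's future allocations across nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes) + 1);
 *
 *	unsigned long node0 = 1UL << 0;
 *	// Bind an existing mapping to node 0 and migrate misplaced pages.
 *	mbind(addr, len, MPOL_BIND, &node0, 8 * sizeof(node0) + 1,
 *	      MPOL_MF_MOVE);
 */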
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger OOM much faster and the
75    kernel is not always graceful about that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/task.h>
89 #include <linux/nodemask.h>
90 #include <linux/cpuset.h>
91 #include <linux/slab.h>
92 #include <linux/string.h>
93 #include <linux/export.h>
94 #include <linux/nsproxy.h>
95 #include <linux/interrupt.h>
96 #include <linux/init.h>
97 #include <linux/compat.h>
98 #include <linux/ptrace.h>
99 #include <linux/swap.h>
100 #include <linux/seq_file.h>
101 #include <linux/proc_fs.h>
102 #include <linux/migrate.h>
103 #include <linux/ksm.h>
104 #include <linux/rmap.h>
105 #include <linux/security.h>
106 #include <linux/syscalls.h>
107 #include <linux/ctype.h>
108 #include <linux/mm_inline.h>
109 #include <linux/mmu_notifier.h>
110 #include <linux/printk.h>
111 #include <linux/swapops.h>
112 
113 #include <asm/tlbflush.h>
114 #include <asm/tlb.h>
115 #include <linux/uaccess.h>
116 
117 #include "internal.h"
118 
119 /* Internal flags */
120 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
121 #define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
122 #define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
123 
124 static struct kmem_cache *policy_cache;
125 static struct kmem_cache *sn_cache;
126 
127 /* Highest zone. A specific allocation for a zone below that is not
128    policied. */
129 enum zone_type policy_zone = 0;
130 
131 /*
132  * run-time system-wide default policy => local allocation
133  */
134 static struct mempolicy default_policy = {
135 	.refcnt = ATOMIC_INIT(1), /* never free it */
136 	.mode = MPOL_LOCAL,
137 };
138 
139 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
140 
141 /*
142  * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
143  * that the system-default value should be used. A NULL iw_table also
144  * denotes that system-default values should be used. Until the
145  * system-default table is implemented, the system-default is always 1.
146  *
147  * iw_table is RCU protected
148  */
149 static u8 __rcu *iw_table;
150 static DEFINE_MUTEX(iw_table_lock);
151 
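/*
 * Return the interleave weight for @node from iw_table, falling back to
 * the system default of 1 when the table is absent or the entry is 0.
 */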
152 static u8 get_il_weight(int node)
153 {
154 	u8 *table;
155 	u8 weight;
156 
157 	rcu_read_lock();
158 	table = rcu_dereference(iw_table);
159 	/* if no iw_table, use system default */
160 	weight = table ? table[node] : 1;
161 	/* if value in iw_table is 0, use system default */
162 	weight = weight ? weight : 1;
163 	rcu_read_unlock();
164 	return weight;
165 }
166 
167 /**
168  * numa_nearest_node - Find nearest node by state
169  * @node: Node id to start the search
170  * @state: State to filter the search
171  *
172  * Look up the closest node by distance if @node is not in @state.
173  *
174  * Return: this @node if it is in @state, otherwise the closest node by distance.
175  */
176 int numa_nearest_node(int node, unsigned int state)
177 {
178 	int min_dist = INT_MAX, dist, n, min_node;
179 
180 	if (state >= NR_NODE_STATES)
181 		return -EINVAL;
182 
183 	if (node == NUMA_NO_NODE || node_state(node, state))
184 		return node;
185 
186 	min_node = node;
187 	for_each_node_state(n, state) {
188 		dist = node_distance(node, n);
189 		if (dist < min_dist) {
190 			min_dist = dist;
191 			min_node = n;
192 		}
193 	}
194 
195 	return min_node;
196 }
197 EXPORT_SYMBOL_GPL(numa_nearest_node);
198 
199 /**
200  * nearest_node_nodemask - Find the node in @mask at the nearest distance
201  *			   from @node.
202  *
203  * @node: a valid node ID to start the search from.
204  * @mask: a pointer to a nodemask representing the allowed nodes.
205  *
206  * This function iterates over all nodes in @mask and calculates the
207  * distance from the starting @node, then it returns the node ID that is
208  * the closest to @node, or MAX_NUMNODES if no node is found.
209  *
210  * Note that @node must be a valid node ID usable with node_distance(),
211  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
212  * or unexpected behavior.
213  */
214 int nearest_node_nodemask(int node, nodemask_t *mask)
215 {
216 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
217 
218 	for_each_node_mask(n, *mask) {
219 		dist = node_distance(node, n);
220 		if (dist < min_dist) {
221 			min_dist = dist;
222 			min_node = n;
223 		}
224 	}
225 
226 	return min_node;
227 }
228 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
229 
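/*
 * Return the effective policy for @p: the task's own mempolicy if set,
 * otherwise the boot-time per-node preferred policy (once initialised),
 * falling back to the system-wide default policy.
 */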
230 struct mempolicy *get_task_policy(struct task_struct *p)
231 {
232 	struct mempolicy *pol = p->mempolicy;
233 	int node;
234 
235 	if (pol)
236 		return pol;
237 
238 	node = numa_node_id();
239 	if (node != NUMA_NO_NODE) {
240 		pol = &preferred_node_policy[node];
241 		/* preferred_node_policy is not initialised early in boot */
242 		if (pol->mode)
243 			return pol;
244 	}
245 
246 	return &default_policy;
247 }
248 
249 static const struct mempolicy_operations {
250 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
251 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
252 } mpol_ops[MPOL_MAX];
253 
254 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
255 {
256 	return pol->flags & MPOL_MODE_FLAGS;
257 }
258 
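/*
 * Compute a relative nodemask: fold @orig down to nodes_weight(*rel) bits,
 * then map each remaining bit onto the corresponding set bit of @rel.
 */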
259 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
260 				   const nodemask_t *rel)
261 {
262 	nodemask_t tmp;
263 	nodes_fold(tmp, *orig, nodes_weight(*rel));
264 	nodes_onto(*ret, tmp, *rel);
265 }
266 
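/* .create callback shared by the nodemask-based policies: record @nodes. */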
267 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
268 {
269 	if (nodes_empty(*nodes))
270 		return -EINVAL;
271 	pol->nodes = *nodes;
272 	return 0;
273 }
274 
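/* .create callback for MPOL_PREFERRED: keep only the first node of @nodes. */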
275 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
276 {
277 	if (nodes_empty(*nodes))
278 		return -EINVAL;
279 
280 	nodes_clear(pol->nodes);
281 	node_set(first_node(*nodes), pol->nodes);
282 	return 0;
283 }
284 
285 /*
286  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
287  * any, for the new policy.  mpol_new() has already validated the nodes
288  * parameter with respect to the policy mode and flags.
289  *
290  * Must be called holding task's alloc_lock to protect task's mems_allowed
291  * and mempolicy.  May also be called holding the mmap_lock for write.
292  */
293 static int mpol_set_nodemask(struct mempolicy *pol,
294 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
295 {
296 	int ret;
297 
298 	/*
299 	 * Default (pol==NULL) resp. local memory policies are not a
300 	 * subject of any remapping. They also do not need any special
301 	 * constructor.
302 	 */
303 	if (!pol || pol->mode == MPOL_LOCAL)
304 		return 0;
305 
306 	/* Check N_MEMORY */
307 	nodes_and(nsc->mask1,
308 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
309 
310 	VM_BUG_ON(!nodes);
311 
312 	if (pol->flags & MPOL_F_RELATIVE_NODES)
313 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
314 	else
315 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
316 
317 	if (mpol_store_user_nodemask(pol))
318 		pol->w.user_nodemask = *nodes;
319 	else
320 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
321 
322 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
323 	return ret;
324 }
325 
326 /*
327  * This function just creates a new policy, does some checks and simple
328  * initialization. You must invoke mpol_set_nodemask() to set nodes.
329  */
330 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
331 				  nodemask_t *nodes)
332 {
333 	struct mempolicy *policy;
334 
335 	if (mode == MPOL_DEFAULT) {
336 		if (nodes && !nodes_empty(*nodes))
337 			return ERR_PTR(-EINVAL);
338 		return NULL;
339 	}
340 	VM_BUG_ON(!nodes);
341 
342 	/*
343 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
344 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
345 	 * All other modes require a valid pointer to a non-empty nodemask.
346 	 */
347 	if (mode == MPOL_PREFERRED) {
348 		if (nodes_empty(*nodes)) {
349 			if (((flags & MPOL_F_STATIC_NODES) ||
350 			     (flags & MPOL_F_RELATIVE_NODES)))
351 				return ERR_PTR(-EINVAL);
352 
353 			mode = MPOL_LOCAL;
354 		}
355 	} else if (mode == MPOL_LOCAL) {
356 		if (!nodes_empty(*nodes) ||
357 		    (flags & MPOL_F_STATIC_NODES) ||
358 		    (flags & MPOL_F_RELATIVE_NODES))
359 			return ERR_PTR(-EINVAL);
360 	} else if (nodes_empty(*nodes))
361 		return ERR_PTR(-EINVAL);
362 
363 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
364 	if (!policy)
365 		return ERR_PTR(-ENOMEM);
366 	atomic_set(&policy->refcnt, 1);
367 	policy->mode = mode;
368 	policy->flags = flags;
369 	policy->home_node = NUMA_NO_NODE;
370 
371 	return policy;
372 }
373 
374 /* Slow path of a mpol destructor. */
375 void __mpol_put(struct mempolicy *pol)
376 {
377 	if (!atomic_dec_and_test(&pol->refcnt))
378 		return;
379 	kmem_cache_free(policy_cache, pol);
380 }
381 
382 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
383 {
384 }
385 
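/*
 * Rebind a nodemask-based policy to a new set of allowed nodes, honouring
 * MPOL_F_STATIC_NODES / MPOL_F_RELATIVE_NODES, and falling back to the full
 * new nodemask if the remapped result would be empty.
 */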
386 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
387 {
388 	nodemask_t tmp;
389 
390 	if (pol->flags & MPOL_F_STATIC_NODES)
391 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
392 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
393 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
394 	else {
395 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
396 								*nodes);
397 		pol->w.cpuset_mems_allowed = *nodes;
398 	}
399 
400 	if (nodes_empty(tmp))
401 		tmp = *nodes;
402 
403 	pol->nodes = tmp;
404 }
405 
406 static void mpol_rebind_preferred(struct mempolicy *pol,
407 						const nodemask_t *nodes)
408 {
409 	pol->w.cpuset_mems_allowed = *nodes;
410 }
411 
412 /*
413  * mpol_rebind_policy - Migrate a policy to a different set of nodes
414  *
415  * Per-vma policies are protected by mmap_lock. Allocations using per-task
416  * policies are protected by task->mems_allowed_seq to prevent a premature
417  * OOM/allocation failure due to parallel nodemask modification.
418  */
419 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
420 {
421 	if (!pol || pol->mode == MPOL_LOCAL)
422 		return;
423 	if (!mpol_store_user_nodemask(pol) &&
424 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
425 		return;
426 
427 	mpol_ops[pol->mode].rebind(pol, newmask);
428 }
429 
430 /*
431  * Wrapper for mpol_rebind_policy() that just requires task
432  * pointer, and updates task mempolicy.
433  *
434  * Called with task's alloc_lock held.
435  */
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
437 {
438 	mpol_rebind_policy(tsk->mempolicy, new);
439 }
440 
441 /*
442  * Rebind each vma in mm to new nodemask.
443  *
444  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
445  */
446 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
447 {
448 	struct vm_area_struct *vma;
449 	VMA_ITERATOR(vmi, mm, 0);
450 
451 	mmap_write_lock(mm);
452 	for_each_vma(vmi, vma) {
453 		vma_start_write(vma);
454 		mpol_rebind_policy(vma->vm_policy, new);
455 	}
456 	mmap_write_unlock(mm);
457 }
458 
459 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
460 	[MPOL_DEFAULT] = {
461 		.rebind = mpol_rebind_default,
462 	},
463 	[MPOL_INTERLEAVE] = {
464 		.create = mpol_new_nodemask,
465 		.rebind = mpol_rebind_nodemask,
466 	},
467 	[MPOL_PREFERRED] = {
468 		.create = mpol_new_preferred,
469 		.rebind = mpol_rebind_preferred,
470 	},
471 	[MPOL_BIND] = {
472 		.create = mpol_new_nodemask,
473 		.rebind = mpol_rebind_nodemask,
474 	},
475 	[MPOL_LOCAL] = {
476 		.rebind = mpol_rebind_default,
477 	},
478 	[MPOL_PREFERRED_MANY] = {
479 		.create = mpol_new_nodemask,
480 		.rebind = mpol_rebind_preferred,
481 	},
482 	[MPOL_WEIGHTED_INTERLEAVE] = {
483 		.create = mpol_new_nodemask,
484 		.rebind = mpol_rebind_nodemask,
485 	},
486 };
487 
488 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
489 				unsigned long flags);
490 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
491 				pgoff_t ilx, int *nid);
492 
493 static bool strictly_unmovable(unsigned long flags)
494 {
495 	/*
496 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
497 	 * if any misplaced page is found.
498 	 */
499 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
500 			 MPOL_MF_STRICT;
501 }
502 
503 struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
504 	struct mempolicy *pol;
505 	pgoff_t ilx;
506 };
507 
508 struct queue_pages {
509 	struct list_head *pagelist;
510 	unsigned long flags;
511 	nodemask_t *nmask;
512 	unsigned long start;
513 	unsigned long end;
514 	struct vm_area_struct *first;
515 	struct folio *large;		/* note last large folio encountered */
516 	long nr_failed;			/* could not be isolated at this time */
517 };
518 
519 /*
520  * Check if the folio's nid is in qp->nmask.
521  *
522  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
523  * in the invert of qp->nmask.
524  */
525 static inline bool queue_folio_required(struct folio *folio,
526 					struct queue_pages *qp)
527 {
528 	int nid = folio_nid(folio);
529 	unsigned long flags = qp->flags;
530 
531 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
532 }
533 
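/*
 * Queue a pmd-mapped THP for migration if it is misplaced; called with the
 * pmd lock held.  Migration entries and folios that cannot be isolated are
 * counted in qp->nr_failed.
 */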
534 static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
535 {
536 	struct folio *folio;
537 	struct queue_pages *qp = walk->private;
538 
539 	if (unlikely(is_pmd_migration_entry(*pmd))) {
540 		qp->nr_failed++;
541 		return;
542 	}
543 	folio = pmd_folio(*pmd);
544 	if (is_huge_zero_folio(folio)) {
545 		walk->action = ACTION_CONTINUE;
546 		return;
547 	}
548 	if (!queue_folio_required(folio, qp))
549 		return;
550 	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
551 	    !vma_migratable(walk->vma) ||
552 	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
553 		qp->nr_failed++;
554 }
555 
556 /*
557  * Scan through folios, checking if they satisfy the required conditions,
558  * moving them from LRU to local pagelist for migration if they do (or not).
559  *
560  * queue_folios_pte_range() has two possible return values:
561  * 0 - continue walking to scan for more, even if an existing folio on the
562  *     wrong node could not be isolated and queued for migration.
563  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
564  *        and an existing folio was on a node that does not follow the policy.
565  */
566 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
567 			unsigned long end, struct mm_walk *walk)
568 {
569 	struct vm_area_struct *vma = walk->vma;
570 	struct folio *folio;
571 	struct queue_pages *qp = walk->private;
572 	unsigned long flags = qp->flags;
573 	pte_t *pte, *mapped_pte;
574 	pte_t ptent;
575 	spinlock_t *ptl;
576 
577 	ptl = pmd_trans_huge_lock(pmd, vma);
578 	if (ptl) {
579 		queue_folios_pmd(pmd, walk);
580 		spin_unlock(ptl);
581 		goto out;
582 	}
583 
584 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
585 	if (!pte) {
586 		walk->action = ACTION_AGAIN;
587 		return 0;
588 	}
589 	for (; addr != end; pte++, addr += PAGE_SIZE) {
590 		ptent = ptep_get(pte);
591 		if (pte_none(ptent))
592 			continue;
593 		if (!pte_present(ptent)) {
594 			if (is_migration_entry(pte_to_swp_entry(ptent)))
595 				qp->nr_failed++;
596 			continue;
597 		}
598 		folio = vm_normal_folio(vma, addr, ptent);
599 		if (!folio || folio_is_zone_device(folio))
600 			continue;
601 		/*
602 		 * vm_normal_folio() filters out zero pages, but there might
603 		 * still be reserved folios to skip, perhaps in a VDSO.
604 		 */
605 		if (folio_test_reserved(folio))
606 			continue;
607 		if (!queue_folio_required(folio, qp))
608 			continue;
609 		if (folio_test_large(folio)) {
610 			/*
611 			 * A large folio can only be isolated from LRU once,
612 			 * but may be mapped by many PTEs (and Copy-On-Write may
613 			 * intersperse PTEs of other, order 0, folios).  This is
614 			 * a common case, so don't mistake it for failure (but
615 			 * there can be other cases of multi-mapped pages which
616 			 * this quick check does not help to filter out - and a
617 			 * search of the pagelist might grow to be prohibitive).
618 			 *
619 			 * migrate_pages(&pagelist) returns nr_failed folios, so
620 			 * check "large" now so that queue_pages_range() returns
621 			 * a comparable nr_failed folios.  This does imply that
622 			 * if folio could not be isolated for some racy reason
623 			 * at its first PTE, later PTEs will not give it another
624 			 * chance of isolation; but keeps the accounting simple.
625 			 */
626 			if (folio == qp->large)
627 				continue;
628 			qp->large = folio;
629 		}
630 		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
631 		    !vma_migratable(vma) ||
632 		    !migrate_folio_add(folio, qp->pagelist, flags)) {
633 			qp->nr_failed++;
634 			if (strictly_unmovable(flags))
635 				break;
636 		}
637 	}
638 	pte_unmap_unlock(mapped_pte, ptl);
639 	cond_resched();
640 out:
641 	if (qp->nr_failed && strictly_unmovable(flags))
642 		return -EIO;
643 	return 0;
644 }
645 
646 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
647 			       unsigned long addr, unsigned long end,
648 			       struct mm_walk *walk)
649 {
650 #ifdef CONFIG_HUGETLB_PAGE
651 	struct queue_pages *qp = walk->private;
652 	unsigned long flags = qp->flags;
653 	struct folio *folio;
654 	spinlock_t *ptl;
655 	pte_t entry;
656 
657 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
658 	entry = huge_ptep_get(walk->mm, addr, pte);
659 	if (!pte_present(entry)) {
660 		if (unlikely(is_hugetlb_entry_migration(entry)))
661 			qp->nr_failed++;
662 		goto unlock;
663 	}
664 	folio = pfn_folio(pte_pfn(entry));
665 	if (!queue_folio_required(folio, qp))
666 		goto unlock;
667 	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
668 	    !vma_migratable(walk->vma)) {
669 		qp->nr_failed++;
670 		goto unlock;
671 	}
672 	/*
673 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
674 	 * Choosing not to migrate a shared folio is not counted as a failure.
675 	 *
676 	 * See folio_maybe_mapped_shared() on possible imprecision when we
677 	 * cannot easily detect if a folio is shared.
678 	 */
679 	if ((flags & MPOL_MF_MOVE_ALL) ||
680 	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
681 		if (!folio_isolate_hugetlb(folio, qp->pagelist))
682 			qp->nr_failed++;
683 unlock:
684 	spin_unlock(ptl);
685 	if (qp->nr_failed && strictly_unmovable(flags))
686 		return -EIO;
687 #endif
688 	return 0;
689 }
690 
691 #ifdef CONFIG_NUMA_BALANCING
692 /*
693  * This is used to mark a range of virtual addresses to be inaccessible.
694  * These are later cleared by a NUMA hinting fault. Depending on these
695  * faults, pages may be migrated for better NUMA placement.
696  *
697  * This is assuming that NUMA faults are handled using PROT_NONE. If
698  * an architecture makes a different choice, it will need further
699  * changes to the core.
700  */
701 unsigned long change_prot_numa(struct vm_area_struct *vma,
702 			unsigned long addr, unsigned long end)
703 {
704 	struct mmu_gather tlb;
705 	long nr_updated;
706 
707 	tlb_gather_mmu(&tlb, vma->vm_mm);
708 
709 	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
710 	if (nr_updated > 0) {
711 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
712 		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
713 	}
714 
715 	tlb_finish_mmu(&tlb);
716 
717 	return nr_updated;
718 }
719 #endif /* CONFIG_NUMA_BALANCING */
720 
721 static int queue_pages_test_walk(unsigned long start, unsigned long end,
722 				struct mm_walk *walk)
723 {
724 	struct vm_area_struct *next, *vma = walk->vma;
725 	struct queue_pages *qp = walk->private;
726 	unsigned long flags = qp->flags;
727 
728 	/* range check first */
729 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
730 
731 	if (!qp->first) {
732 		qp->first = vma;
733 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
734 			(qp->start < vma->vm_start))
735 			/* hole at head side of range */
736 			return -EFAULT;
737 	}
738 	next = find_vma(vma->vm_mm, vma->vm_end);
739 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
740 		((vma->vm_end < qp->end) &&
741 		(!next || vma->vm_end < next->vm_start)))
742 		/* hole at middle or tail of range */
743 		return -EFAULT;
744 
745 	/*
746 	 * We need to check MPOL_MF_STRICT so that -EIO can be returned if
747 	 * needed, regardless of vma_migratable().
748 	 */
749 	if (!vma_migratable(vma) &&
750 	    !(flags & MPOL_MF_STRICT))
751 		return 1;
752 
753 	/*
754 	 * Check page nodes, and queue pages to move, in the current vma.
755 	 * But if no moving, and no strict checking, the scan can be skipped.
756 	 */
757 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
758 		return 0;
759 	return 1;
760 }
761 
762 static const struct mm_walk_ops queue_pages_walk_ops = {
763 	.hugetlb_entry		= queue_folios_hugetlb,
764 	.pmd_entry		= queue_folios_pte_range,
765 	.test_walk		= queue_pages_test_walk,
766 	.walk_lock		= PGWALK_RDLOCK,
767 };
768 
769 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
770 	.hugetlb_entry		= queue_folios_hugetlb,
771 	.pmd_entry		= queue_folios_pte_range,
772 	.test_walk		= queue_pages_test_walk,
773 	.walk_lock		= PGWALK_WRLOCK,
774 };
775 
776 /*
777  * Walk through page tables and collect pages to be migrated.
778  *
779  * If pages found in a given range are not on the required set of @nodes,
780  * and migration is allowed, they are isolated and queued to @pagelist.
781  *
782  * queue_pages_range() may return:
783  * 0 - all pages already on the right node, or successfully queued for moving
784  *     (or neither strict checking nor moving requested: only range checking).
785  * >0 - this number of misplaced folios could not be queued for moving
786  *      (a hugetlbfs page or a transparent huge page being counted as 1).
787  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
788  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
789  */
790 static long
791 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
792 		nodemask_t *nodes, unsigned long flags,
793 		struct list_head *pagelist)
794 {
795 	int err;
796 	struct queue_pages qp = {
797 		.pagelist = pagelist,
798 		.flags = flags,
799 		.nmask = nodes,
800 		.start = start,
801 		.end = end,
802 		.first = NULL,
803 	};
804 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
805 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
806 
807 	err = walk_page_range(mm, start, end, ops, &qp);
808 
809 	if (!qp.first)
810 		/* whole range in hole */
811 		err = -EFAULT;
812 
813 	return err ? : qp.nr_failed;
814 }
815 
816 /*
817  * Apply policy to a single VMA
818  * This must be called with the mmap_lock held for writing.
819  */
820 static int vma_replace_policy(struct vm_area_struct *vma,
821 				struct mempolicy *pol)
822 {
823 	int err;
824 	struct mempolicy *old;
825 	struct mempolicy *new;
826 
827 	vma_assert_write_locked(vma);
828 
829 	new = mpol_dup(pol);
830 	if (IS_ERR(new))
831 		return PTR_ERR(new);
832 
833 	if (vma->vm_ops && vma->vm_ops->set_policy) {
834 		err = vma->vm_ops->set_policy(vma, new);
835 		if (err)
836 			goto err_out;
837 	}
838 
839 	old = vma->vm_policy;
840 	vma->vm_policy = new; /* protected by mmap_lock */
841 	mpol_put(old);
842 
843 	return 0;
844  err_out:
845 	mpol_put(new);
846 	return err;
847 }
848 
849 /* Split or merge the VMA (if required) and apply the new policy */
850 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
851 		struct vm_area_struct **prev, unsigned long start,
852 		unsigned long end, struct mempolicy *new_pol)
853 {
854 	unsigned long vmstart, vmend;
855 
856 	vmend = min(end, vma->vm_end);
857 	if (start > vma->vm_start) {
858 		*prev = vma;
859 		vmstart = start;
860 	} else {
861 		vmstart = vma->vm_start;
862 	}
863 
864 	if (mpol_equal(vma->vm_policy, new_pol)) {
865 		*prev = vma;
866 		return 0;
867 	}
868 
869 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
870 	if (IS_ERR(vma))
871 		return PTR_ERR(vma);
872 
873 	*prev = vma;
874 	return vma_replace_policy(vma, new_pol);
875 }
876 
877 /* Set the process memory policy */
878 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
879 			     nodemask_t *nodes)
880 {
881 	struct mempolicy *new, *old;
882 	NODEMASK_SCRATCH(scratch);
883 	int ret;
884 
885 	if (!scratch)
886 		return -ENOMEM;
887 
888 	new = mpol_new(mode, flags, nodes);
889 	if (IS_ERR(new)) {
890 		ret = PTR_ERR(new);
891 		goto out;
892 	}
893 
894 	task_lock(current);
895 	ret = mpol_set_nodemask(new, nodes, scratch);
896 	if (ret) {
897 		task_unlock(current);
898 		mpol_put(new);
899 		goto out;
900 	}
901 
902 	old = current->mempolicy;
903 	current->mempolicy = new;
904 	if (new && (new->mode == MPOL_INTERLEAVE ||
905 		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
906 		current->il_prev = MAX_NUMNODES-1;
907 		current->il_weight = 0;
908 	}
909 	task_unlock(current);
910 	mpol_put(old);
911 	ret = 0;
912 out:
913 	NODEMASK_SCRATCH_FREE(scratch);
914 	return ret;
915 }
916 
917 /*
918  * Return nodemask for policy for get_mempolicy() query
919  *
920  * Called with task's alloc_lock held
921  */
922 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
923 {
924 	nodes_clear(*nodes);
925 	if (pol == &default_policy)
926 		return;
927 
928 	switch (pol->mode) {
929 	case MPOL_BIND:
930 	case MPOL_INTERLEAVE:
931 	case MPOL_PREFERRED:
932 	case MPOL_PREFERRED_MANY:
933 	case MPOL_WEIGHTED_INTERLEAVE:
934 		*nodes = pol->nodes;
935 		break;
936 	case MPOL_LOCAL:
937 		/* return empty node mask for local allocation */
938 		break;
939 	default:
940 		BUG();
941 	}
942 }
943 
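/*
 * Return the node id of the page currently backing @addr, looked up via
 * get_user_pages_fast(), or its error / zero-pinned return value.
 */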
944 static int lookup_node(struct mm_struct *mm, unsigned long addr)
945 {
946 	struct page *p = NULL;
947 	int ret;
948 
949 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
950 	if (ret > 0) {
951 		ret = page_to_nid(p);
952 		put_page(p);
953 	}
954 	return ret;
955 }
956 
957 /* Retrieve NUMA policy */
958 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
959 			     unsigned long addr, unsigned long flags)
960 {
961 	int err;
962 	struct mm_struct *mm = current->mm;
963 	struct vm_area_struct *vma = NULL;
964 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
965 
966 	if (flags &
967 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
968 		return -EINVAL;
969 
970 	if (flags & MPOL_F_MEMS_ALLOWED) {
971 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
972 			return -EINVAL;
973 		*policy = 0;	/* just so it's initialized */
974 		task_lock(current);
975 		*nmask  = cpuset_current_mems_allowed;
976 		task_unlock(current);
977 		return 0;
978 	}
979 
980 	if (flags & MPOL_F_ADDR) {
981 		pgoff_t ilx;		/* ignored here */
982 		/*
983 		 * Do NOT fall back to task policy if the
984 		 * vma/shared policy at addr is NULL.  We
985 		 * want to return MPOL_DEFAULT in this case.
986 		 */
987 		mmap_read_lock(mm);
988 		vma = vma_lookup(mm, addr);
989 		if (!vma) {
990 			mmap_read_unlock(mm);
991 			return -EFAULT;
992 		}
993 		pol = __get_vma_policy(vma, addr, &ilx);
994 	} else if (addr)
995 		return -EINVAL;
996 
997 	if (!pol)
998 		pol = &default_policy;	/* indicates default behavior */
999 
1000 	if (flags & MPOL_F_NODE) {
1001 		if (flags & MPOL_F_ADDR) {
1002 			/*
1003 			 * Take a refcount on the mpol, because we are about to
1004 			 * drop the mmap_lock, after which only "pol" remains
1005 			 * valid, "vma" is stale.
1006 			 */
1007 			pol_refcount = pol;
1008 			vma = NULL;
1009 			mpol_get(pol);
1010 			mmap_read_unlock(mm);
1011 			err = lookup_node(mm, addr);
1012 			if (err < 0)
1013 				goto out;
1014 			*policy = err;
1015 		} else if (pol == current->mempolicy &&
1016 				pol->mode == MPOL_INTERLEAVE) {
1017 			*policy = next_node_in(current->il_prev, pol->nodes);
1018 		} else if (pol == current->mempolicy &&
1019 				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1020 			if (current->il_weight)
1021 				*policy = current->il_prev;
1022 			else
1023 				*policy = next_node_in(current->il_prev,
1024 						       pol->nodes);
1025 		} else {
1026 			err = -EINVAL;
1027 			goto out;
1028 		}
1029 	} else {
1030 		*policy = pol == &default_policy ? MPOL_DEFAULT :
1031 						pol->mode;
1032 		/*
1033 		 * Internal mempolicy flags must be masked off before exposing
1034 		 * the policy to userspace.
1035 		 */
1036 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1037 	}
1038 
1039 	err = 0;
1040 	if (nmask) {
1041 		if (mpol_store_user_nodemask(pol)) {
1042 			*nmask = pol->w.user_nodemask;
1043 		} else {
1044 			task_lock(current);
1045 			get_policy_nodemask(pol, nmask);
1046 			task_unlock(current);
1047 		}
1048 	}
1049 
1050  out:
1051 	mpol_cond_put(pol);
1052 	if (vma)
1053 		mmap_read_unlock(mm);
1054 	if (pol_refcount)
1055 		mpol_put(pol_refcount);
1056 	return err;
1057 }
1058 
1059 #ifdef CONFIG_MIGRATION
1060 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1061 				unsigned long flags)
1062 {
1063 	/*
1064 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1065 	 * Choosing not to migrate a shared folio is not counted as a failure.
1066 	 *
1067 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1068 	 * cannot easily detect if a folio is shared.
1069 	 */
1070 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1071 		if (folio_isolate_lru(folio)) {
1072 			list_add_tail(&folio->lru, foliolist);
1073 			node_stat_mod_folio(folio,
1074 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1075 				folio_nr_pages(folio));
1076 		} else {
1077 			/*
1078 			 * Non-movable folio may reach here.  And, there may be
1079 			 * temporary off LRU folios or non-LRU movable folios.
1080 			 * Treat them as unmovable folios since they can't be
1081 			 * isolated, so they can't be moved at the moment.
1082 			 */
1083 			return false;
1084 		}
1085 	}
1086 	return true;
1087 }
1088 
1089 /*
1090  * Migrate pages from one node to a target node.
1091  * Returns error or the number of pages not migrated.
1092  */
1093 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1094 			    int flags)
1095 {
1096 	nodemask_t nmask;
1097 	struct vm_area_struct *vma;
1098 	LIST_HEAD(pagelist);
1099 	long nr_failed;
1100 	long err = 0;
1101 	struct migration_target_control mtc = {
1102 		.nid = dest,
1103 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1104 		.reason = MR_SYSCALL,
1105 	};
1106 
1107 	nodes_clear(nmask);
1108 	node_set(source, nmask);
1109 
1110 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1111 
1112 	mmap_read_lock(mm);
1113 	vma = find_vma(mm, 0);
1114 	if (unlikely(!vma)) {
1115 		mmap_read_unlock(mm);
1116 		return 0;
1117 	}
1118 
1119 	/*
1120 	 * This does not migrate the range, but isolates all pages that
1121 	 * need migration.  Between passing in the full user address
1122 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1123 	 * but passes back the count of pages which could not be isolated.
1124 	 */
1125 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1126 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1127 	mmap_read_unlock(mm);
1128 
1129 	if (!list_empty(&pagelist)) {
1130 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1131 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1132 		if (err)
1133 			putback_movable_pages(&pagelist);
1134 	}
1135 
1136 	if (err >= 0)
1137 		err += nr_failed;
1138 	return err;
1139 }
1140 
1141 /*
1142  * Move pages between the two nodesets so as to preserve the physical
1143  * layout as much as possible.
1144  *
1145  * Returns the number of pages that could not be moved.
1146  */
1147 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1148 		     const nodemask_t *to, int flags)
1149 {
1150 	long nr_failed = 0;
1151 	long err = 0;
1152 	nodemask_t tmp;
1153 
1154 	lru_cache_disable();
1155 
1156 	/*
1157 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1158 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1159 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1160 	 * The pair of nodemasks 'to' and 'from' define the map.
1161 	 *
1162 	 * If no pair of bits is found that way, fallback to picking some
1163 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1164 	 * 'source' and 'dest' bits are the same, this represents a node
1165 	 * that will be migrating to itself, so no pages need move.
1166 	 *
1167 	 * If no bits are left in 'tmp', or if all remaining bits left
1168 	 * in 'tmp' correspond to the same bit in 'to', return false
1169 	 * (nothing left to migrate).
1170 	 *
1171 	 * This lets us pick a pair of nodes to migrate between, such that
1172 	 * if possible the dest node is not already occupied by some other
1173 	 * source node, minimizing the risk of overloading the memory on a
1174 	 * node that would happen if we migrated incoming memory to a node
1175 	 * before migrating outgoing memory from that same node.
1176 	 *
1177 	 * A single scan of tmp is sufficient.  As we go, we remember the
1178 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1179 	 * that not only moved, but what's better, moved to an empty slot
1180 	 * (d is not set in tmp), then we break out then, with that pair.
1181 	 * Otherwise when we finish scanning tmp, we at least have the
1182 	 * most recent <s, d> pair that moved.  If we get all the way through
1183 	 * the scan of tmp without finding any node that moved, much less
1184 	 * moved to an empty node, then there is nothing left worth migrating.
1185 	 */
1186 
1187 	tmp = *from;
1188 	while (!nodes_empty(tmp)) {
1189 		int s, d;
1190 		int source = NUMA_NO_NODE;
1191 		int dest = 0;
1192 
1193 		for_each_node_mask(s, tmp) {
1194 
1195 			/*
1196 			 * do_migrate_pages() tries to maintain the relative
1197 			 * node relationship of the pages established between
1198 			 * threads and memory areas.
1199 			 *
1200 			 * However if the number of source nodes is not equal to
1201 			 * the number of destination nodes we can not preserve
1202 			 * this node relative relationship.  In that case, skip
1203 			 * copying memory from a node that is in the destination
1204 			 * mask.
1205 			 *
1206 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1207 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1208 			 */
1209 
1210 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1211 						(node_isset(s, *to)))
1212 				continue;
1213 
1214 			d = node_remap(s, *from, *to);
1215 			if (s == d)
1216 				continue;
1217 
1218 			source = s;	/* Node moved. Memorize */
1219 			dest = d;
1220 
1221 			/* dest not in remaining from nodes? */
1222 			if (!node_isset(dest, tmp))
1223 				break;
1224 		}
1225 		if (source == NUMA_NO_NODE)
1226 			break;
1227 
1228 		node_clear(source, tmp);
1229 		err = migrate_to_node(mm, source, dest, flags);
1230 		if (err > 0)
1231 			nr_failed += err;
1232 		if (err < 0)
1233 			break;
1234 	}
1235 
1236 	lru_cache_enable();
1237 	if (err < 0)
1238 		return err;
1239 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1240 }
1241 
1242 /*
1243  * Allocate a new folio for page migration, according to NUMA mempolicy.
1244  */
1245 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1246 						    unsigned long private)
1247 {
1248 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1249 	struct mempolicy *pol = mmpol->pol;
1250 	pgoff_t ilx = mmpol->ilx;
1251 	unsigned int order;
1252 	int nid = numa_node_id();
1253 	gfp_t gfp;
1254 
1255 	order = folio_order(src);
1256 	ilx += src->index >> order;
1257 
1258 	if (folio_test_hugetlb(src)) {
1259 		nodemask_t *nodemask;
1260 		struct hstate *h;
1261 
1262 		h = folio_hstate(src);
1263 		gfp = htlb_alloc_mask(h);
1264 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1265 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1266 				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1267 	}
1268 
1269 	if (folio_test_large(src))
1270 		gfp = GFP_TRANSHUGE;
1271 	else
1272 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1273 
1274 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1275 }
1276 #else
1277 
1278 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1279 				unsigned long flags)
1280 {
1281 	return false;
1282 }
1283 
1284 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1285 		     const nodemask_t *to, int flags)
1286 {
1287 	return -ENOSYS;
1288 }
1289 
1290 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1291 						    unsigned long private)
1292 {
1293 	return NULL;
1294 }
1295 #endif
1296 
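/*
 * Apply the policy described by @mode, @mode_flags and @nmask to the range
 * [start, start + len), splitting or merging VMAs as needed, and migrate
 * misplaced pages when MPOL_MF_MOVE or MPOL_MF_MOVE_ALL is set.
 */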
1297 static long do_mbind(unsigned long start, unsigned long len,
1298 		     unsigned short mode, unsigned short mode_flags,
1299 		     nodemask_t *nmask, unsigned long flags)
1300 {
1301 	struct mm_struct *mm = current->mm;
1302 	struct vm_area_struct *vma, *prev;
1303 	struct vma_iterator vmi;
1304 	struct migration_mpol mmpol;
1305 	struct mempolicy *new;
1306 	unsigned long end;
1307 	long err;
1308 	long nr_failed;
1309 	LIST_HEAD(pagelist);
1310 
1311 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1312 		return -EINVAL;
1313 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1314 		return -EPERM;
1315 
1316 	if (start & ~PAGE_MASK)
1317 		return -EINVAL;
1318 
1319 	if (mode == MPOL_DEFAULT)
1320 		flags &= ~MPOL_MF_STRICT;
1321 
1322 	len = PAGE_ALIGN(len);
1323 	end = start + len;
1324 
1325 	if (end < start)
1326 		return -EINVAL;
1327 	if (end == start)
1328 		return 0;
1329 
1330 	new = mpol_new(mode, mode_flags, nmask);
1331 	if (IS_ERR(new))
1332 		return PTR_ERR(new);
1333 
1334 	/*
1335 	 * If we are using the default policy then operation
1336 	 * on discontinuous address spaces is okay after all
1337 	 */
1338 	if (!new)
1339 		flags |= MPOL_MF_DISCONTIG_OK;
1340 
1341 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1342 		lru_cache_disable();
1343 	{
1344 		NODEMASK_SCRATCH(scratch);
1345 		if (scratch) {
1346 			mmap_write_lock(mm);
1347 			err = mpol_set_nodemask(new, nmask, scratch);
1348 			if (err)
1349 				mmap_write_unlock(mm);
1350 		} else
1351 			err = -ENOMEM;
1352 		NODEMASK_SCRATCH_FREE(scratch);
1353 	}
1354 	if (err)
1355 		goto mpol_out;
1356 
1357 	/*
1358 	 * Lock the VMAs before scanning for pages to migrate,
1359 	 * to ensure we don't miss a concurrently inserted page.
1360 	 */
1361 	nr_failed = queue_pages_range(mm, start, end, nmask,
1362 			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
1363 
1364 	if (nr_failed < 0) {
1365 		err = nr_failed;
1366 		nr_failed = 0;
1367 	} else {
1368 		vma_iter_init(&vmi, mm, start);
1369 		prev = vma_prev(&vmi);
1370 		for_each_vma_range(vmi, vma, end) {
1371 			err = mbind_range(&vmi, vma, &prev, start, end, new);
1372 			if (err)
1373 				break;
1374 		}
1375 	}
1376 
1377 	if (!err && !list_empty(&pagelist)) {
1378 		/* Convert MPOL_DEFAULT's NULL to task or default policy */
1379 		if (!new) {
1380 			new = get_task_policy(current);
1381 			mpol_get(new);
1382 		}
1383 		mmpol.pol = new;
1384 		mmpol.ilx = 0;
1385 
1386 		/*
1387 		 * In the interleaved case, attempt to allocate on exactly the
1388 		 * targeted nodes, for the first VMA to be migrated; for later
1389 		 * VMAs, the nodes will still be interleaved from the targeted
1390 		 * nodemask, but one by one may be selected differently.
1391 		 */
1392 		if (new->mode == MPOL_INTERLEAVE ||
1393 		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1394 			struct folio *folio;
1395 			unsigned int order;
1396 			unsigned long addr = -EFAULT;
1397 
1398 			list_for_each_entry(folio, &pagelist, lru) {
1399 				if (!folio_test_ksm(folio))
1400 					break;
1401 			}
1402 			if (!list_entry_is_head(folio, &pagelist, lru)) {
1403 				vma_iter_init(&vmi, mm, start);
1404 				for_each_vma_range(vmi, vma, end) {
1405 					addr = page_address_in_vma(folio,
1406 						folio_page(folio, 0), vma);
1407 					if (addr != -EFAULT)
1408 						break;
1409 				}
1410 			}
1411 			if (addr != -EFAULT) {
1412 				order = folio_order(folio);
1413 				/* We already know the pol, but not the ilx */
1414 				mpol_cond_put(get_vma_policy(vma, addr, order,
1415 							     &mmpol.ilx));
1416 				/* Set base from which to increment by index */
1417 				mmpol.ilx -= folio->index >> order;
1418 			}
1419 		}
1420 	}
1421 
1422 	mmap_write_unlock(mm);
1423 
1424 	if (!err && !list_empty(&pagelist)) {
1425 		nr_failed |= migrate_pages(&pagelist,
1426 				alloc_migration_target_by_mpol, NULL,
1427 				(unsigned long)&mmpol, MIGRATE_SYNC,
1428 				MR_MEMPOLICY_MBIND, NULL);
1429 	}
1430 
1431 	if (nr_failed && (flags & MPOL_MF_STRICT))
1432 		err = -EIO;
1433 	if (!list_empty(&pagelist))
1434 		putback_movable_pages(&pagelist);
1435 mpol_out:
1436 	mpol_put(new);
1437 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1438 		lru_cache_enable();
1439 	return err;
1440 }
1441 
1442 /*
1443  * User space interface with variable sized bitmaps for nodelists.
1444  */
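/*
 * Copy in at most @maxnode bits from user space (compat-aware) and clear
 * any bits beyond @maxnode in the last word.
 */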
1445 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1446 		      unsigned long maxnode)
1447 {
1448 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1449 	int ret;
1450 
1451 	if (in_compat_syscall())
1452 		ret = compat_get_bitmap(mask,
1453 					(const compat_ulong_t __user *)nmask,
1454 					maxnode);
1455 	else
1456 		ret = copy_from_user(mask, nmask,
1457 				     nlongs * sizeof(unsigned long));
1458 
1459 	if (ret)
1460 		return -EFAULT;
1461 
1462 	if (maxnode % BITS_PER_LONG)
1463 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1464 
1465 	return 0;
1466 }
1467 
1468 /* Copy a node mask from user space. */
1469 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1470 		     unsigned long maxnode)
1471 {
1472 	--maxnode;
1473 	nodes_clear(*nodes);
1474 	if (maxnode == 0 || !nmask)
1475 		return 0;
1476 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1477 		return -EINVAL;
1478 
1479 	/*
1480 	 * When the user specified more nodes than supported just check
1481 	 * if the non supported part is all zero, one word at a time,
1482 	 * starting at the end.
1483 	 */
1484 	while (maxnode > MAX_NUMNODES) {
1485 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1486 		unsigned long t;
1487 
1488 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1489 			return -EFAULT;
1490 
1491 		if (maxnode - bits >= MAX_NUMNODES) {
1492 			maxnode -= bits;
1493 		} else {
1494 			maxnode = MAX_NUMNODES;
1495 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1496 		}
1497 		if (t)
1498 			return -EINVAL;
1499 	}
1500 
1501 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1502 }
1503 
1504 /* Copy a kernel node mask to user space */
1505 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1506 			      nodemask_t *nodes)
1507 {
1508 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1509 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1510 	bool compat = in_compat_syscall();
1511 
1512 	if (compat)
1513 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1514 
1515 	if (copy > nbytes) {
1516 		if (copy > PAGE_SIZE)
1517 			return -EINVAL;
1518 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1519 			return -EFAULT;
1520 		copy = nbytes;
1521 		maxnode = nr_node_ids;
1522 	}
1523 
1524 	if (compat)
1525 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1526 					 nodes_addr(*nodes), maxnode);
1527 
1528 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1529 }
1530 
1531 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1532 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1533 {
1534 	*flags = *mode & MPOL_MODE_FLAGS;
1535 	*mode &= ~MPOL_MODE_FLAGS;
1536 
1537 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1538 		return -EINVAL;
1539 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1540 		return -EINVAL;
1541 	if (*flags & MPOL_F_NUMA_BALANCING) {
1542 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1543 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1544 		else
1545 			return -EINVAL;
1546 	}
1547 	return 0;
1548 }
1549 
1550 static long kernel_mbind(unsigned long start, unsigned long len,
1551 			 unsigned long mode, const unsigned long __user *nmask,
1552 			 unsigned long maxnode, unsigned int flags)
1553 {
1554 	unsigned short mode_flags;
1555 	nodemask_t nodes;
1556 	int lmode = mode;
1557 	int err;
1558 
1559 	start = untagged_addr(start);
1560 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1561 	if (err)
1562 		return err;
1563 
1564 	err = get_nodes(&nodes, nmask, maxnode);
1565 	if (err)
1566 		return err;
1567 
1568 	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1569 }
1570 
1571 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1572 		unsigned long, home_node, unsigned long, flags)
1573 {
1574 	struct mm_struct *mm = current->mm;
1575 	struct vm_area_struct *vma, *prev;
1576 	struct mempolicy *new, *old;
1577 	unsigned long end;
1578 	int err = -ENOENT;
1579 	VMA_ITERATOR(vmi, mm, start);
1580 
1581 	start = untagged_addr(start);
1582 	if (start & ~PAGE_MASK)
1583 		return -EINVAL;
1584 	/*
1585 	 * flags is used for future extension if any.
1586 	 */
1587 	if (flags != 0)
1588 		return -EINVAL;
1589 
1590 	/*
1591 	 * Check home_node is online to avoid accessing uninitialized
1592 	 * NODE_DATA.
1593 	 */
1594 	if (home_node >= MAX_NUMNODES || !node_online(home_node))
1595 		return -EINVAL;
1596 
1597 	len = PAGE_ALIGN(len);
1598 	end = start + len;
1599 
1600 	if (end < start)
1601 		return -EINVAL;
1602 	if (end == start)
1603 		return 0;
1604 	mmap_write_lock(mm);
1605 	prev = vma_prev(&vmi);
1606 	for_each_vma_range(vmi, vma, end) {
1607 		/*
1608 		 * If any vma in the range got policy other than MPOL_BIND
1609 		 * or MPOL_PREFERRED_MANY we return error. We don't reset
1610 		 * the home node for vmas we already updated before.
1611 		 */
1612 		old = vma_policy(vma);
1613 		if (!old) {
1614 			prev = vma;
1615 			continue;
1616 		}
1617 		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1618 			err = -EOPNOTSUPP;
1619 			break;
1620 		}
1621 		new = mpol_dup(old);
1622 		if (IS_ERR(new)) {
1623 			err = PTR_ERR(new);
1624 			break;
1625 		}
1626 
1627 		vma_start_write(vma);
1628 		new->home_node = home_node;
1629 		err = mbind_range(&vmi, vma, &prev, start, end, new);
1630 		mpol_put(new);
1631 		if (err)
1632 			break;
1633 	}
1634 	mmap_write_unlock(mm);
1635 	return err;
1636 }
1637 
1638 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1639 		unsigned long, mode, const unsigned long __user *, nmask,
1640 		unsigned long, maxnode, unsigned int, flags)
1641 {
1642 	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1643 }
1644 
1645 /* Set the process memory policy */
1646 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1647 				 unsigned long maxnode)
1648 {
1649 	unsigned short mode_flags;
1650 	nodemask_t nodes;
1651 	int lmode = mode;
1652 	int err;
1653 
1654 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1655 	if (err)
1656 		return err;
1657 
1658 	err = get_nodes(&nodes, nmask, maxnode);
1659 	if (err)
1660 		return err;
1661 
1662 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1663 }
1664 
1665 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1666 		unsigned long, maxnode)
1667 {
1668 	return kernel_set_mempolicy(mode, nmask, maxnode);
1669 }
1670 
1671 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1672 				const unsigned long __user *old_nodes,
1673 				const unsigned long __user *new_nodes)
1674 {
1675 	struct mm_struct *mm = NULL;
1676 	struct task_struct *task;
1677 	nodemask_t task_nodes;
1678 	int err;
1679 	nodemask_t *old;
1680 	nodemask_t *new;
1681 	NODEMASK_SCRATCH(scratch);
1682 
1683 	if (!scratch)
1684 		return -ENOMEM;
1685 
1686 	old = &scratch->mask1;
1687 	new = &scratch->mask2;
1688 
1689 	err = get_nodes(old, old_nodes, maxnode);
1690 	if (err)
1691 		goto out;
1692 
1693 	err = get_nodes(new, new_nodes, maxnode);
1694 	if (err)
1695 		goto out;
1696 
1697 	/* Find the mm_struct */
1698 	rcu_read_lock();
1699 	task = pid ? find_task_by_vpid(pid) : current;
1700 	if (!task) {
1701 		rcu_read_unlock();
1702 		err = -ESRCH;
1703 		goto out;
1704 	}
1705 	get_task_struct(task);
1706 
1707 	err = -EINVAL;
1708 
1709 	/*
1710 	 * Check if this process has the right to modify the specified process.
1711 	 * Use the regular "ptrace_may_access()" checks.
1712 	 */
1713 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1714 		rcu_read_unlock();
1715 		err = -EPERM;
1716 		goto out_put;
1717 	}
1718 	rcu_read_unlock();
1719 
1720 	task_nodes = cpuset_mems_allowed(task);
1721 	/* Is the user allowed to access the target nodes? */
1722 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1723 		err = -EPERM;
1724 		goto out_put;
1725 	}
1726 
1727 	task_nodes = cpuset_mems_allowed(current);
1728 	nodes_and(*new, *new, task_nodes);
1729 	if (nodes_empty(*new))
1730 		goto out_put;
1731 
1732 	err = security_task_movememory(task);
1733 	if (err)
1734 		goto out_put;
1735 
1736 	mm = get_task_mm(task);
1737 	put_task_struct(task);
1738 
1739 	if (!mm) {
1740 		err = -EINVAL;
1741 		goto out;
1742 	}
1743 
1744 	err = do_migrate_pages(mm, old, new,
1745 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1746 
1747 	mmput(mm);
1748 out:
1749 	NODEMASK_SCRATCH_FREE(scratch);
1750 
1751 	return err;
1752 
1753 out_put:
1754 	put_task_struct(task);
1755 	goto out;
1756 }
1757 
1758 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1759 		const unsigned long __user *, old_nodes,
1760 		const unsigned long __user *, new_nodes)
1761 {
1762 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1763 }
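
/*
 * Illustrative note (not kernel code): migrate_pages(2) moves a target
 * task's pages that currently reside on old_nodes over to new_nodes, e.g.
 * roughly:
 *
 *	unsigned long old = 1UL << 0, new = 1UL << 1;
 *	migrate_pages(pid, 8 * sizeof(unsigned long), &old, &new);
 *
 * A hedged sketch only: the caller needs ptrace-level access to the target
 * task (checked in kernel_migrate_pages() above) and CAP_SYS_NICE to move
 * pages that are shared with other processes.
 */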
1764 
1765 /* Retrieve NUMA policy */
1766 static int kernel_get_mempolicy(int __user *policy,
1767 				unsigned long __user *nmask,
1768 				unsigned long maxnode,
1769 				unsigned long addr,
1770 				unsigned long flags)
1771 {
1772 	int err;
1773 	int pval;
1774 	nodemask_t nodes;
1775 
1776 	if (nmask != NULL && maxnode < nr_node_ids)
1777 		return -EINVAL;
1778 
1779 	addr = untagged_addr(addr);
1780 
1781 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1782 
1783 	if (err)
1784 		return err;
1785 
1786 	if (policy && put_user(pval, policy))
1787 		return -EFAULT;
1788 
1789 	if (nmask)
1790 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1791 
1792 	return err;
1793 }
1794 
1795 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1796 		unsigned long __user *, nmask, unsigned long, maxnode,
1797 		unsigned long, addr, unsigned long, flags)
1798 {
1799 	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1800 }
1801 
1802 bool vma_migratable(struct vm_area_struct *vma)
1803 {
1804 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1805 		return false;
1806 
1807 	/*
1808 	 * DAX device mappings require predictable access latency, so avoid
1809 	 * incurring periodic faults.
1810 	 */
1811 	if (vma_is_dax(vma))
1812 		return false;
1813 
1814 	if (is_vm_hugetlb_page(vma) &&
1815 		!hugepage_migration_supported(hstate_vma(vma)))
1816 		return false;
1817 
1818 	/*
1819 	 * Migration allocates pages in the highest zone. If we cannot
1820 	 * do so then migration (at least from node to node) is not
1821 	 * possible.
1822 	 */
1823 	if (vma->vm_file &&
1824 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1825 			< policy_zone)
1826 		return false;
1827 	return true;
1828 }
1829 
1830 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1831 				   unsigned long addr, pgoff_t *ilx)
1832 {
1833 	*ilx = 0;
1834 	return (vma->vm_ops && vma->vm_ops->get_policy) ?
1835 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
1836 }
1837 
1838 /*
1839  * get_vma_policy(@vma, @addr, @order, @ilx)
1840  * @vma: virtual memory area whose policy is sought
1841  * @addr: address in @vma for shared policy lookup
1842  * @order: 0, or appropriate huge_page_order for interleaving
1843  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1844  *       MPOL_WEIGHTED_INTERLEAVE
1845  *
1846  * Returns effective policy for a VMA at specified address.
1847  * Falls back to current->mempolicy or system default policy, as necessary.
1848  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1849  * count--added by the get_policy() vm_op, as appropriate--to protect against
1850  * freeing by another task.  It is the caller's responsibility to free the
1851  * extra reference for shared policies.
1852  */
1853 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1854 				 unsigned long addr, int order, pgoff_t *ilx)
1855 {
1856 	struct mempolicy *pol;
1857 
1858 	pol = __get_vma_policy(vma, addr, ilx);
1859 	if (!pol)
1860 		pol = get_task_policy(current);
1861 	if (pol->mode == MPOL_INTERLEAVE ||
1862 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1863 		*ilx += vma->vm_pgoff >> order;
1864 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1865 	}
1866 	return pol;
1867 }
1868 
1869 bool vma_policy_mof(struct vm_area_struct *vma)
1870 {
1871 	struct mempolicy *pol;
1872 
1873 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1874 		bool ret = false;
1875 		pgoff_t ilx;		/* ignored here */
1876 
1877 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1878 		if (pol && (pol->flags & MPOL_F_MOF))
1879 			ret = true;
1880 		mpol_cond_put(pol);
1881 
1882 		return ret;
1883 	}
1884 
1885 	pol = vma->vm_policy;
1886 	if (!pol)
1887 		pol = get_task_policy(current);
1888 
1889 	return pol->flags & MPOL_F_MOF;
1890 }
1891 
1892 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1893 {
1894 	enum zone_type dynamic_policy_zone = policy_zone;
1895 
1896 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1897 
1898 	/*
1899 	 * If policy->nodes contains movable memory only, apply the policy
1900 	 * only when gfp_zone(gfp) == ZONE_MOVABLE.
1901 	 *
1902 	 * policy->nodes has already been intersected with
1903 	 * node_states[N_MEMORY], so if the following test fails it implies
1904 	 * that policy->nodes contains movable memory only.
1905 	 */
1906 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1907 		dynamic_policy_zone = ZONE_MOVABLE;
1908 
1909 	return zone >= dynamic_policy_zone;
1910 }
1911 
1912 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
1913 {
1914 	unsigned int node;
1915 	unsigned int cpuset_mems_cookie;
1916 
1917 retry:
1918 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1919 	cpuset_mems_cookie = read_mems_allowed_begin();
1920 	node = current->il_prev;
1921 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
1922 		node = next_node_in(node, policy->nodes);
1923 		if (read_mems_allowed_retry(cpuset_mems_cookie))
1924 			goto retry;
1925 		if (node == MAX_NUMNODES)
1926 			return node;
1927 		current->il_prev = node;
1928 		current->il_weight = get_il_weight(node);
1929 	}
1930 	current->il_weight--;
1931 	return node;
1932 }
1933 
1934 /* Do dynamic interleaving for a process */
1935 static unsigned int interleave_nodes(struct mempolicy *policy)
1936 {
1937 	unsigned int nid;
1938 	unsigned int cpuset_mems_cookie;
1939 
1940 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
1941 	do {
1942 		cpuset_mems_cookie = read_mems_allowed_begin();
1943 		nid = next_node_in(current->il_prev, policy->nodes);
1944 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
1945 
1946 	if (nid < MAX_NUMNODES)
1947 		current->il_prev = nid;
1948 	return nid;
1949 }
1950 
1951 /*
1952  * Depending on the memory policy provide a node from which to allocate the
1953  * next slab entry.
1954  */
1955 unsigned int mempolicy_slab_node(void)
1956 {
1957 	struct mempolicy *policy;
1958 	int node = numa_mem_id();
1959 
1960 	if (!in_task())
1961 		return node;
1962 
1963 	policy = current->mempolicy;
1964 	if (!policy)
1965 		return node;
1966 
1967 	switch (policy->mode) {
1968 	case MPOL_PREFERRED:
1969 		return first_node(policy->nodes);
1970 
1971 	case MPOL_INTERLEAVE:
1972 		return interleave_nodes(policy);
1973 
1974 	case MPOL_WEIGHTED_INTERLEAVE:
1975 		return weighted_interleave_nodes(policy);
1976 
1977 	case MPOL_BIND:
1978 	case MPOL_PREFERRED_MANY:
1979 	{
1980 		struct zoneref *z;
1981 
1982 		/*
1983 		 * Follow bind policy behavior and start allocation at the
1984 		 * first node.
1985 		 */
1986 		struct zonelist *zonelist;
1987 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1988 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1989 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1990 							&policy->nodes);
1991 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
1992 	}
1993 	case MPOL_LOCAL:
1994 		return node;
1995 
1996 	default:
1997 		BUG();
1998 	}
1999 }
2000 
2001 static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2002 					      nodemask_t *mask)
2003 {
2004 	/*
2005 	 * barrier stabilizes the nodemask locally so that it can be iterated
2006 	 * over safely without concern for changes. Allocators validate node
2007 	 * selection does not violate mems_allowed, so this is safe.
2008 	 */
2009 	barrier();
2010 	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2011 	barrier();
2012 	return nodes_weight(*mask);
2013 }
2014 
2015 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2016 {
2017 	nodemask_t nodemask;
2018 	unsigned int target, nr_nodes;
2019 	u8 *table;
2020 	unsigned int weight_total = 0;
2021 	u8 weight;
2022 	int nid;
2023 
2024 	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
2025 	if (!nr_nodes)
2026 		return numa_node_id();
2027 
2028 	rcu_read_lock();
2029 	table = rcu_dereference(iw_table);
2030 	/* calculate the total weight */
2031 	for_each_node_mask(nid, nodemask) {
2032 		/* detect system default usage */
2033 		weight = table ? table[nid] : 1;
2034 		weight = weight ? weight : 1;
2035 		weight_total += weight;
2036 	}
2037 
2038 	/* Calculate the node offset based on totals */
2039 	target = ilx % weight_total;
2040 	nid = first_node(nodemask);
2041 	while (target) {
2042 		/* detect system default usage */
2043 		weight = table ? table[nid] : 1;
2044 		weight = weight ? weight : 1;
2045 		if (target < weight)
2046 			break;
2047 		target -= weight;
2048 		nid = next_node_in(nid, nodemask);
2049 	}
2050 	rcu_read_unlock();
2051 	return nid;
2052 }
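
/*
 * Illustrative example (assumed weights): with nodemask = {0,2} and weights
 * node0=3, node2=1, weight_total is 4, so ilx values 0..2 map to node 0 and
 * ilx 3 maps to node 2; the pattern then repeats every 4 indices because of
 * the ilx % weight_total step above.
 */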
2053 
2054 /*
2055  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2056  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2057  * exceeds the number of present nodes.
2058  */
2059 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2060 {
2061 	nodemask_t nodemask;
2062 	unsigned int target, nnodes;
2063 	int i;
2064 	int nid;
2065 
2066 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2067 	if (!nnodes)
2068 		return numa_node_id();
2069 	target = ilx % nnodes;
2070 	nid = first_node(nodemask);
2071 	for (i = 0; i < target; i++)
2072 		nid = next_node(nid, nodemask);
2073 	return nid;
2074 }
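
/*
 * Illustrative example: with nodemask = {0,1,3} the mapping above is
 * ilx 0 -> node 0, ilx 1 -> node 1, ilx 2 -> node 3, ilx 3 -> node 0, and
 * so on, since target = ilx % nnodes simply wraps around the nodemask.
 */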
2075 
2076 /*
2077  * Return a nodemask representing a mempolicy for filtering nodes for
2078  * page allocation, together with preferred node id (or the input node id).
2079  */
2080 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2081 				   pgoff_t ilx, int *nid)
2082 {
2083 	nodemask_t *nodemask = NULL;
2084 
2085 	switch (pol->mode) {
2086 	case MPOL_PREFERRED:
2087 		/* Override input node id */
2088 		*nid = first_node(pol->nodes);
2089 		break;
2090 	case MPOL_PREFERRED_MANY:
2091 		nodemask = &pol->nodes;
2092 		if (pol->home_node != NUMA_NO_NODE)
2093 			*nid = pol->home_node;
2094 		break;
2095 	case MPOL_BIND:
2096 		/* Restrict to nodemask (but not on lower zones) */
2097 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2098 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2099 			nodemask = &pol->nodes;
2100 		if (pol->home_node != NUMA_NO_NODE)
2101 			*nid = pol->home_node;
2102 		/*
2103 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2104 		 * because it could easily break the expectation of staying on
2105 		 * the requested node and thereby violate the policy.
2106 		 */
2107 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2108 		break;
2109 	case MPOL_INTERLEAVE:
2110 		/* Override input node id */
2111 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2112 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2113 		break;
2114 	case MPOL_WEIGHTED_INTERLEAVE:
2115 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2116 			weighted_interleave_nodes(pol) :
2117 			weighted_interleave_nid(pol, ilx);
2118 		break;
2119 	}
2120 
2121 	return nodemask;
2122 }
2123 
2124 #ifdef CONFIG_HUGETLBFS
2125 /*
2126  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2127  * @vma: virtual memory area whose policy is sought
2128  * @addr: address in @vma for shared policy lookup and interleave policy
2129  * @gfp_flags: for requested zone
2130  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2131  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2132  *
2133  * Returns a nid suitable for a huge page allocation and a pointer
2134  * to the struct mempolicy for conditional unref after allocation.
2135  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2136  * to the mempolicy's @nodemask for filtering the zonelist.
2137  */
2138 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2139 		struct mempolicy **mpol, nodemask_t **nodemask)
2140 {
2141 	pgoff_t ilx;
2142 	int nid;
2143 
2144 	nid = numa_node_id();
2145 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2146 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2147 	return nid;
2148 }
2149 
2150 /*
2151  * init_nodemask_of_mempolicy
2152  *
2153  * If the current task's mempolicy is "default" [NULL], return 'false'
2154  * to indicate default policy.  Otherwise, extract the policy nodemask
2155  * for 'bind' or 'interleave' policy into the argument nodemask, or
2156  * initialize the argument nodemask to contain the single node for
2157  * 'preferred' or 'local' policy and return 'true' to indicate presence
2158  * of non-default mempolicy.
2159  *
2160  * We don't bother with reference counting the mempolicy [mpol_get/put]
2161  * because the current task is examining its own mempolicy and a task's
2162  * mempolicy is only ever changed by the task itself.
2163  *
2164  * N.B., it is the caller's responsibility to free a returned nodemask.
2165  */
2166 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2167 {
2168 	struct mempolicy *mempolicy;
2169 
2170 	if (!(mask && current->mempolicy))
2171 		return false;
2172 
2173 	task_lock(current);
2174 	mempolicy = current->mempolicy;
2175 	switch (mempolicy->mode) {
2176 	case MPOL_PREFERRED:
2177 	case MPOL_PREFERRED_MANY:
2178 	case MPOL_BIND:
2179 	case MPOL_INTERLEAVE:
2180 	case MPOL_WEIGHTED_INTERLEAVE:
2181 		*mask = mempolicy->nodes;
2182 		break;
2183 
2184 	case MPOL_LOCAL:
2185 		init_nodemask_of_node(mask, numa_node_id());
2186 		break;
2187 
2188 	default:
2189 		BUG();
2190 	}
2191 	task_unlock(current);
2192 
2193 	return true;
2194 }
2195 #endif
2196 
2197 /*
2198  * mempolicy_in_oom_domain
2199  *
2200  * If tsk's mempolicy is "bind", check for intersection between mask and
2201  * the policy nodemask. Otherwise, return true for all other policies
2202  * including "interleave", as a tsk with "interleave" policy may have
2203  * memory allocated from all nodes in system.
2204  *
2205  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2206  */
2207 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2208 					const nodemask_t *mask)
2209 {
2210 	struct mempolicy *mempolicy;
2211 	bool ret = true;
2212 
2213 	if (!mask)
2214 		return ret;
2215 
2216 	task_lock(tsk);
2217 	mempolicy = tsk->mempolicy;
2218 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2219 		ret = nodes_intersects(mempolicy->nodes, *mask);
2220 	task_unlock(tsk);
2221 
2222 	return ret;
2223 }
2224 
2225 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2226 						int nid, nodemask_t *nodemask)
2227 {
2228 	struct page *page;
2229 	gfp_t preferred_gfp;
2230 
2231 	/*
2232 	 * This is a two-pass approach. The first pass only tries the
2233 	 * preferred nodes but skips direct reclaim and allows the
2234 	 * allocation to fail, while the second pass tries all the
2235 	 * nodes in the system.
2236 	 */
2237 	preferred_gfp = gfp | __GFP_NOWARN;
2238 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2239 	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2240 	if (!page)
2241 		page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2242 
2243 	return page;
2244 }
2245 
2246 /**
2247  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2248  * @gfp: GFP flags.
2249  * @order: Order of the page allocation.
2250  * @pol: Pointer to the NUMA mempolicy.
2251  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2252  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2253  *
2254  * Return: The page on success or NULL if allocation fails.
2255  */
2256 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2257 		struct mempolicy *pol, pgoff_t ilx, int nid)
2258 {
2259 	nodemask_t *nodemask;
2260 	struct page *page;
2261 
2262 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2263 
2264 	if (pol->mode == MPOL_PREFERRED_MANY)
2265 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2266 
2267 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2268 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2269 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2270 		/*
2271 		 * For hugepage allocation and non-interleave policy which
2272 		 * allows the current node (or other explicitly preferred
2273 		 * node) we only try to allocate from the current/preferred
2274 		 * node and don't fall back to other nodes, as the cost of
2275 		 * remote accesses would likely offset THP benefits.
2276 		 *
2277 		 * If the policy is interleave or does not allow the current
2278 		 * node in its nodemask, we allocate the standard way.
2279 		 */
2280 		if (pol->mode != MPOL_INTERLEAVE &&
2281 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2282 		    (!nodemask || node_isset(nid, *nodemask))) {
2283 			/*
2284 			 * First, try to allocate THP only on local node, but
2285 			 * don't reclaim unnecessarily, just compact.
2286 			 */
2287 			page = __alloc_frozen_pages_noprof(
2288 				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2289 				nid, NULL);
2290 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2291 				return page;
2292 			/*
2293 			 * If hugepage allocations are configured to always
2294 			 * synchronous compact or the vma has been madvised
2295 			 * to prefer hugepage backing, retry allowing remote
2296 			 * memory with both reclaim and compact as well.
2297 			 */
2298 		}
2299 	}
2300 
2301 	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2302 
2303 	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2304 		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2305 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2306 		if (static_branch_likely(&vm_numa_stat_key) &&
2307 		    page_to_nid(page) == nid) {
2308 			preempt_disable();
2309 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2310 			preempt_enable();
2311 		}
2312 	}
2313 
2314 	return page;
2315 }
2316 
2317 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
2318 		struct mempolicy *pol, pgoff_t ilx, int nid)
2319 {
2320 	struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
2321 			ilx, nid);
2322 	if (!page)
2323 		return NULL;
2324 
2325 	set_page_refcounted(page);
2326 	return page_rmappable_folio(page);
2327 }
2328 
2329 /**
2330  * vma_alloc_folio - Allocate a folio for a VMA.
2331  * @gfp: GFP flags.
2332  * @order: Order of the folio.
2333  * @vma: Pointer to VMA.
2334  * @addr: Virtual address of the allocation.  Must be inside @vma.
2335  *
2336  * Allocate a folio for a specific address in @vma, using the appropriate
2337  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2338  * VMA to prevent it from going away.  Should be used for all allocations
2339  * for folios that will be mapped into user space, excepting hugetlbfs, and
2340  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2341  *
2342  * Return: The folio on success or NULL if allocation fails.
2343  */
2344 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2345 		unsigned long addr)
2346 {
2347 	struct mempolicy *pol;
2348 	pgoff_t ilx;
2349 	struct folio *folio;
2350 
2351 	if (vma->vm_flags & VM_DROPPABLE)
2352 		gfp |= __GFP_NOWARN;
2353 
2354 	pol = get_vma_policy(vma, addr, order, &ilx);
2355 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2356 	mpol_cond_put(pol);
2357 	return folio;
2358 }
2359 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2360 
2361 struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
2362 {
2363 	struct mempolicy *pol = &default_policy;
2364 
2365 	/*
2366 	 * No reference counting needed for current->mempolicy
2367 	 * nor system default_policy
2368 	 */
2369 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2370 		pol = get_task_policy(current);
2371 
2372 	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2373 				       numa_node_id());
2374 }
2375 
2376 /**
2377  * alloc_pages - Allocate pages.
2378  * @gfp: GFP flags.
2379  * @order: Power of two of number of pages to allocate.
2380  *
2381  * Allocate 1 << @order contiguous pages.  The physical address of the
2382  * first page is naturally aligned (eg an order-3 allocation will be aligned
2383  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2384  * process is honoured when in process context.
2385  *
2386  * Context: Can be called from any context, providing the appropriate GFP
2387  * flags are used.
2388  * Return: The page on success or NULL if allocation fails.
2389  */
2390 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2391 {
2392 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2393 
2394 	if (page)
2395 		set_page_refcounted(page);
2396 	return page;
2397 }
2398 EXPORT_SYMBOL(alloc_pages_noprof);
2399 
2400 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
2401 {
2402 	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
2403 }
2404 EXPORT_SYMBOL(folio_alloc_noprof);
2405 
2406 static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2407 		struct mempolicy *pol, unsigned long nr_pages,
2408 		struct page **page_array)
2409 {
2410 	int nodes;
2411 	unsigned long nr_pages_per_node;
2412 	int delta;
2413 	int i;
2414 	unsigned long nr_allocated;
2415 	unsigned long total_allocated = 0;
2416 
2417 	nodes = nodes_weight(pol->nodes);
2418 	nr_pages_per_node = nr_pages / nodes;
2419 	delta = nr_pages - nodes * nr_pages_per_node;
2420 
2421 	for (i = 0; i < nodes; i++) {
2422 		if (delta) {
2423 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2424 					interleave_nodes(pol), NULL,
2425 					nr_pages_per_node + 1,
2426 					page_array);
2427 			delta--;
2428 		} else {
2429 			nr_allocated = alloc_pages_bulk_noprof(gfp,
2430 					interleave_nodes(pol), NULL,
2431 					nr_pages_per_node, page_array);
2432 		}
2433 
2434 		page_array += nr_allocated;
2435 		total_allocated += nr_allocated;
2436 	}
2437 
2438 	return total_allocated;
2439 }
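
/*
 * Illustrative example: for nr_pages = 10 spread over 3 policy nodes,
 * nr_pages_per_node is 3 and delta is 1, so the first node visited gets
 * 4 pages and the remaining two get 3 each, matching the split done in
 * the loop above.
 */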
2440 
2441 static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2442 		struct mempolicy *pol, unsigned long nr_pages,
2443 		struct page **page_array)
2444 {
2445 	struct task_struct *me = current;
2446 	unsigned int cpuset_mems_cookie;
2447 	unsigned long total_allocated = 0;
2448 	unsigned long nr_allocated = 0;
2449 	unsigned long rounds;
2450 	unsigned long node_pages, delta;
2451 	u8 *table, *weights, weight;
2452 	unsigned int weight_total = 0;
2453 	unsigned long rem_pages = nr_pages;
2454 	nodemask_t nodes;
2455 	int nnodes, node;
2456 	int resume_node = MAX_NUMNODES - 1;
2457 	u8 resume_weight = 0;
2458 	int prev_node;
2459 	int i;
2460 
2461 	if (!nr_pages)
2462 		return 0;
2463 
2464 	/* read the nodes onto the stack, retry if done during rebind */
2465 	do {
2466 		cpuset_mems_cookie = read_mems_allowed_begin();
2467 		nnodes = read_once_policy_nodemask(pol, &nodes);
2468 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2469 
2470 	/* if the nodemask has become invalid, we cannot do anything */
2471 	if (!nnodes)
2472 		return 0;
2473 
2474 	/* Continue allocating from most recent node and adjust the nr_pages */
2475 	node = me->il_prev;
2476 	weight = me->il_weight;
2477 	if (weight && node_isset(node, nodes)) {
2478 		node_pages = min(rem_pages, weight);
2479 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2480 						  page_array);
2481 		page_array += nr_allocated;
2482 		total_allocated += nr_allocated;
2483 		/* if that's all the pages, no need to interleave */
2484 		if (rem_pages <= weight) {
2485 			me->il_weight -= rem_pages;
2486 			return total_allocated;
2487 		}
2488 		/* Otherwise we adjust remaining pages, continue from there */
2489 		rem_pages -= weight;
2490 	}
2491 	/* clear active weight in case of an allocation failure */
2492 	me->il_weight = 0;
2493 	prev_node = node;
2494 
2495 	/* create a local copy of node weights to operate on outside rcu */
2496 	weights = kzalloc(nr_node_ids, GFP_KERNEL);
2497 	if (!weights)
2498 		return total_allocated;
2499 
2500 	rcu_read_lock();
2501 	table = rcu_dereference(iw_table);
2502 	if (table)
2503 		memcpy(weights, table, nr_node_ids);
2504 	rcu_read_unlock();
2505 
2506 	/* calculate total, detect system default usage */
2507 	for_each_node_mask(node, nodes) {
2508 		if (!weights[node])
2509 			weights[node] = 1;
2510 		weight_total += weights[node];
2511 	}
2512 
2513 	/*
2514 	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2515 	 * Track which node weighted interleave should resume from.
2516 	 *
2517 	 * If (rounds > 0) and (delta == 0), resume_node will always be
2518 	 * the node following prev_node, with resume_weight set to its weight.
2519 	 */
2520 	rounds = rem_pages / weight_total;
2521 	delta = rem_pages % weight_total;
2522 	resume_node = next_node_in(prev_node, nodes);
2523 	resume_weight = weights[resume_node];
2524 	for (i = 0; i < nnodes; i++) {
2525 		node = next_node_in(prev_node, nodes);
2526 		weight = weights[node];
2527 		node_pages = weight * rounds;
2528 		/* If a delta exists, add this node's portion of the delta */
2529 		if (delta > weight) {
2530 			node_pages += weight;
2531 			delta -= weight;
2532 		} else if (delta) {
2533 			/* when delta is depleted, resume from that node */
2534 			node_pages += delta;
2535 			resume_node = node;
2536 			resume_weight = weight - delta;
2537 			delta = 0;
2538 		}
2539 		/* node_pages can be 0 if an allocation fails and rounds == 0 */
2540 		if (!node_pages)
2541 			break;
2542 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2543 						  page_array);
2544 		page_array += nr_allocated;
2545 		total_allocated += nr_allocated;
2546 		if (total_allocated == nr_pages)
2547 			break;
2548 		prev_node = node;
2549 	}
2550 	me->il_prev = resume_node;
2551 	me->il_weight = resume_weight;
2552 	kfree(weights);
2553 	return total_allocated;
2554 }
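
/*
 * Illustrative example (assumed weights, and assuming the weight-3 node is
 * visited first): with rem_pages = 10 over two nodes of weight 3 and 1,
 * weight_total = 4, so rounds = 2 and delta = 2. The weight-3 node receives
 * 3 * 2 + 2 = 8 pages and becomes resume_node with resume_weight = 1, while
 * the weight-1 node receives the remaining 2 pages.
 */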
2555 
2556 static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2557 		struct mempolicy *pol, unsigned long nr_pages,
2558 		struct page **page_array)
2559 {
2560 	gfp_t preferred_gfp;
2561 	unsigned long nr_allocated = 0;
2562 
2563 	preferred_gfp = gfp | __GFP_NOWARN;
2564 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2565 
2566 	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
2567 					   nr_pages, page_array);
2568 
2569 	if (nr_allocated < nr_pages)
2570 		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
2571 				nr_pages - nr_allocated,
2572 				page_array + nr_allocated);
2573 	return nr_allocated;
2574 }
2575 
2576 /*
2577  * Bulk page allocation and mempolicy should be considered together in
2578  * some situations, such as vmalloc.
2579  *
2580  * Doing so can accelerate allocation, especially interleaved allocations.
2581  */
2582 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2583 		unsigned long nr_pages, struct page **page_array)
2584 {
2585 	struct mempolicy *pol = &default_policy;
2586 	nodemask_t *nodemask;
2587 	int nid;
2588 
2589 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2590 		pol = get_task_policy(current);
2591 
2592 	if (pol->mode == MPOL_INTERLEAVE)
2593 		return alloc_pages_bulk_interleave(gfp, pol,
2594 							 nr_pages, page_array);
2595 
2596 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2597 		return alloc_pages_bulk_weighted_interleave(
2598 				  gfp, pol, nr_pages, page_array);
2599 
2600 	if (pol->mode == MPOL_PREFERRED_MANY)
2601 		return alloc_pages_bulk_preferred_many(gfp,
2602 				numa_node_id(), pol, nr_pages, page_array);
2603 
2604 	nid = numa_node_id();
2605 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2606 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2607 				       nr_pages, page_array);
2608 }
2609 
2610 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2611 {
2612 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2613 
2614 	if (IS_ERR(pol))
2615 		return PTR_ERR(pol);
2616 	dst->vm_policy = pol;
2617 	return 0;
2618 }
2619 
2620 /*
2621  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2622  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
2623  * with the mems_allowed returned by cpuset_mems_allowed().  This
2624  * keeps mempolicies cpuset relative after its cpuset moves.  See
2625  * further kernel/cpuset.c update_nodemask().
2626  *
2627  * current's mempolicy may be rebound by another task (the one that changes
2628  * the cpuset's mems), so we needn't do the rebind work for the current task.
2629  */
2630 
2631 /* Slow path of a mempolicy duplicate */
2632 struct mempolicy *__mpol_dup(struct mempolicy *old)
2633 {
2634 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2635 
2636 	if (!new)
2637 		return ERR_PTR(-ENOMEM);
2638 
2639 	/* task's mempolicy is protected by alloc_lock */
2640 	if (old == current->mempolicy) {
2641 		task_lock(current);
2642 		*new = *old;
2643 		task_unlock(current);
2644 	} else
2645 		*new = *old;
2646 
2647 	if (current_cpuset_is_being_rebound()) {
2648 		nodemask_t mems = cpuset_mems_allowed(current);
2649 		mpol_rebind_policy(new, &mems);
2650 	}
2651 	atomic_set(&new->refcnt, 1);
2652 	return new;
2653 }
2654 
2655 /* Slow path of a mempolicy comparison */
2656 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2657 {
2658 	if (!a || !b)
2659 		return false;
2660 	if (a->mode != b->mode)
2661 		return false;
2662 	if (a->flags != b->flags)
2663 		return false;
2664 	if (a->home_node != b->home_node)
2665 		return false;
2666 	if (mpol_store_user_nodemask(a))
2667 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2668 			return false;
2669 
2670 	switch (a->mode) {
2671 	case MPOL_BIND:
2672 	case MPOL_INTERLEAVE:
2673 	case MPOL_PREFERRED:
2674 	case MPOL_PREFERRED_MANY:
2675 	case MPOL_WEIGHTED_INTERLEAVE:
2676 		return !!nodes_equal(a->nodes, b->nodes);
2677 	case MPOL_LOCAL:
2678 		return true;
2679 	default:
2680 		BUG();
2681 		return false;
2682 	}
2683 }
2684 
2685 /*
2686  * Shared memory backing store policy support.
2687  *
2688  * Remember policies even when nobody has shared memory mapped.
2689  * The policies are kept in Red-Black tree linked from the inode.
2690  * They are protected by the sp->lock rwlock, which should be held
2691  * for any accesses to the tree.
2692  */
2693 
2694 /*
2695  * Look up the first element intersecting start-end.  Caller holds sp->lock
2696  * for reading or for writing.
2697  */
2698 static struct sp_node *sp_lookup(struct shared_policy *sp,
2699 					pgoff_t start, pgoff_t end)
2700 {
2701 	struct rb_node *n = sp->root.rb_node;
2702 
2703 	while (n) {
2704 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2705 
2706 		if (start >= p->end)
2707 			n = n->rb_right;
2708 		else if (end <= p->start)
2709 			n = n->rb_left;
2710 		else
2711 			break;
2712 	}
2713 	if (!n)
2714 		return NULL;
2715 	for (;;) {
2716 		struct sp_node *w = NULL;
2717 		struct rb_node *prev = rb_prev(n);
2718 		if (!prev)
2719 			break;
2720 		w = rb_entry(prev, struct sp_node, nd);
2721 		if (w->end <= start)
2722 			break;
2723 		n = prev;
2724 	}
2725 	return rb_entry(n, struct sp_node, nd);
2726 }
2727 
2728 /*
2729  * Insert a new shared policy into the list.  Caller holds sp->lock for
2730  * writing.
2731  */
2732 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2733 {
2734 	struct rb_node **p = &sp->root.rb_node;
2735 	struct rb_node *parent = NULL;
2736 	struct sp_node *nd;
2737 
2738 	while (*p) {
2739 		parent = *p;
2740 		nd = rb_entry(parent, struct sp_node, nd);
2741 		if (new->start < nd->start)
2742 			p = &(*p)->rb_left;
2743 		else if (new->end > nd->end)
2744 			p = &(*p)->rb_right;
2745 		else
2746 			BUG();
2747 	}
2748 	rb_link_node(&new->nd, parent, p);
2749 	rb_insert_color(&new->nd, &sp->root);
2750 }
2751 
2752 /* Find shared policy intersecting idx */
2753 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2754 						pgoff_t idx)
2755 {
2756 	struct mempolicy *pol = NULL;
2757 	struct sp_node *sn;
2758 
2759 	if (!sp->root.rb_node)
2760 		return NULL;
2761 	read_lock(&sp->lock);
2762 	sn = sp_lookup(sp, idx, idx+1);
2763 	if (sn) {
2764 		mpol_get(sn->policy);
2765 		pol = sn->policy;
2766 	}
2767 	read_unlock(&sp->lock);
2768 	return pol;
2769 }
2770 
2771 static void sp_free(struct sp_node *n)
2772 {
2773 	mpol_put(n->policy);
2774 	kmem_cache_free(sn_cache, n);
2775 }
2776 
2777 /**
2778  * mpol_misplaced - check whether current folio node is valid in policy
2779  *
2780  * @folio: folio to be checked
2781  * @vmf: structure describing the fault
2782  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2783  *
2784  * Look up the current policy node id for vma,addr and "compare to" the
2785  * folio's node id.  Policy determination "mimics" alloc_page_vma().
2786  * Called from the fault path where we know the vma and faulting address.
2787  *
2788  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2789  * policy, or a suitable node ID to allocate a replacement folio from.
2790  */
2791 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2792 		   unsigned long addr)
2793 {
2794 	struct mempolicy *pol;
2795 	pgoff_t ilx;
2796 	struct zoneref *z;
2797 	int curnid = folio_nid(folio);
2798 	struct vm_area_struct *vma = vmf->vma;
2799 	int thiscpu = raw_smp_processor_id();
2800 	int thisnid = numa_node_id();
2801 	int polnid = NUMA_NO_NODE;
2802 	int ret = NUMA_NO_NODE;
2803 
2804 	/*
2805 	 * Make sure ptl is held so that we don't preempt and we
2806 	 * have a stable smp processor id
2807 	 */
2808 	lockdep_assert_held(vmf->ptl);
2809 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2810 	if (!(pol->flags & MPOL_F_MOF))
2811 		goto out;
2812 
2813 	switch (pol->mode) {
2814 	case MPOL_INTERLEAVE:
2815 		polnid = interleave_nid(pol, ilx);
2816 		break;
2817 
2818 	case MPOL_WEIGHTED_INTERLEAVE:
2819 		polnid = weighted_interleave_nid(pol, ilx);
2820 		break;
2821 
2822 	case MPOL_PREFERRED:
2823 		if (node_isset(curnid, pol->nodes))
2824 			goto out;
2825 		polnid = first_node(pol->nodes);
2826 		break;
2827 
2828 	case MPOL_LOCAL:
2829 		polnid = numa_node_id();
2830 		break;
2831 
2832 	case MPOL_BIND:
2833 	case MPOL_PREFERRED_MANY:
2834 		/*
2835 		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
2836 		 * the policy nodemask, we don't allow NUMA migration to nodes
2837 		 * outside the policy nodemask for now. This is done so that if we
2838 		 * want demotion to slow memory to happen, before allocating
2839 		 * from some DRAM node, say 'x', we will end up using a
2840 		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such a scenario
2841 		 * we should not promote to node 'x' from a slow memory node.
2842 		 */
2843 		if (pol->flags & MPOL_F_MORON) {
2844 			/*
2845 			 * Optimize placement among multiple nodes
2846 			 * via NUMA balancing
2847 			 */
2848 			if (node_isset(thisnid, pol->nodes))
2849 				break;
2850 			goto out;
2851 		}
2852 
2853 		/*
2854 		 * use current page if in policy nodemask,
2855 		 * else select nearest allowed node, if any.
2856 		 * If no allowed nodes, use current [!misplaced].
2857 		 */
2858 		if (node_isset(curnid, pol->nodes))
2859 			goto out;
2860 		z = first_zones_zonelist(
2861 				node_zonelist(thisnid, GFP_HIGHUSER),
2862 				gfp_zone(GFP_HIGHUSER),
2863 				&pol->nodes);
2864 		polnid = zonelist_node_idx(z);
2865 		break;
2866 
2867 	default:
2868 		BUG();
2869 	}
2870 
2871 	/* Migrate the folio towards the node whose CPU is referencing it */
2872 	if (pol->flags & MPOL_F_MORON) {
2873 		polnid = thisnid;
2874 
2875 		if (!should_numa_migrate_memory(current, folio, curnid,
2876 						thiscpu))
2877 			goto out;
2878 	}
2879 
2880 	if (curnid != polnid)
2881 		ret = polnid;
2882 out:
2883 	mpol_cond_put(pol);
2884 
2885 	return ret;
2886 }
2887 
2888 /*
2889  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2890  * dropped after task->mempolicy is set to NULL so that any allocation done as
2891  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2892  * policy.
2893  */
2894 void mpol_put_task_policy(struct task_struct *task)
2895 {
2896 	struct mempolicy *pol;
2897 
2898 	task_lock(task);
2899 	pol = task->mempolicy;
2900 	task->mempolicy = NULL;
2901 	task_unlock(task);
2902 	mpol_put(pol);
2903 }
2904 
2905 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2906 {
2907 	rb_erase(&n->nd, &sp->root);
2908 	sp_free(n);
2909 }
2910 
2911 static void sp_node_init(struct sp_node *node, unsigned long start,
2912 			unsigned long end, struct mempolicy *pol)
2913 {
2914 	node->start = start;
2915 	node->end = end;
2916 	node->policy = pol;
2917 }
2918 
2919 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2920 				struct mempolicy *pol)
2921 {
2922 	struct sp_node *n;
2923 	struct mempolicy *newpol;
2924 
2925 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2926 	if (!n)
2927 		return NULL;
2928 
2929 	newpol = mpol_dup(pol);
2930 	if (IS_ERR(newpol)) {
2931 		kmem_cache_free(sn_cache, n);
2932 		return NULL;
2933 	}
2934 	newpol->flags |= MPOL_F_SHARED;
2935 	sp_node_init(n, start, end, newpol);
2936 
2937 	return n;
2938 }
2939 
2940 /* Replace a policy range. */
2941 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
2942 				 pgoff_t end, struct sp_node *new)
2943 {
2944 	struct sp_node *n;
2945 	struct sp_node *n_new = NULL;
2946 	struct mempolicy *mpol_new = NULL;
2947 	int ret = 0;
2948 
2949 restart:
2950 	write_lock(&sp->lock);
2951 	n = sp_lookup(sp, start, end);
2952 	/* Take care of old policies in the same range. */
2953 	while (n && n->start < end) {
2954 		struct rb_node *next = rb_next(&n->nd);
2955 		if (n->start >= start) {
2956 			if (n->end <= end)
2957 				sp_delete(sp, n);
2958 			else
2959 				n->start = end;
2960 		} else {
2961 			/* Old policy spanning whole new range. */
2962 			if (n->end > end) {
2963 				if (!n_new)
2964 					goto alloc_new;
2965 
2966 				*mpol_new = *n->policy;
2967 				atomic_set(&mpol_new->refcnt, 1);
2968 				sp_node_init(n_new, end, n->end, mpol_new);
2969 				n->end = start;
2970 				sp_insert(sp, n_new);
2971 				n_new = NULL;
2972 				mpol_new = NULL;
2973 				break;
2974 			} else
2975 				n->end = start;
2976 		}
2977 		if (!next)
2978 			break;
2979 		n = rb_entry(next, struct sp_node, nd);
2980 	}
2981 	if (new)
2982 		sp_insert(sp, new);
2983 	write_unlock(&sp->lock);
2984 	ret = 0;
2985 
2986 err_out:
2987 	if (mpol_new)
2988 		mpol_put(mpol_new);
2989 	if (n_new)
2990 		kmem_cache_free(sn_cache, n_new);
2991 
2992 	return ret;
2993 
2994 alloc_new:
2995 	write_unlock(&sp->lock);
2996 	ret = -ENOMEM;
2997 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2998 	if (!n_new)
2999 		goto err_out;
3000 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3001 	if (!mpol_new)
3002 		goto err_out;
3003 	atomic_set(&mpol_new->refcnt, 1);
3004 	goto restart;
3005 }
3006 
3007 /**
3008  * mpol_shared_policy_init - initialize shared policy for inode
3009  * @sp: pointer to inode shared policy
3010  * @mpol:  struct mempolicy to install
3011  *
3012  * Install non-NULL @mpol in inode's shared policy rb-tree.
3013  * On entry, the current task has a reference on a non-NULL @mpol.
3014  * This must be released on exit.
3015  * This is called from get_inode() context, so GFP_KERNEL can be used.
3016  */
3017 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3018 {
3019 	int ret;
3020 
3021 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
3022 	rwlock_init(&sp->lock);
3023 
3024 	if (mpol) {
3025 		struct sp_node *sn;
3026 		struct mempolicy *npol;
3027 		NODEMASK_SCRATCH(scratch);
3028 
3029 		if (!scratch)
3030 			goto put_mpol;
3031 
3032 		/* contextualize the tmpfs mount point mempolicy to this file */
3033 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3034 		if (IS_ERR(npol))
3035 			goto free_scratch; /* no valid nodemask intersection */
3036 
3037 		task_lock(current);
3038 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3039 		task_unlock(current);
3040 		if (ret)
3041 			goto put_npol;
3042 
3043 		/* alloc node covering entire file; adds ref to file's npol */
3044 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3045 		if (sn)
3046 			sp_insert(sp, sn);
3047 put_npol:
3048 		mpol_put(npol);	/* drop initial ref on file's npol */
3049 free_scratch:
3050 		NODEMASK_SCRATCH_FREE(scratch);
3051 put_mpol:
3052 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
3053 	}
3054 }
3055 
3056 int mpol_set_shared_policy(struct shared_policy *sp,
3057 			struct vm_area_struct *vma, struct mempolicy *pol)
3058 {
3059 	int err;
3060 	struct sp_node *new = NULL;
3061 	unsigned long sz = vma_pages(vma);
3062 
3063 	if (pol) {
3064 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3065 		if (!new)
3066 			return -ENOMEM;
3067 	}
3068 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3069 	if (err && new)
3070 		sp_free(new);
3071 	return err;
3072 }
3073 
3074 /* Free a backing policy store on inode delete. */
3075 void mpol_free_shared_policy(struct shared_policy *sp)
3076 {
3077 	struct sp_node *n;
3078 	struct rb_node *next;
3079 
3080 	if (!sp->root.rb_node)
3081 		return;
3082 	write_lock(&sp->lock);
3083 	next = rb_first(&sp->root);
3084 	while (next) {
3085 		n = rb_entry(next, struct sp_node, nd);
3086 		next = rb_next(&n->nd);
3087 		sp_delete(sp, n);
3088 	}
3089 	write_unlock(&sp->lock);
3090 }
3091 
3092 #ifdef CONFIG_NUMA_BALANCING
3093 static int __initdata numabalancing_override;
3094 
3095 static void __init check_numabalancing_enable(void)
3096 {
3097 	bool numabalancing_default = false;
3098 
3099 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3100 		numabalancing_default = true;
3101 
3102 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3103 	if (numabalancing_override)
3104 		set_numabalancing_state(numabalancing_override == 1);
3105 
3106 	if (num_online_nodes() > 1 && !numabalancing_override) {
3107 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3108 			numabalancing_default ? "Enabling" : "Disabling");
3109 		set_numabalancing_state(numabalancing_default);
3110 	}
3111 }
3112 
3113 static int __init setup_numabalancing(char *str)
3114 {
3115 	int ret = 0;
3116 	if (!str)
3117 		goto out;
3118 
3119 	if (!strcmp(str, "enable")) {
3120 		numabalancing_override = 1;
3121 		ret = 1;
3122 	} else if (!strcmp(str, "disable")) {
3123 		numabalancing_override = -1;
3124 		ret = 1;
3125 	}
3126 out:
3127 	if (!ret)
3128 		pr_warn("Unable to parse numa_balancing=\n");
3129 
3130 	return ret;
3131 }
3132 __setup("numa_balancing=", setup_numabalancing);
3133 #else
3134 static inline void __init check_numabalancing_enable(void)
3135 {
3136 }
3137 #endif /* CONFIG_NUMA_BALANCING */
3138 
3139 void __init numa_policy_init(void)
3140 {
3141 	nodemask_t interleave_nodes;
3142 	unsigned long largest = 0;
3143 	int nid, prefer = 0;
3144 
3145 	policy_cache = kmem_cache_create("numa_policy",
3146 					 sizeof(struct mempolicy),
3147 					 0, SLAB_PANIC, NULL);
3148 
3149 	sn_cache = kmem_cache_create("shared_policy_node",
3150 				     sizeof(struct sp_node),
3151 				     0, SLAB_PANIC, NULL);
3152 
3153 	for_each_node(nid) {
3154 		preferred_node_policy[nid] = (struct mempolicy) {
3155 			.refcnt = ATOMIC_INIT(1),
3156 			.mode = MPOL_PREFERRED,
3157 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3158 			.nodes = nodemask_of_node(nid),
3159 		};
3160 	}
3161 
3162 	/*
3163 	 * Set interleaving policy for system init. Interleaving is only
3164 	 * enabled across suitably sized nodes (default is >= 16MB), falling
3165 	 * back to the largest node if they're all smaller.
3166 	 */
3167 	nodes_clear(interleave_nodes);
3168 	for_each_node_state(nid, N_MEMORY) {
3169 		unsigned long total_pages = node_present_pages(nid);
3170 
3171 		/* Preserve the largest node */
3172 		if (largest < total_pages) {
3173 			largest = total_pages;
3174 			prefer = nid;
3175 		}
3176 
3177 		/* Interleave this node? */
3178 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3179 			node_set(nid, interleave_nodes);
3180 	}
3181 
3182 	/* All too small, use the largest */
3183 	if (unlikely(nodes_empty(interleave_nodes)))
3184 		node_set(prefer, interleave_nodes);
3185 
3186 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3187 		pr_err("%s: interleaving failed\n", __func__);
3188 
3189 	check_numabalancing_enable();
3190 }
3191 
3192 /* Reset policy of current process to default */
3193 void numa_default_policy(void)
3194 {
3195 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3196 }
3197 
3198 /*
3199  * Parse and format mempolicy from/to strings
3200  */
3201 static const char * const policy_modes[] =
3202 {
3203 	[MPOL_DEFAULT]    = "default",
3204 	[MPOL_PREFERRED]  = "prefer",
3205 	[MPOL_BIND]       = "bind",
3206 	[MPOL_INTERLEAVE] = "interleave",
3207 	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3208 	[MPOL_LOCAL]      = "local",
3209 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
3210 };
3211 
3212 #ifdef CONFIG_TMPFS
3213 /**
3214  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3215  * @str:  string containing mempolicy to parse
3216  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3217  *
3218  * Format of input:
3219  *	<mode>[=<flags>][:<nodelist>]
3220  *
3221  * Return: %0 on success, else %1
3222  */
3223 int mpol_parse_str(char *str, struct mempolicy **mpol)
3224 {
3225 	struct mempolicy *new = NULL;
3226 	unsigned short mode_flags;
3227 	nodemask_t nodes;
3228 	char *nodelist = strchr(str, ':');
3229 	char *flags = strchr(str, '=');
3230 	int err = 1, mode;
3231 
3232 	if (flags)
3233 		*flags++ = '\0';	/* terminate mode string */
3234 
3235 	if (nodelist) {
3236 		/* NUL-terminate mode or flags string */
3237 		*nodelist++ = '\0';
3238 		if (nodelist_parse(nodelist, nodes))
3239 			goto out;
3240 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3241 			goto out;
3242 	} else
3243 		nodes_clear(nodes);
3244 
3245 	mode = match_string(policy_modes, MPOL_MAX, str);
3246 	if (mode < 0)
3247 		goto out;
3248 
3249 	switch (mode) {
3250 	case MPOL_PREFERRED:
3251 		/*
3252 		 * Insist on a nodelist of one node only, although later
3253 		 * we use first_node(nodes) to grab a single node, so here
3254 		 * nodelist (or nodes) cannot be empty.
3255 		 */
3256 		if (nodelist) {
3257 			char *rest = nodelist;
3258 			while (isdigit(*rest))
3259 				rest++;
3260 			if (*rest)
3261 				goto out;
3262 			if (nodes_empty(nodes))
3263 				goto out;
3264 		}
3265 		break;
3266 	case MPOL_INTERLEAVE:
3267 	case MPOL_WEIGHTED_INTERLEAVE:
3268 		/*
3269 		 * Default to online nodes with memory if no nodelist
3270 		 */
3271 		if (!nodelist)
3272 			nodes = node_states[N_MEMORY];
3273 		break;
3274 	case MPOL_LOCAL:
3275 		/*
3276 		 * Don't allow a nodelist;  mpol_new() checks flags
3277 		 */
3278 		if (nodelist)
3279 			goto out;
3280 		break;
3281 	case MPOL_DEFAULT:
3282 		/*
3283 		 * Insist on an empty nodelist
3284 		 */
3285 		if (!nodelist)
3286 			err = 0;
3287 		goto out;
3288 	case MPOL_PREFERRED_MANY:
3289 	case MPOL_BIND:
3290 		/*
3291 		 * Insist on a nodelist
3292 		 */
3293 		if (!nodelist)
3294 			goto out;
3295 	}
3296 
3297 	mode_flags = 0;
3298 	if (flags) {
3299 		/*
3300 		 * Currently, we only support two mutually exclusive
3301 		 * mode flags.
3302 		 */
3303 		if (!strcmp(flags, "static"))
3304 			mode_flags |= MPOL_F_STATIC_NODES;
3305 		else if (!strcmp(flags, "relative"))
3306 			mode_flags |= MPOL_F_RELATIVE_NODES;
3307 		else
3308 			goto out;
3309 	}
3310 
3311 	new = mpol_new(mode, mode_flags, &nodes);
3312 	if (IS_ERR(new))
3313 		goto out;
3314 
3315 	/*
3316 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3317 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3318 	 */
3319 	if (mode != MPOL_PREFERRED) {
3320 		new->nodes = nodes;
3321 	} else if (nodelist) {
3322 		nodes_clear(new->nodes);
3323 		node_set(first_node(nodes), new->nodes);
3324 	} else {
3325 		new->mode = MPOL_LOCAL;
3326 	}
3327 
3328 	/*
3329 	 * Save nodes for contextualization: this will be used to "clone"
3330 	 * the mempolicy in a specific context [cpuset] at a later time.
3331 	 */
3332 	new->w.user_nodemask = nodes;
3333 
3334 	err = 0;
3335 
3336 out:
3337 	/* Restore string for error message */
3338 	if (nodelist)
3339 		*--nodelist = ':';
3340 	if (flags)
3341 		*--flags = '=';
3342 	if (!err)
3343 		*mpol = new;
3344 	return err;
3345 }
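
/*
 * Illustrative tmpfs mount option strings accepted by the parser above
 * (format <mode>[=<flags>][:<nodelist>]), for example:
 *
 *	mpol=bind:0-3
 *	mpol=interleave=static:0,2
 *	mpol=prefer:1
 *	mpol=local
 *
 * These examples assume the listed nodes exist and have memory; otherwise
 * the nodes_subset() check above rejects the string.
 */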
3346 #endif /* CONFIG_TMPFS */
3347 
3348 /**
3349  * mpol_to_str - format a mempolicy structure for printing
3350  * @buffer:  to contain formatted mempolicy string
3351  * @maxlen:  length of @buffer
3352  * @pol:  pointer to mempolicy to be formatted
3353  *
3354  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3355  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3356  * interleave", plus the longest flags, "relative|balancing", and to
3357  * display at least a few node ids.
3358  */
3359 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3360 {
3361 	char *p = buffer;
3362 	nodemask_t nodes = NODE_MASK_NONE;
3363 	unsigned short mode = MPOL_DEFAULT;
3364 	unsigned short flags = 0;
3365 
3366 	if (pol &&
3367 	    pol != &default_policy &&
3368 	    !(pol >= &preferred_node_policy[0] &&
3369 	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3370 		mode = pol->mode;
3371 		flags = pol->flags;
3372 	}
3373 
3374 	switch (mode) {
3375 	case MPOL_DEFAULT:
3376 	case MPOL_LOCAL:
3377 		break;
3378 	case MPOL_PREFERRED:
3379 	case MPOL_PREFERRED_MANY:
3380 	case MPOL_BIND:
3381 	case MPOL_INTERLEAVE:
3382 	case MPOL_WEIGHTED_INTERLEAVE:
3383 		nodes = pol->nodes;
3384 		break;
3385 	default:
3386 		WARN_ON_ONCE(1);
3387 		snprintf(p, maxlen, "unknown");
3388 		return;
3389 	}
3390 
3391 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3392 
3393 	if (flags & MPOL_MODE_FLAGS) {
3394 		p += snprintf(p, buffer + maxlen - p, "=");
3395 
3396 		/*
3397 		 * Static and relative are mutually exclusive.
3398 		 */
3399 		if (flags & MPOL_F_STATIC_NODES)
3400 			p += snprintf(p, buffer + maxlen - p, "static");
3401 		else if (flags & MPOL_F_RELATIVE_NODES)
3402 			p += snprintf(p, buffer + maxlen - p, "relative");
3403 
3404 		if (flags & MPOL_F_NUMA_BALANCING) {
3405 			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3406 				p += snprintf(p, buffer + maxlen - p, "|");
3407 			p += snprintf(p, buffer + maxlen - p, "balancing");
3408 		}
3409 	}
3410 
3411 	if (!nodes_empty(nodes))
3412 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3413 			       nodemask_pr_args(&nodes));
3414 }
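
/*
 * Illustrative outputs of the formatter above, mirroring mpol_parse_str():
 * "default", "local", "bind:0-3", "interleave=static:0,2" and, when
 * MPOL_F_NUMA_BALANCING is set on the policy, strings such as
 * "bind=balancing:0-3".
 */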
3415 
3416 #ifdef CONFIG_SYSFS
3417 struct iw_node_attr {
3418 	struct kobj_attribute kobj_attr;
3419 	int nid;
3420 };
3421 
3422 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3423 			 char *buf)
3424 {
3425 	struct iw_node_attr *node_attr;
3426 	u8 weight;
3427 
3428 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3429 	weight = get_il_weight(node_attr->nid);
3430 	return sysfs_emit(buf, "%d\n", weight);
3431 }
3432 
3433 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3434 			  const char *buf, size_t count)
3435 {
3436 	struct iw_node_attr *node_attr;
3437 	u8 *new;
3438 	u8 *old;
3439 	u8 weight = 0;
3440 
3441 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3442 	if (count == 0 || sysfs_streq(buf, ""))
3443 		weight = 0;
3444 	else if (kstrtou8(buf, 0, &weight))
3445 		return -EINVAL;
3446 
3447 	new = kzalloc(nr_node_ids, GFP_KERNEL);
3448 	if (!new)
3449 		return -ENOMEM;
3450 
3451 	mutex_lock(&iw_table_lock);
3452 	old = rcu_dereference_protected(iw_table,
3453 					lockdep_is_held(&iw_table_lock));
3454 	if (old)
3455 		memcpy(new, old, nr_node_ids);
3456 	new[node_attr->nid] = weight;
3457 	rcu_assign_pointer(iw_table, new);
3458 	mutex_unlock(&iw_table_lock);
3459 	synchronize_rcu();
3460 	kfree(old);
3461 	return count;
3462 }
3463 
3464 static struct iw_node_attr **node_attrs;
3465 
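/* Remove a nodeN attribute file and free its iw_node_attr. */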
3466 static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
3467 				  struct kobject *parent)
3468 {
3469 	if (!node_attr)
3470 		return;
3471 	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
3472 	kfree(node_attr->kobj_attr.attr.name);
3473 	kfree(node_attr);
3474 }
3475 
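/*
 * Release callback for the weighted_interleave kobject: tear down every
 * per-node attribute file, then free the kobject itself.
 */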
3476 static void sysfs_wi_release(struct kobject *wi_kobj)
3477 {
3478 	int i;
3479 
3480 	for (i = 0; i < nr_node_ids; i++)
3481 		sysfs_wi_node_release(node_attrs[i], wi_kobj);
3482 		kfree(wi_kobj);
3483 }
3484 
3485 static const struct kobj_type wi_ktype = {
3486 	.sysfs_ops = &kobj_sysfs_ops,
3487 	.release = sysfs_wi_release,
3488 };
3489 
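/*
 * Create the "nodeN" attribute file for @nid under the weighted_interleave
 * directory and remember it in node_attrs[] for later teardown.
 */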
3490 static int add_weight_node(int nid, struct kobject *wi_kobj)
3491 {
3492 	struct iw_node_attr *node_attr;
3493 	char *name;
3494 
3495 	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
3496 	if (!node_attr)
3497 		return -ENOMEM;
3498 
3499 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3500 	if (!name) {
3501 		kfree(node_attr);
3502 		return -ENOMEM;
3503 	}
3504 
3505 	sysfs_attr_init(&node_attr->kobj_attr.attr);
3506 	node_attr->kobj_attr.attr.name = name;
3507 	node_attr->kobj_attr.attr.mode = 0644;
3508 	node_attr->kobj_attr.show = node_show;
3509 	node_attr->kobj_attr.store = node_store;
3510 	node_attr->nid = nid;
3511 
3512 	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
3513 		kfree(node_attr->kobj_attr.attr.name);
3514 		kfree(node_attr);
3515 		pr_err("failed to add attribute to weighted_interleave\n");
3516 		return -ENOMEM;
3517 	}
3518 
3519 	node_attrs[nid] = node_attr;
3520 	return 0;
3521 }
3522 
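/*
 * Create /sys/kernel/mm/mempolicy/weighted_interleave/ and populate it with
 * one attribute file per possible node, so that a weight can be set with
 * e.g.:
 *
 *	echo 4 > /sys/kernel/mm/mempolicy/weighted_interleave/node0
 *
 * Failure to add an individual node file drops the whole group again but is
 * not propagated to the caller.
 */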
3523 static int add_weighted_interleave_group(struct kobject *root_kobj)
3524 {
3525 	struct kobject *wi_kobj;
3526 	int nid, err;
3527 
3528 	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
3529 	if (!wi_kobj)
3530 		return -ENOMEM;
3531 
3532 	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
3533 				   "weighted_interleave");
3534 	if (err) {
3535 		kfree(wi_kobj);
3536 		return err;
3537 	}
3538 
3539 	for_each_node_state(nid, N_POSSIBLE) {
3540 		err = add_weight_node(nid, wi_kobj);
3541 		if (err) {
3542 			pr_err("failed to add sysfs [node%d]\n", nid);
3543 			break;
3544 		}
3545 	}
3546 	if (err)
3547 		kobject_put(wi_kobj);
3548 	return 0;
3549 }
3550 
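/*
 * Release callback for the mempolicy kobject: retire the interleave weight
 * table via RCU, then free the node_attrs array and the kobject.
 */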
3551 static void mempolicy_kobj_release(struct kobject *kobj)
3552 {
3553 	u8 *old;
3554 
3555 	mutex_lock(&iw_table_lock);
3556 	old = rcu_dereference_protected(iw_table,
3557 					lockdep_is_held(&iw_table_lock));
3558 	rcu_assign_pointer(iw_table, NULL);
3559 	mutex_unlock(&iw_table_lock);
3560 	synchronize_rcu();
3561 	kfree(old);
3562 	kfree(node_attrs);
3563 	kfree(kobj);
3564 }
3565 
3566 static const struct kobj_type mempolicy_ktype = {
3567 	.release = mempolicy_kobj_release
3568 };
3569 
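/*
 * Create /sys/kernel/mm/mempolicy/ and its weighted_interleave group at
 * late_initcall time.
 */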
3570 static int __init mempolicy_sysfs_init(void)
3571 {
3572 	int err;
3573 	static struct kobject *mempolicy_kobj;
3574 
3575 	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
3576 	if (!mempolicy_kobj) {
3577 		err = -ENOMEM;
3578 		goto err_out;
3579 	}
3580 
3581 	node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
3582 			     GFP_KERNEL);
3583 	if (!node_attrs) {
3584 		err = -ENOMEM;
3585 		goto mempol_out;
3586 	}
3587 
3588 	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
3589 				   "mempolicy");
3590 	if (err)
3591 		goto node_out;
3592 
3593 	err = add_weighted_interleave_group(mempolicy_kobj);
3594 	if (err) {
3595 		pr_err("mempolicy sysfs structure failed to initialize\n");
3596 		kobject_put(mempolicy_kobj);
3597 		return err;
3598 	}
3599 
3600 	return err;
3601 node_out:
3602 	kfree(node_attrs);
3603 mempol_out:
3604 	kfree(mempolicy_kobj);
3605 err_out:
3606 	pr_err("failed to add mempolicy kobject to the system\n");
3607 	return err;
3608 }
3609 
3610 late_initcall(mempolicy_sysfs_init);
3611 #endif /* CONFIG_SYSFS */
3612