xref: /linux/mm/mempolicy.c (revision ab93e0dd72c37d378dd936f031ffb83ff2bd87ce)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Simple NUMA memory policy for the Linux kernel.
4  *
5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support six policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * weighted interleave
23  *                Allocate memory interleaved over a set of nodes based on
24  *                a set of weights (per-node), with normal fallback if it
25  *                fails.  Otherwise operates the same as interleave.
26  *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27  *                on node 0 for every 1 page allocated on node 1.
28  *
29  * bind           Only allocate memory on a specific set of nodes,
30  *                no fallback.
31  *                FIXME: memory is allocated starting with the first node
32  *                to the last. It would be better if bind would truly restrict
33  *                the allocation to memory nodes instead
34  *
35  * preferred      Try a specific node first before normal fallback.
36  *                As a special case NUMA_NO_NODE here means do the allocation
37  *                on the local CPU. This is normally identical to default,
38  *                but useful to set in a VMA when you have a non default
39  *                process policy.
40  *
41  * preferred many Try a set of nodes first before normal fallback. This is
42  *                similar to preferred without the special case.
43  *
44  * default        Allocate on the local node first, or when on a VMA
45  *                use the process policy. This is what Linux always did
46  *		  in a NUMA aware kernel and still does by, ahem, default.
47  *
48  * The process policy is applied for most non interrupt memory allocations
49  * in that process' context. Interrupts ignore the policies and always
50  * try to allocate on the local CPU. The VMA policy is only applied for memory
51  * allocations for a VMA in the VM.
52  *
53  * Currently there are a few corner cases in swapping where the policy
54  * is not applied, but the majority should be handled. When process policy
55  * is used it is not remembered over swap outs/swap ins.
56  *
57  * Only the highest zone in the zone hierarchy gets policied. Allocations
58  * requesting a lower zone just use default policy. This implies that
59  * on systems with highmem kernel lowmem allocations don't get policied.
60  * Same with GFP_DMA allocations.
61  *
62  * For shmem/tmpfs shared memory the policy is shared between
63  * all users and remembered even when nobody has memory mapped.
64  */
65 
66 /* Notebook:
67    fix mmap readahead to honour policy and enable policy for any page cache
68    object
69    statistics for bigpages
70    global policy for page cache? currently it uses process policy. Requires
71    first item above.
72    handle mremap for shared memory (currently ignored for the policy)
73    grows down?
74    make bind policy root only? It can trigger oom much faster and the
75    kernel is not always grateful with that.
76 */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/mempolicy.h>
81 #include <linux/pagewalk.h>
82 #include <linux/highmem.h>
83 #include <linux/hugetlb.h>
84 #include <linux/kernel.h>
85 #include <linux/sched.h>
86 #include <linux/sched/mm.h>
87 #include <linux/sched/numa_balancing.h>
88 #include <linux/sched/task.h>
89 #include <linux/nodemask.h>
90 #include <linux/cpuset.h>
91 #include <linux/slab.h>
92 #include <linux/string.h>
93 #include <linux/export.h>
94 #include <linux/nsproxy.h>
95 #include <linux/interrupt.h>
96 #include <linux/init.h>
97 #include <linux/compat.h>
98 #include <linux/ptrace.h>
99 #include <linux/swap.h>
100 #include <linux/seq_file.h>
101 #include <linux/proc_fs.h>
102 #include <linux/migrate.h>
103 #include <linux/ksm.h>
104 #include <linux/rmap.h>
105 #include <linux/security.h>
106 #include <linux/syscalls.h>
107 #include <linux/ctype.h>
108 #include <linux/mm_inline.h>
109 #include <linux/mmu_notifier.h>
110 #include <linux/printk.h>
111 #include <linux/swapops.h>
112 #include <linux/gcd.h>
113 
114 #include <asm/tlbflush.h>
115 #include <asm/tlb.h>
116 #include <linux/uaccess.h>
117 #include <linux/memory.h>
118 
119 #include "internal.h"
120 
/*
 * Flags used only inside this file.  They occupy successive bits starting
 * at MPOL_MF_INTERNAL.
 */
#define MPOL_MF_DISCONTIG_OK	(MPOL_MF_INTERNAL << 0)	/* do not fail on holes between walked vmas */
#define MPOL_MF_INVERT		(MPOL_MF_INTERNAL << 1)	/* select folios whose node is NOT in the nodemask */
#define MPOL_MF_WRLOCK		(MPOL_MF_INTERNAL << 2)	/* walk with the vmas write-locked */
125 
126 static struct kmem_cache *policy_cache;
127 static struct kmem_cache *sn_cache;
128 
129 /* Highest zone. An specific allocation for a zone below that is not
130    policied. */
131 enum zone_type policy_zone = 0;
132 
133 /*
134  * run-time system-wide default policy => local allocation
135  */
136 static struct mempolicy default_policy = {
137 	.refcnt = ATOMIC_INIT(1), /* never free it */
138 	.mode = MPOL_LOCAL,
139 };
140 
141 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
142 
/*
 * Upper bound for automatically derived interleave weights.
 *
 * Small weights cycle through the nodes faster and distribute pages more
 * evenly; large weights track the real bandwidth ratios with less error.
 * 32 has been found to be a reasonable compromise between the two goals.
 */
static const int weightiness = 32;
150 
151 /*
152  * A null weighted_interleave_state is interpreted as having .mode="auto",
153  * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
154  */
155 struct weighted_interleave_state {
156 	bool mode_auto;
157 	u8 iw_table[];
158 };
159 static struct weighted_interleave_state __rcu *wi_state;
160 static unsigned int *node_bw_table;
161 
162 /*
163  * wi_state_lock protects both wi_state and node_bw_table.
164  * node_bw_table is only used by writers to update wi_state.
165  */
166 static DEFINE_MUTEX(wi_state_lock);
167 
get_il_weight(int node)168 static u8 get_il_weight(int node)
169 {
170 	struct weighted_interleave_state *state;
171 	u8 weight = 1;
172 
173 	rcu_read_lock();
174 	state = rcu_dereference(wi_state);
175 	if (state)
176 		weight = state->iw_table[node];
177 	rcu_read_unlock();
178 	return weight;
179 }
180 
181 /*
182  * Convert bandwidth values into weighted interleave weights.
183  * Call with wi_state_lock.
184  */
reduce_interleave_weights(unsigned int * bw,u8 * new_iw)185 static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
186 {
187 	u64 sum_bw = 0;
188 	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
189 	int nid;
190 
191 	for_each_node_state(nid, N_MEMORY)
192 		sum_bw += bw[nid];
193 
194 	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
195 	for_each_node_state(nid, N_MEMORY) {
196 		/*
197 		 * Try not to perform 64-bit division.
198 		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
199 		 * If sum_bw > scaling_factor, then round the weight up to 1.
200 		 */
201 		scaling_factor = weightiness * bw[nid];
202 		if (bw[nid] && sum_bw < scaling_factor) {
203 			cast_sum_bw = (unsigned int)sum_bw;
204 			new_iw[nid] = scaling_factor / cast_sum_bw;
205 		} else {
206 			new_iw[nid] = 1;
207 		}
208 		if (!iw_gcd)
209 			iw_gcd = new_iw[nid];
210 		iw_gcd = gcd(iw_gcd, new_iw[nid]);
211 	}
212 
213 	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
214 	for_each_node_state(nid, N_MEMORY)
215 		new_iw[nid] /= iw_gcd;
216 }
217 
mempolicy_set_node_perf(unsigned int node,struct access_coordinate * coords)218 int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
219 {
220 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
221 	unsigned int *old_bw, *new_bw;
222 	unsigned int bw_val;
223 	int i;
224 
225 	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
226 	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
227 	if (!new_bw)
228 		return -ENOMEM;
229 
230 	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
231 			       GFP_KERNEL);
232 	if (!new_wi_state) {
233 		kfree(new_bw);
234 		return -ENOMEM;
235 	}
236 	new_wi_state->mode_auto = true;
237 	for (i = 0; i < nr_node_ids; i++)
238 		new_wi_state->iw_table[i] = 1;
239 
240 	/*
241 	 * Update bandwidth info, even in manual mode. That way, when switching
242 	 * to auto mode in the future, iw_table can be overwritten using
243 	 * accurate bw data.
244 	 */
245 	mutex_lock(&wi_state_lock);
246 
247 	old_bw = node_bw_table;
248 	if (old_bw)
249 		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
250 	new_bw[node] = bw_val;
251 	node_bw_table = new_bw;
252 
253 	old_wi_state = rcu_dereference_protected(wi_state,
254 					lockdep_is_held(&wi_state_lock));
255 	if (old_wi_state && !old_wi_state->mode_auto) {
256 		/* Manual mode; skip reducing weights and updating wi_state */
257 		mutex_unlock(&wi_state_lock);
258 		kfree(new_wi_state);
259 		goto out;
260 	}
261 
262 	/* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
263 	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
264 	rcu_assign_pointer(wi_state, new_wi_state);
265 
266 	mutex_unlock(&wi_state_lock);
267 	if (old_wi_state) {
268 		synchronize_rcu();
269 		kfree(old_wi_state);
270 	}
271 out:
272 	kfree(old_bw);
273 	return 0;
274 }
275 
276 /**
277  * numa_nearest_node - Find nearest node by state
278  * @node: Node id to start the search
279  * @state: State to filter the search
280  *
281  * Lookup the closest node by distance if @nid is not in state.
282  *
283  * Return: this @node if it is in state, otherwise the closest node by distance
284  */
numa_nearest_node(int node,unsigned int state)285 int numa_nearest_node(int node, unsigned int state)
286 {
287 	int min_dist = INT_MAX, dist, n, min_node;
288 
289 	if (state >= NR_NODE_STATES)
290 		return -EINVAL;
291 
292 	if (node == NUMA_NO_NODE || node_state(node, state))
293 		return node;
294 
295 	min_node = node;
296 	for_each_node_state(n, state) {
297 		dist = node_distance(node, n);
298 		if (dist < min_dist) {
299 			min_dist = dist;
300 			min_node = n;
301 		}
302 	}
303 
304 	return min_node;
305 }
306 EXPORT_SYMBOL_GPL(numa_nearest_node);
307 
308 /**
309  * nearest_node_nodemask - Find the node in @mask at the nearest distance
310  *			   from @node.
311  *
312  * @node: a valid node ID to start the search from.
313  * @mask: a pointer to a nodemask representing the allowed nodes.
314  *
315  * This function iterates over all nodes in @mask and calculates the
316  * distance from the starting @node, then it returns the node ID that is
317  * the closest to @node, or MAX_NUMNODES if no node is found.
318  *
319  * Note that @node must be a valid node ID usable with node_distance(),
320  * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
321  * or unexpected behavior.
322  */
nearest_node_nodemask(int node,nodemask_t * mask)323 int nearest_node_nodemask(int node, nodemask_t *mask)
324 {
325 	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
326 
327 	for_each_node_mask(n, *mask) {
328 		dist = node_distance(node, n);
329 		if (dist < min_dist) {
330 			min_dist = dist;
331 			min_node = n;
332 		}
333 	}
334 
335 	return min_node;
336 }
337 EXPORT_SYMBOL_GPL(nearest_node_nodemask);
338 
get_task_policy(struct task_struct * p)339 struct mempolicy *get_task_policy(struct task_struct *p)
340 {
341 	struct mempolicy *pol = p->mempolicy;
342 	int node;
343 
344 	if (pol)
345 		return pol;
346 
347 	node = numa_node_id();
348 	if (node != NUMA_NO_NODE) {
349 		pol = &preferred_node_policy[node];
350 		/* preferred_node_policy is not initialised early in boot */
351 		if (pol->mode)
352 			return pol;
353 	}
354 
355 	return &default_policy;
356 }
357 
358 static const struct mempolicy_operations {
359 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
360 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
361 } mpol_ops[MPOL_MAX];
362 
mpol_store_user_nodemask(const struct mempolicy * pol)363 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
364 {
365 	return pol->flags & MPOL_MODE_FLAGS;
366 }
367 
/*
 * Interpret @orig as positions relative to the set bits of @rel and store
 * the resulting absolute nodemask in @ret: @orig is first folded to the
 * number of nodes in @rel, then mapped onto @rel.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t folded;

	nodes_fold(folded, *orig, nodes_weight(*rel));
	nodes_onto(*ret, folded, *rel);
}
375 
mpol_new_nodemask(struct mempolicy * pol,const nodemask_t * nodes)376 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
377 {
378 	if (nodes_empty(*nodes))
379 		return -EINVAL;
380 	pol->nodes = *nodes;
381 	return 0;
382 }
383 
mpol_new_preferred(struct mempolicy * pol,const nodemask_t * nodes)384 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
385 {
386 	if (nodes_empty(*nodes))
387 		return -EINVAL;
388 
389 	nodes_clear(pol->nodes);
390 	node_set(first_node(*nodes), pol->nodes);
391 	return 0;
392 }
393 
394 /*
395  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
396  * any, for the new policy.  mpol_new() has already validated the nodes
397  * parameter with respect to the policy mode and flags.
398  *
399  * Must be called holding task's alloc_lock to protect task's mems_allowed
400  * and mempolicy.  May also be called holding the mmap_lock for write.
401  */
mpol_set_nodemask(struct mempolicy * pol,const nodemask_t * nodes,struct nodemask_scratch * nsc)402 static int mpol_set_nodemask(struct mempolicy *pol,
403 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
404 {
405 	int ret;
406 
407 	/*
408 	 * Default (pol==NULL) resp. local memory policies are not a
409 	 * subject of any remapping. They also do not need any special
410 	 * constructor.
411 	 */
412 	if (!pol || pol->mode == MPOL_LOCAL)
413 		return 0;
414 
415 	/* Check N_MEMORY */
416 	nodes_and(nsc->mask1,
417 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
418 
419 	VM_BUG_ON(!nodes);
420 
421 	if (pol->flags & MPOL_F_RELATIVE_NODES)
422 		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
423 	else
424 		nodes_and(nsc->mask2, *nodes, nsc->mask1);
425 
426 	if (mpol_store_user_nodemask(pol))
427 		pol->w.user_nodemask = *nodes;
428 	else
429 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
430 
431 	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
432 	return ret;
433 }
434 
435 /*
436  * This function just creates a new policy, does some check and simple
437  * initialization. You must invoke mpol_set_nodemask() to set nodes.
438  */
mpol_new(unsigned short mode,unsigned short flags,nodemask_t * nodes)439 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
440 				  nodemask_t *nodes)
441 {
442 	struct mempolicy *policy;
443 
444 	if (mode == MPOL_DEFAULT) {
445 		if (nodes && !nodes_empty(*nodes))
446 			return ERR_PTR(-EINVAL);
447 		return NULL;
448 	}
449 	VM_BUG_ON(!nodes);
450 
451 	/*
452 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
453 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
454 	 * All other modes require a valid pointer to a non-empty nodemask.
455 	 */
456 	if (mode == MPOL_PREFERRED) {
457 		if (nodes_empty(*nodes)) {
458 			if (((flags & MPOL_F_STATIC_NODES) ||
459 			     (flags & MPOL_F_RELATIVE_NODES)))
460 				return ERR_PTR(-EINVAL);
461 
462 			mode = MPOL_LOCAL;
463 		}
464 	} else if (mode == MPOL_LOCAL) {
465 		if (!nodes_empty(*nodes) ||
466 		    (flags & MPOL_F_STATIC_NODES) ||
467 		    (flags & MPOL_F_RELATIVE_NODES))
468 			return ERR_PTR(-EINVAL);
469 	} else if (nodes_empty(*nodes))
470 		return ERR_PTR(-EINVAL);
471 
472 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
473 	if (!policy)
474 		return ERR_PTR(-ENOMEM);
475 	atomic_set(&policy->refcnt, 1);
476 	policy->mode = mode;
477 	policy->flags = flags;
478 	policy->home_node = NUMA_NO_NODE;
479 
480 	return policy;
481 }
482 
483 /* Slow path of a mpol destructor. */
__mpol_put(struct mempolicy * pol)484 void __mpol_put(struct mempolicy *pol)
485 {
486 	if (!atomic_dec_and_test(&pol->refcnt))
487 		return;
488 	kmem_cache_free(policy_cache, pol);
489 }
490 
/* ->rebind hook for modes that keep no node state: nothing to adjust. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
	/* intentionally empty */
}
494 
mpol_rebind_nodemask(struct mempolicy * pol,const nodemask_t * nodes)495 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
496 {
497 	nodemask_t tmp;
498 
499 	if (pol->flags & MPOL_F_STATIC_NODES)
500 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
501 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
502 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
503 	else {
504 		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
505 								*nodes);
506 		pol->w.cpuset_mems_allowed = *nodes;
507 	}
508 
509 	if (nodes_empty(tmp))
510 		tmp = *nodes;
511 
512 	pol->nodes = tmp;
513 }
514 
mpol_rebind_preferred(struct mempolicy * pol,const nodemask_t * nodes)515 static void mpol_rebind_preferred(struct mempolicy *pol,
516 						const nodemask_t *nodes)
517 {
518 	pol->w.cpuset_mems_allowed = *nodes;
519 }
520 
521 /*
522  * mpol_rebind_policy - Migrate a policy to a different set of nodes
523  *
524  * Per-vma policies are protected by mmap_lock. Allocations using per-task
525  * policies are protected by task->mems_allowed_seq to prevent a premature
526  * OOM/allocation failure due to parallel nodemask modification.
527  */
mpol_rebind_policy(struct mempolicy * pol,const nodemask_t * newmask)528 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
529 {
530 	if (!pol || pol->mode == MPOL_LOCAL)
531 		return;
532 	if (!mpol_store_user_nodemask(pol) &&
533 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
534 		return;
535 
536 	mpol_ops[pol->mode].rebind(pol, newmask);
537 }
538 
539 /*
540  * Wrapper for mpol_rebind_policy() that just requires task
541  * pointer, and updates task mempolicy.
542  *
543  * Called with task's alloc_lock held.
544  */
mpol_rebind_task(struct task_struct * tsk,const nodemask_t * new)545 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
546 {
547 	mpol_rebind_policy(tsk->mempolicy, new);
548 }
549 
550 /*
551  * Rebind each vma in mm to new nodemask.
552  *
553  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
554  */
mpol_rebind_mm(struct mm_struct * mm,nodemask_t * new)555 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
556 {
557 	struct vm_area_struct *vma;
558 	VMA_ITERATOR(vmi, mm, 0);
559 
560 	mmap_write_lock(mm);
561 	for_each_vma(vmi, vma) {
562 		vma_start_write(vma);
563 		mpol_rebind_policy(vma->vm_policy, new);
564 	}
565 	mmap_write_unlock(mm);
566 }
567 
568 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
569 	[MPOL_DEFAULT] = {
570 		.rebind = mpol_rebind_default,
571 	},
572 	[MPOL_INTERLEAVE] = {
573 		.create = mpol_new_nodemask,
574 		.rebind = mpol_rebind_nodemask,
575 	},
576 	[MPOL_PREFERRED] = {
577 		.create = mpol_new_preferred,
578 		.rebind = mpol_rebind_preferred,
579 	},
580 	[MPOL_BIND] = {
581 		.create = mpol_new_nodemask,
582 		.rebind = mpol_rebind_nodemask,
583 	},
584 	[MPOL_LOCAL] = {
585 		.rebind = mpol_rebind_default,
586 	},
587 	[MPOL_PREFERRED_MANY] = {
588 		.create = mpol_new_nodemask,
589 		.rebind = mpol_rebind_preferred,
590 	},
591 	[MPOL_WEIGHTED_INTERLEAVE] = {
592 		.create = mpol_new_nodemask,
593 		.rebind = mpol_rebind_nodemask,
594 	},
595 };
596 
597 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
598 				unsigned long flags);
599 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
600 				pgoff_t ilx, int *nid);
601 
strictly_unmovable(unsigned long flags)602 static bool strictly_unmovable(unsigned long flags)
603 {
604 	/*
605 	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
606 	 * if any misplaced page is found.
607 	 */
608 	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
609 			 MPOL_MF_STRICT;
610 }
611 
612 struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
613 	struct mempolicy *pol;
614 	pgoff_t ilx;
615 };
616 
617 struct queue_pages {
618 	struct list_head *pagelist;
619 	unsigned long flags;
620 	nodemask_t *nmask;
621 	unsigned long start;
622 	unsigned long end;
623 	struct vm_area_struct *first;
624 	struct folio *large;		/* note last large folio encountered */
625 	long nr_failed;			/* could not be isolated at this time */
626 };
627 
628 /*
629  * Check if the folio's nid is in qp->nmask.
630  *
631  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
632  * in the invert of qp->nmask.
633  */
queue_folio_required(struct folio * folio,struct queue_pages * qp)634 static inline bool queue_folio_required(struct folio *folio,
635 					struct queue_pages *qp)
636 {
637 	int nid = folio_nid(folio);
638 	unsigned long flags = qp->flags;
639 
640 	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
641 }
642 
/*
 * Handle a huge PMD during the queue_pages walk.
 *
 * A PMD migration entry counts as a failure.  The huge zero folio is
 * skipped (walk->action is set to ACTION_CONTINUE), as is a folio the walk
 * is not looking for.  Any other folio is queued for migration; it counts
 * as a failure when no MOVE flag is set, the vma is not migratable, or
 * queueing fails.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct queue_pages *qp = walk->private;
	struct folio *folio;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}

	folio = pmd_folio(*pmd);
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}

	if (!queue_folio_required(folio, qp))
		return;

	if ((qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
	    vma_migratable(walk->vma) &&
	    migrate_folio_add(folio, qp->pagelist, qp->flags))
		return;

	qp->nr_failed++;
}
664 
665 /*
666  * Scan through folios, checking if they satisfy the required conditions,
667  * moving them from LRU to local pagelist for migration if they do (or not).
668  *
669  * queue_folios_pte_range() has two possible return values:
670  * 0 - continue walking to scan for more, even if an existing folio on the
671  *     wrong node could not be isolated and queued for migration.
672  * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
673  *        and an existing folio was on a node that does not follow the policy.
674  */
queue_folios_pte_range(pmd_t * pmd,unsigned long addr,unsigned long end,struct mm_walk * walk)675 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
676 			unsigned long end, struct mm_walk *walk)
677 {
678 	struct vm_area_struct *vma = walk->vma;
679 	struct folio *folio;
680 	struct queue_pages *qp = walk->private;
681 	unsigned long flags = qp->flags;
682 	pte_t *pte, *mapped_pte;
683 	pte_t ptent;
684 	spinlock_t *ptl;
685 	int max_nr, nr;
686 
687 	ptl = pmd_trans_huge_lock(pmd, vma);
688 	if (ptl) {
689 		queue_folios_pmd(pmd, walk);
690 		spin_unlock(ptl);
691 		goto out;
692 	}
693 
694 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
695 	if (!pte) {
696 		walk->action = ACTION_AGAIN;
697 		return 0;
698 	}
699 	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
700 		max_nr = (end - addr) >> PAGE_SHIFT;
701 		nr = 1;
702 		ptent = ptep_get(pte);
703 		if (pte_none(ptent))
704 			continue;
705 		if (!pte_present(ptent)) {
706 			if (is_migration_entry(pte_to_swp_entry(ptent)))
707 				qp->nr_failed++;
708 			continue;
709 		}
710 		folio = vm_normal_folio(vma, addr, ptent);
711 		if (!folio || folio_is_zone_device(folio))
712 			continue;
713 		if (folio_test_large(folio) && max_nr != 1)
714 			nr = folio_pte_batch(folio, pte, ptent, max_nr);
715 		/*
716 		 * vm_normal_folio() filters out zero pages, but there might
717 		 * still be reserved folios to skip, perhaps in a VDSO.
718 		 */
719 		if (folio_test_reserved(folio))
720 			continue;
721 		if (!queue_folio_required(folio, qp))
722 			continue;
723 		if (folio_test_large(folio)) {
724 			/*
725 			 * A large folio can only be isolated from LRU once,
726 			 * but may be mapped by many PTEs (and Copy-On-Write may
727 			 * intersperse PTEs of other, order 0, folios).  This is
728 			 * a common case, so don't mistake it for failure (but
729 			 * there can be other cases of multi-mapped pages which
730 			 * this quick check does not help to filter out - and a
731 			 * search of the pagelist might grow to be prohibitive).
732 			 *
733 			 * migrate_pages(&pagelist) returns nr_failed folios, so
734 			 * check "large" now so that queue_pages_range() returns
735 			 * a comparable nr_failed folios.  This does imply that
736 			 * if folio could not be isolated for some racy reason
737 			 * at its first PTE, later PTEs will not give it another
738 			 * chance of isolation; but keeps the accounting simple.
739 			 */
740 			if (folio == qp->large)
741 				continue;
742 			qp->large = folio;
743 		}
744 		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
745 		    !vma_migratable(vma) ||
746 		    !migrate_folio_add(folio, qp->pagelist, flags)) {
747 			qp->nr_failed += nr;
748 			if (strictly_unmovable(flags))
749 				break;
750 		}
751 	}
752 	pte_unmap_unlock(mapped_pte, ptl);
753 	cond_resched();
754 out:
755 	if (qp->nr_failed && strictly_unmovable(flags))
756 		return -EIO;
757 	return 0;
758 }
759 
/*
 * hugetlb_entry callback of the queue_pages walk: check one hugetlb
 * mapping and isolate its folio onto qp->pagelist if it is wanted and may
 * be moved.
 *
 * Return: -EIO when a failure has been recorded and only MPOL_MF_STRICT
 * (no MOVE flag) is set, otherwise 0.  Always 0 without
 * CONFIG_HUGETLB_PAGE.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	const unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(entry)) {
		/* A folio already under migration counts as failed. */
		if (unlikely(is_hugetlb_entry_migration(entry)))
			qp->nr_failed++;
		goto unlock;
	}

	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;

	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}

	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if (!(flags & MPOL_MF_MOVE_ALL) &&
	    (folio_maybe_mapped_shared(folio) || hugetlb_pmd_shared(pte)))
		goto unlock;

	if (!folio_isolate_hugetlb(folio, qp->pagelist))
		qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
804 
#ifdef CONFIG_NUMA_BALANCING
/*
 * Mark the virtual address range [@addr, @end) of @vma inaccessible.  The
 * marking is cleared again by a NUMA hinting fault, and depending on those
 * faults pages may be migrated for better NUMA placement.
 *
 * This assumes that NUMA faults are handled using PROT_NONE.  An
 * architecture that makes a different choice will need further changes to
 * the core.
 *
 * Return: the value returned by change_protection(); when it is positive
 * it is also added to the NUMA_PTE_UPDATES counters.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	long nr_changed;

	tlb_gather_mmu(&tlb, mm);

	nr_changed = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_changed > 0) {
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_changed);
		count_memcg_events_mm(mm, NUMA_PTE_UPDATES, nr_changed);
	}

	tlb_finish_mmu(&tlb);

	return nr_changed;
}
#endif /* CONFIG_NUMA_BALANCING */
834 
queue_pages_test_walk(unsigned long start,unsigned long end,struct mm_walk * walk)835 static int queue_pages_test_walk(unsigned long start, unsigned long end,
836 				struct mm_walk *walk)
837 {
838 	struct vm_area_struct *next, *vma = walk->vma;
839 	struct queue_pages *qp = walk->private;
840 	unsigned long flags = qp->flags;
841 
842 	/* range check first */
843 	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
844 
845 	if (!qp->first) {
846 		qp->first = vma;
847 		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
848 			(qp->start < vma->vm_start))
849 			/* hole at head side of range */
850 			return -EFAULT;
851 	}
852 	next = find_vma(vma->vm_mm, vma->vm_end);
853 	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
854 		((vma->vm_end < qp->end) &&
855 		(!next || vma->vm_end < next->vm_start)))
856 		/* hole at middle or tail of range */
857 		return -EFAULT;
858 
859 	/*
860 	 * Need check MPOL_MF_STRICT to return -EIO if possible
861 	 * regardless of vma_migratable
862 	 */
863 	if (!vma_migratable(vma) &&
864 	    !(flags & MPOL_MF_STRICT))
865 		return 1;
866 
867 	/*
868 	 * Check page nodes, and queue pages to move, in the current vma.
869 	 * But if no moving, and no strict checking, the scan can be skipped.
870 	 */
871 	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
872 		return 0;
873 	return 1;
874 }
875 
876 static const struct mm_walk_ops queue_pages_walk_ops = {
877 	.hugetlb_entry		= queue_folios_hugetlb,
878 	.pmd_entry		= queue_folios_pte_range,
879 	.test_walk		= queue_pages_test_walk,
880 	.walk_lock		= PGWALK_RDLOCK,
881 };
882 
883 static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
884 	.hugetlb_entry		= queue_folios_hugetlb,
885 	.pmd_entry		= queue_folios_pte_range,
886 	.test_walk		= queue_pages_test_walk,
887 	.walk_lock		= PGWALK_WRLOCK,
888 };
889 
890 /*
891  * Walk through page tables and collect pages to be migrated.
892  *
893  * If pages found in a given range are not on the required set of @nodes,
894  * and migration is allowed, they are isolated and queued to @pagelist.
895  *
896  * queue_pages_range() may return:
897  * 0 - all pages already on the right node, or successfully queued for moving
898  *     (or neither strict checking nor moving requested: only range checking).
899  * >0 - this number of misplaced folios could not be queued for moving
900  *      (a hugetlbfs page or a transparent huge page being counted as 1).
901  * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
902  * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
903  */
904 static long
queue_pages_range(struct mm_struct * mm,unsigned long start,unsigned long end,nodemask_t * nodes,unsigned long flags,struct list_head * pagelist)905 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
906 		nodemask_t *nodes, unsigned long flags,
907 		struct list_head *pagelist)
908 {
909 	int err;
910 	struct queue_pages qp = {
911 		.pagelist = pagelist,
912 		.flags = flags,
913 		.nmask = nodes,
914 		.start = start,
915 		.end = end,
916 		.first = NULL,
917 	};
918 	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
919 			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
920 
921 	err = walk_page_range(mm, start, end, ops, &qp);
922 
923 	if (!qp.first)
924 		/* whole range in hole */
925 		err = -EFAULT;
926 
927 	return err ? : qp.nr_failed;
928 }
929 
930 /*
931  * Apply policy to a single VMA
932  * This must be called with the mmap_lock held for writing.
933  */
vma_replace_policy(struct vm_area_struct * vma,struct mempolicy * pol)934 static int vma_replace_policy(struct vm_area_struct *vma,
935 				struct mempolicy *pol)
936 {
937 	int err;
938 	struct mempolicy *old;
939 	struct mempolicy *new;
940 
941 	vma_assert_write_locked(vma);
942 
943 	new = mpol_dup(pol);
944 	if (IS_ERR(new))
945 		return PTR_ERR(new);
946 
947 	if (vma->vm_ops && vma->vm_ops->set_policy) {
948 		err = vma->vm_ops->set_policy(vma, new);
949 		if (err)
950 			goto err_out;
951 	}
952 
953 	old = vma->vm_policy;
954 	vma->vm_policy = new; /* protected by mmap_lock */
955 	mpol_put(old);
956 
957 	return 0;
958  err_out:
959 	mpol_put(new);
960 	return err;
961 }
962 
963 /* Split or merge the VMA (if required) and apply the new policy */
mbind_range(struct vma_iterator * vmi,struct vm_area_struct * vma,struct vm_area_struct ** prev,unsigned long start,unsigned long end,struct mempolicy * new_pol)964 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
965 		struct vm_area_struct **prev, unsigned long start,
966 		unsigned long end, struct mempolicy *new_pol)
967 {
968 	unsigned long vmstart, vmend;
969 
970 	vmend = min(end, vma->vm_end);
971 	if (start > vma->vm_start) {
972 		*prev = vma;
973 		vmstart = start;
974 	} else {
975 		vmstart = vma->vm_start;
976 	}
977 
978 	if (mpol_equal(vma->vm_policy, new_pol)) {
979 		*prev = vma;
980 		return 0;
981 	}
982 
983 	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
984 	if (IS_ERR(vma))
985 		return PTR_ERR(vma);
986 
987 	*prev = vma;
988 	return vma_replace_policy(vma, new_pol);
989 }
990 
991 /* Set the process memory policy */
do_set_mempolicy(unsigned short mode,unsigned short flags,nodemask_t * nodes)992 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
993 			     nodemask_t *nodes)
994 {
995 	struct mempolicy *new, *old;
996 	NODEMASK_SCRATCH(scratch);
997 	int ret;
998 
999 	if (!scratch)
1000 		return -ENOMEM;
1001 
1002 	new = mpol_new(mode, flags, nodes);
1003 	if (IS_ERR(new)) {
1004 		ret = PTR_ERR(new);
1005 		goto out;
1006 	}
1007 
1008 	task_lock(current);
1009 	ret = mpol_set_nodemask(new, nodes, scratch);
1010 	if (ret) {
1011 		task_unlock(current);
1012 		mpol_put(new);
1013 		goto out;
1014 	}
1015 
1016 	old = current->mempolicy;
1017 	current->mempolicy = new;
1018 	if (new && (new->mode == MPOL_INTERLEAVE ||
1019 		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
1020 		current->il_prev = MAX_NUMNODES-1;
1021 		current->il_weight = 0;
1022 	}
1023 	task_unlock(current);
1024 	mpol_put(old);
1025 	ret = 0;
1026 out:
1027 	NODEMASK_SCRATCH_FREE(scratch);
1028 	return ret;
1029 }
1030 
1031 /*
1032  * Return nodemask for policy for get_mempolicy() query
1033  *
1034  * Called with task's alloc_lock held
1035  */
get_policy_nodemask(struct mempolicy * pol,nodemask_t * nodes)1036 static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1037 {
1038 	nodes_clear(*nodes);
1039 	if (pol == &default_policy)
1040 		return;
1041 
1042 	switch (pol->mode) {
1043 	case MPOL_BIND:
1044 	case MPOL_INTERLEAVE:
1045 	case MPOL_PREFERRED:
1046 	case MPOL_PREFERRED_MANY:
1047 	case MPOL_WEIGHTED_INTERLEAVE:
1048 		*nodes = pol->nodes;
1049 		break;
1050 	case MPOL_LOCAL:
1051 		/* return empty node mask for local allocation */
1052 		break;
1053 	default:
1054 		BUG();
1055 	}
1056 }
1057 
lookup_node(struct mm_struct * mm,unsigned long addr)1058 static int lookup_node(struct mm_struct *mm, unsigned long addr)
1059 {
1060 	struct page *p = NULL;
1061 	int ret;
1062 
1063 	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1064 	if (ret > 0) {
1065 		ret = page_to_nid(p);
1066 		put_page(p);
1067 	}
1068 	return ret;
1069 }
1070 
1071 /* Retrieve NUMA policy */
do_get_mempolicy(int * policy,nodemask_t * nmask,unsigned long addr,unsigned long flags)1072 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
1073 			     unsigned long addr, unsigned long flags)
1074 {
1075 	int err;
1076 	struct mm_struct *mm = current->mm;
1077 	struct vm_area_struct *vma = NULL;
1078 	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1079 
1080 	if (flags &
1081 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1082 		return -EINVAL;
1083 
1084 	if (flags & MPOL_F_MEMS_ALLOWED) {
1085 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
1086 			return -EINVAL;
1087 		*policy = 0;	/* just so it's initialized */
1088 		task_lock(current);
1089 		*nmask  = cpuset_current_mems_allowed;
1090 		task_unlock(current);
1091 		return 0;
1092 	}
1093 
1094 	if (flags & MPOL_F_ADDR) {
1095 		pgoff_t ilx;		/* ignored here */
1096 		/*
1097 		 * Do NOT fall back to task policy if the
1098 		 * vma/shared policy at addr is NULL.  We
1099 		 * want to return MPOL_DEFAULT in this case.
1100 		 */
1101 		mmap_read_lock(mm);
1102 		vma = vma_lookup(mm, addr);
1103 		if (!vma) {
1104 			mmap_read_unlock(mm);
1105 			return -EFAULT;
1106 		}
1107 		pol = __get_vma_policy(vma, addr, &ilx);
1108 	} else if (addr)
1109 		return -EINVAL;
1110 
1111 	if (!pol)
1112 		pol = &default_policy;	/* indicates default behavior */
1113 
1114 	if (flags & MPOL_F_NODE) {
1115 		if (flags & MPOL_F_ADDR) {
1116 			/*
1117 			 * Take a refcount on the mpol, because we are about to
1118 			 * drop the mmap_lock, after which only "pol" remains
1119 			 * valid, "vma" is stale.
1120 			 */
1121 			pol_refcount = pol;
1122 			vma = NULL;
1123 			mpol_get(pol);
1124 			mmap_read_unlock(mm);
1125 			err = lookup_node(mm, addr);
1126 			if (err < 0)
1127 				goto out;
1128 			*policy = err;
1129 		} else if (pol == current->mempolicy &&
1130 				pol->mode == MPOL_INTERLEAVE) {
1131 			*policy = next_node_in(current->il_prev, pol->nodes);
1132 		} else if (pol == current->mempolicy &&
1133 				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1134 			if (current->il_weight)
1135 				*policy = current->il_prev;
1136 			else
1137 				*policy = next_node_in(current->il_prev,
1138 						       pol->nodes);
1139 		} else {
1140 			err = -EINVAL;
1141 			goto out;
1142 		}
1143 	} else {
1144 		*policy = pol == &default_policy ? MPOL_DEFAULT :
1145 						pol->mode;
1146 		/*
1147 		 * Internal mempolicy flags must be masked off before exposing
1148 		 * the policy to userspace.
1149 		 */
1150 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
1151 	}
1152 
1153 	err = 0;
1154 	if (nmask) {
1155 		if (mpol_store_user_nodemask(pol)) {
1156 			*nmask = pol->w.user_nodemask;
1157 		} else {
1158 			task_lock(current);
1159 			get_policy_nodemask(pol, nmask);
1160 			task_unlock(current);
1161 		}
1162 	}
1163 
1164  out:
1165 	mpol_cond_put(pol);
1166 	if (vma)
1167 		mmap_read_unlock(mm);
1168 	if (pol_refcount)
1169 		mpol_put(pol_refcount);
1170 	return err;
1171 }
1172 
1173 #ifdef CONFIG_MIGRATION
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)1174 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1175 				unsigned long flags)
1176 {
1177 	/*
1178 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1179 	 * Choosing not to migrate a shared folio is not counted as a failure.
1180 	 *
1181 	 * See folio_maybe_mapped_shared() on possible imprecision when we
1182 	 * cannot easily detect if a folio is shared.
1183 	 */
1184 	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1185 		if (folio_isolate_lru(folio)) {
1186 			list_add_tail(&folio->lru, foliolist);
1187 			node_stat_mod_folio(folio,
1188 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
1189 				folio_nr_pages(folio));
1190 		} else {
1191 			/*
1192 			 * Non-movable folio may reach here.  And, there may be
1193 			 * temporary off LRU folios or non-LRU movable folios.
1194 			 * Treat them as unmovable folios since they can't be
1195 			 * isolated, so they can't be moved at the moment.
1196 			 */
1197 			return false;
1198 		}
1199 	}
1200 	return true;
1201 }
1202 
1203 /*
1204  * Migrate pages from one node to a target node.
1205  * Returns error or the number of pages not migrated.
1206  */
migrate_to_node(struct mm_struct * mm,int source,int dest,int flags)1207 static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1208 			    int flags)
1209 {
1210 	nodemask_t nmask;
1211 	struct vm_area_struct *vma;
1212 	LIST_HEAD(pagelist);
1213 	long nr_failed;
1214 	long err = 0;
1215 	struct migration_target_control mtc = {
1216 		.nid = dest,
1217 		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1218 		.reason = MR_SYSCALL,
1219 	};
1220 
1221 	nodes_clear(nmask);
1222 	node_set(source, nmask);
1223 
1224 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1225 
1226 	mmap_read_lock(mm);
1227 	vma = find_vma(mm, 0);
1228 	if (unlikely(!vma)) {
1229 		mmap_read_unlock(mm);
1230 		return 0;
1231 	}
1232 
1233 	/*
1234 	 * This does not migrate the range, but isolates all pages that
1235 	 * need migration.  Between passing in the full user address
1236 	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1237 	 * but passes back the count of pages which could not be isolated.
1238 	 */
1239 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1240 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1241 	mmap_read_unlock(mm);
1242 
1243 	if (!list_empty(&pagelist)) {
1244 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1245 			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1246 		if (err)
1247 			putback_movable_pages(&pagelist);
1248 	}
1249 
1250 	if (err >= 0)
1251 		err += nr_failed;
1252 	return err;
1253 }
1254 
1255 /*
1256  * Move pages between the two nodesets so as to preserve the physical
1257  * layout as much as possible.
1258  *
1259  * Returns the number of page that could not be moved.
1260  */
do_migrate_pages(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to,int flags)1261 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1262 		     const nodemask_t *to, int flags)
1263 {
1264 	long nr_failed = 0;
1265 	long err = 0;
1266 	nodemask_t tmp;
1267 
1268 	lru_cache_disable();
1269 
1270 	/*
1271 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1272 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1273 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1274 	 * The pair of nodemasks 'to' and 'from' define the map.
1275 	 *
1276 	 * If no pair of bits is found that way, fallback to picking some
1277 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1278 	 * 'source' and 'dest' bits are the same, this represents a node
1279 	 * that will be migrating to itself, so no pages need move.
1280 	 *
1281 	 * If no bits are left in 'tmp', or if all remaining bits left
1282 	 * in 'tmp' correspond to the same bit in 'to', return false
1283 	 * (nothing left to migrate).
1284 	 *
1285 	 * This lets us pick a pair of nodes to migrate between, such that
1286 	 * if possible the dest node is not already occupied by some other
1287 	 * source node, minimizing the risk of overloading the memory on a
1288 	 * node that would happen if we migrated incoming memory to a node
1289 	 * before migrating outgoing memory source that same node.
1290 	 *
1291 	 * A single scan of tmp is sufficient.  As we go, we remember the
1292 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1293 	 * that not only moved, but what's better, moved to an empty slot
1294 	 * (d is not set in tmp), then we break out then, with that pair.
1295 	 * Otherwise when we finish scanning from_tmp, we at least have the
1296 	 * most recent <s, d> pair that moved.  If we get all the way through
1297 	 * the scan of tmp without finding any node that moved, much less
1298 	 * moved to an empty node, then there is nothing left worth migrating.
1299 	 */
1300 
1301 	tmp = *from;
1302 	while (!nodes_empty(tmp)) {
1303 		int s, d;
1304 		int source = NUMA_NO_NODE;
1305 		int dest = 0;
1306 
1307 		for_each_node_mask(s, tmp) {
1308 
1309 			/*
1310 			 * do_migrate_pages() tries to maintain the relative
1311 			 * node relationship of the pages established between
1312 			 * threads and memory areas.
1313                          *
1314 			 * However if the number of source nodes is not equal to
1315 			 * the number of destination nodes we can not preserve
1316 			 * this node relative relationship.  In that case, skip
1317 			 * copying memory from a node that is in the destination
1318 			 * mask.
1319 			 *
1320 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1321 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1322 			 */
1323 
1324 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1325 						(node_isset(s, *to)))
1326 				continue;
1327 
1328 			d = node_remap(s, *from, *to);
1329 			if (s == d)
1330 				continue;
1331 
1332 			source = s;	/* Node moved. Memorize */
1333 			dest = d;
1334 
1335 			/* dest not in remaining from nodes? */
1336 			if (!node_isset(dest, tmp))
1337 				break;
1338 		}
1339 		if (source == NUMA_NO_NODE)
1340 			break;
1341 
1342 		node_clear(source, tmp);
1343 		err = migrate_to_node(mm, source, dest, flags);
1344 		if (err > 0)
1345 			nr_failed += err;
1346 		if (err < 0)
1347 			break;
1348 	}
1349 
1350 	lru_cache_enable();
1351 	if (err < 0)
1352 		return err;
1353 	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1354 }
1355 
1356 /*
1357  * Allocate a new folio for page migration, according to NUMA mempolicy.
1358  */
alloc_migration_target_by_mpol(struct folio * src,unsigned long private)1359 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1360 						    unsigned long private)
1361 {
1362 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
1363 	struct mempolicy *pol = mmpol->pol;
1364 	pgoff_t ilx = mmpol->ilx;
1365 	unsigned int order;
1366 	int nid = numa_node_id();
1367 	gfp_t gfp;
1368 
1369 	order = folio_order(src);
1370 	ilx += src->index >> order;
1371 
1372 	if (folio_test_hugetlb(src)) {
1373 		nodemask_t *nodemask;
1374 		struct hstate *h;
1375 
1376 		h = folio_hstate(src);
1377 		gfp = htlb_alloc_mask(h);
1378 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1379 		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
1380 				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1381 	}
1382 
1383 	if (folio_test_large(src))
1384 		gfp = GFP_TRANSHUGE;
1385 	else
1386 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1387 
1388 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1389 }
1390 #else
1391 
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)1392 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1393 				unsigned long flags)
1394 {
1395 	return false;
1396 }
1397 
do_migrate_pages(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to,int flags)1398 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1399 		     const nodemask_t *to, int flags)
1400 {
1401 	return -ENOSYS;
1402 }
1403 
alloc_migration_target_by_mpol(struct folio * src,unsigned long private)1404 static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1405 						    unsigned long private)
1406 {
1407 	return NULL;
1408 }
1409 #endif
1410 
/*
 * Back end of mbind(): apply the policy built from @mode, @mode_flags and
 * @nmask to the VMAs overlapping [@start, @start + @len) and, when a MOVE
 * flag is given, migrate the misplaced folios that could be queued.
 *
 * Returns 0 on success (including a zero-length range), or a negative errno:
 * -EINVAL for unknown @flags bits, an unaligned @start or a wrapping range;
 * -EPERM for MPOL_MF_MOVE_ALL without CAP_SYS_NICE; -ENOMEM when the scratch
 * nodemask cannot be allocated; -EIO when MPOL_MF_STRICT is set and the
 * queueing or migration step left a non-zero failure count; otherwise
 * whatever mpol_new(), mpol_set_nodemask(), queue_pages_range() or
 * mbind_range() reported.
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct migration_mpol mmpol;
	struct vma_iterator vmi;
	struct mempolicy *new;
	LIST_HEAD(pagelist);
	unsigned long end;
	long nr_failed;
	bool moving;
	bool migrate;
	long err;

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * With the default policy (NULL from mpol_new()), operating on a
	 * discontinuous address space is okay after all.
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	/* The MOVE bits of flags do not change from here on */
	moving = flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL);
	if (moving)
		lru_cache_disable();

	{
		NODEMASK_SCRATCH(scratch);

		err = -ENOMEM;
		if (scratch) {
			/* On success the mmap_lock stays held for writing */
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		}
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed < 0) {
		err = nr_failed;
		nr_failed = 0;
	} else {
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			err = mbind_range(&vmi, vma, &prev, start, end, new);
			if (err)
				break;
		}
	}

	/* Neither err nor pagelist is touched again before migrate_pages() */
	migrate = !err && !list_empty(&pagelist);

	if (migrate) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!new) {
			new = get_task_policy(current);
			mpol_get(new);
		}
		mmpol.pol = new;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (new->mode == MPOL_INTERLEAVE ||
		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
			unsigned long addr = -EFAULT;
			struct folio *folio;
			unsigned int order;

			/* Find the first queued folio that is not KSM */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				/* Find a VMA in the range that maps it */
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				order = folio_order(folio);
				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (migrate)
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);

	if (nr_failed && (flags & MPOL_MF_STRICT))
		err = -EIO;
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(new);
	if (moving)
		lru_cache_enable();
	return err;
}
1555 
1556 /*
1557  * User space interface with variable sized bitmaps for nodelists.
1558  */
get_bitmap(unsigned long * mask,const unsigned long __user * nmask,unsigned long maxnode)1559 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1560 		      unsigned long maxnode)
1561 {
1562 	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1563 	int ret;
1564 
1565 	if (in_compat_syscall())
1566 		ret = compat_get_bitmap(mask,
1567 					(const compat_ulong_t __user *)nmask,
1568 					maxnode);
1569 	else
1570 		ret = copy_from_user(mask, nmask,
1571 				     nlongs * sizeof(unsigned long));
1572 
1573 	if (ret)
1574 		return -EFAULT;
1575 
1576 	if (maxnode % BITS_PER_LONG)
1577 		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1578 
1579 	return 0;
1580 }
1581 
1582 /* Copy a node mask from user space. */
get_nodes(nodemask_t * nodes,const unsigned long __user * nmask,unsigned long maxnode)1583 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1584 		     unsigned long maxnode)
1585 {
1586 	--maxnode;
1587 	nodes_clear(*nodes);
1588 	if (maxnode == 0 || !nmask)
1589 		return 0;
1590 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1591 		return -EINVAL;
1592 
1593 	/*
1594 	 * When the user specified more nodes than supported just check
1595 	 * if the non supported part is all zero, one word at a time,
1596 	 * starting at the end.
1597 	 */
1598 	while (maxnode > MAX_NUMNODES) {
1599 		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1600 		unsigned long t;
1601 
1602 		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1603 			return -EFAULT;
1604 
1605 		if (maxnode - bits >= MAX_NUMNODES) {
1606 			maxnode -= bits;
1607 		} else {
1608 			maxnode = MAX_NUMNODES;
1609 			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1610 		}
1611 		if (t)
1612 			return -EINVAL;
1613 	}
1614 
1615 	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1616 }
1617 
1618 /* Copy a kernel node mask to user space */
copy_nodes_to_user(unsigned long __user * mask,unsigned long maxnode,nodemask_t * nodes)1619 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1620 			      nodemask_t *nodes)
1621 {
1622 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1623 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1624 	bool compat = in_compat_syscall();
1625 
1626 	if (compat)
1627 		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1628 
1629 	if (copy > nbytes) {
1630 		if (copy > PAGE_SIZE)
1631 			return -EINVAL;
1632 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1633 			return -EFAULT;
1634 		copy = nbytes;
1635 		maxnode = nr_node_ids;
1636 	}
1637 
1638 	if (compat)
1639 		return compat_put_bitmap((compat_ulong_t __user *)mask,
1640 					 nodes_addr(*nodes), maxnode);
1641 
1642 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1643 }
1644 
1645 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
sanitize_mpol_flags(int * mode,unsigned short * flags)1646 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1647 {
1648 	*flags = *mode & MPOL_MODE_FLAGS;
1649 	*mode &= ~MPOL_MODE_FLAGS;
1650 
1651 	if ((unsigned int)(*mode) >=  MPOL_MAX)
1652 		return -EINVAL;
1653 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1654 		return -EINVAL;
1655 	if (*flags & MPOL_F_NUMA_BALANCING) {
1656 		if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
1657 			*flags |= (MPOL_F_MOF | MPOL_F_MORON);
1658 		else
1659 			return -EINVAL;
1660 	}
1661 	return 0;
1662 }
1663 
/*
 * Common mbind() front end: untag @start, split @mode into mode and mode
 * flags, fetch the user nodemask and hand everything to do_mbind().
 *
 * Returns the first error from sanitize_mpol_flags() or get_nodes(),
 * otherwise the result of do_mbind().
 */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	int pol_mode = mode;
	nodemask_t nodes;
	int err;

	start = untagged_addr(start);

	err = sanitize_mpol_flags(&pol_mode, &mode_flags);
	if (!err)
		err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, pol_mode, mode_flags, &nodes, flags);
}
1684 
/*
 * set_mempolicy_home_node(): set the home node of the policies of all VMAs
 * overlapping [@start, @start + @len).
 *
 * @start:     page-aligned start address (may carry an address tag).
 * @len:       length of the range; rounded up to a page multiple.
 * @home_node: node to record as home node; must be online.
 * @flags:     reserved for future extension, must be 0.
 *
 * VMAs without a policy are skipped.  A VMA whose policy is neither
 * MPOL_BIND nor MPOL_PREFERRED_MANY aborts the walk with -EOPNOTSUPP; VMAs
 * already updated at that point keep their new home node.
 *
 * Returns 0 on success or for an empty range, -EINVAL for bad arguments,
 * -ENOENT when no VMA in the range had its policy updated, -EOPNOTSUPP as
 * described above, or the error from mpol_dup() / mbind_range().
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	struct vma_iterator vmi;
	unsigned long end;
	int err = -ENOENT;

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	/*
	 * The iterator must be positioned with the untagged address: set up
	 * from a tagged @start it would begin the walk at the tagged value
	 * instead of at the VMAs the caller meant.
	 */
	vma_iter_init(&vmi, mm, start);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		/* Drop our reference on the temporary copy */
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1751 
/* mbind() system call: passes all arguments unchanged to kernel_mbind(). */
SYSCALL_DEFINE6(mbind,
		unsigned long, start, unsigned long, len,
		unsigned long, mode,
		const unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1758 
1759 /* Set the process memory policy */
kernel_set_mempolicy(int mode,const unsigned long __user * nmask,unsigned long maxnode)1760 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1761 				 unsigned long maxnode)
1762 {
1763 	unsigned short mode_flags;
1764 	nodemask_t nodes;
1765 	int lmode = mode;
1766 	int err;
1767 
1768 	err = sanitize_mpol_flags(&lmode, &mode_flags);
1769 	if (err)
1770 		return err;
1771 
1772 	err = get_nodes(&nodes, nmask, maxnode);
1773 	if (err)
1774 		return err;
1775 
1776 	return do_set_mempolicy(lmode, mode_flags, &nodes);
1777 }
1778 
/* set_mempolicy() system call: thin wrapper around kernel_set_mempolicy(). */
SYSCALL_DEFINE3(set_mempolicy,
		int, mode,
		const unsigned long __user *, nmask, unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1784 
/*
 * Back end of migrate_pages(): move the pages of task @pid (or of the
 * current task when @pid is 0) from the nodes in @old_nodes to the nodes
 * in @new_nodes.
 *
 * Returns the result of do_migrate_pages() on success, or a negative errno:
 * -ENOMEM when the scratch nodemasks cannot be allocated, the error from
 * get_nodes(), -ESRCH when no task matches @pid, -EPERM when the caller may
 * not access the task or may not use the target nodes, -EINVAL when no
 * target node remains after masking or the task has no mm, or the error
 * from security_task_movememory().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct task_struct *task;
	struct mm_struct *mm;
	nodemask_t allowed;
	nodemask_t *old;
	nodemask_t *new;
	bool may_access;
	int move_flags;
	int err;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (!err)
		err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the task and take a reference on it */
	rcu_read_lock();
	if (pid)
		task = find_task_by_vpid(pid);
	else
		task = current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	may_access = ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS);
	rcu_read_unlock();
	if (!may_access) {
		err = -EPERM;
		goto out_put;
	}

	/* Is the user allowed to access the target nodes? */
	allowed = cpuset_mems_allowed(task);
	if (!nodes_subset(*new, allowed) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/* Restrict the targets to nodes the caller itself may use */
	allowed = cpuset_mems_allowed(current);
	nodes_and(*new, *new, allowed);
	if (nodes_empty(*new)) {
		err = -EINVAL;
		goto out_put;
	}

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	if (capable(CAP_SYS_NICE))
		move_flags = MPOL_MF_MOVE_ALL;
	else
		move_flags = MPOL_MF_MOVE;
	err = do_migrate_pages(mm, old, new, move_flags);

	mmput(mm);
	goto out;

out_put:
	put_task_struct(task);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;
}
1871 
/* migrate_pages() system call: thin wrapper around kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages,
		pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1878 
1879 /* Retrieve NUMA policy */
kernel_get_mempolicy(int __user * policy,unsigned long __user * nmask,unsigned long maxnode,unsigned long addr,unsigned long flags)1880 static int kernel_get_mempolicy(int __user *policy,
1881 				unsigned long __user *nmask,
1882 				unsigned long maxnode,
1883 				unsigned long addr,
1884 				unsigned long flags)
1885 {
1886 	int err;
1887 	int pval;
1888 	nodemask_t nodes;
1889 
1890 	if (nmask != NULL && maxnode < nr_node_ids)
1891 		return -EINVAL;
1892 
1893 	addr = untagged_addr(addr);
1894 
1895 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1896 
1897 	if (err)
1898 		return err;
1899 
1900 	if (policy && put_user(pval, policy))
1901 		return -EFAULT;
1902 
1903 	if (nmask)
1904 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1905 
1906 	return err;
1907 }
1908 
/* get_mempolicy() system call: thin wrapper around kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy,
		int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1915 
vma_migratable(struct vm_area_struct * vma)1916 bool vma_migratable(struct vm_area_struct *vma)
1917 {
1918 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1919 		return false;
1920 
1921 	/*
1922 	 * DAX device mappings require predictable access latency, so avoid
1923 	 * incurring periodic faults.
1924 	 */
1925 	if (vma_is_dax(vma))
1926 		return false;
1927 
1928 	if (is_vm_hugetlb_page(vma) &&
1929 		!hugepage_migration_supported(hstate_vma(vma)))
1930 		return false;
1931 
1932 	/*
1933 	 * Migration allocates pages in the highest zone. If we cannot
1934 	 * do so then migration (at least from node to node) is not
1935 	 * possible.
1936 	 */
1937 	if (vma->vm_file &&
1938 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1939 			< policy_zone)
1940 		return false;
1941 	return true;
1942 }
1943 
/*
 * Fetch the policy attached to @vma without falling back to the task
 * policy.  *@ilx is reset to 0 first; if the VMA provides a get_policy()
 * vm_op it is asked (and may update *@ilx), otherwise vma->vm_policy is
 * returned as is (possibly NULL).
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;

	if (vma->vm_ops && vma->vm_ops->get_policy)
		return vma->vm_ops->get_policy(vma, addr, ilx);

	return vma->vm_policy;
}
1951 
1952 /*
1953  * get_vma_policy(@vma, @addr, @order, @ilx)
1954  * @vma: virtual memory area whose policy is sought
1955  * @addr: address in @vma for shared policy lookup
1956  * @order: 0, or appropriate huge_page_order for interleaving
1957  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
1958  *       MPOL_WEIGHTED_INTERLEAVE
1959  *
1960  * Returns effective policy for a VMA at specified address.
1961  * Falls back to current->mempolicy or system default policy, as necessary.
1962  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1963  * count--added by the get_policy() vm_op, as appropriate--to protect against
1964  * freeing by another task.  It is the caller's responsibility to free the
1965  * extra reference for shared policies.
1966  */
get_vma_policy(struct vm_area_struct * vma,unsigned long addr,int order,pgoff_t * ilx)1967 struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1968 				 unsigned long addr, int order, pgoff_t *ilx)
1969 {
1970 	struct mempolicy *pol;
1971 
1972 	pol = __get_vma_policy(vma, addr, ilx);
1973 	if (!pol)
1974 		pol = get_task_policy(current);
1975 	if (pol->mode == MPOL_INTERLEAVE ||
1976 	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1977 		*ilx += vma->vm_pgoff >> order;
1978 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1979 	}
1980 	return pol;
1981 }
1982 
vma_policy_mof(struct vm_area_struct * vma)1983 bool vma_policy_mof(struct vm_area_struct *vma)
1984 {
1985 	struct mempolicy *pol;
1986 
1987 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1988 		bool ret = false;
1989 		pgoff_t ilx;		/* ignored here */
1990 
1991 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
1992 		if (pol && (pol->flags & MPOL_F_MOF))
1993 			ret = true;
1994 		mpol_cond_put(pol);
1995 
1996 		return ret;
1997 	}
1998 
1999 	pol = vma->vm_policy;
2000 	if (!pol)
2001 		pol = get_task_policy(current);
2002 
2003 	return pol->flags & MPOL_F_MOF;
2004 }
2005 
apply_policy_zone(struct mempolicy * policy,enum zone_type zone)2006 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
2007 {
2008 	enum zone_type dynamic_policy_zone = policy_zone;
2009 
2010 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2011 
2012 	/*
2013 	 * if policy->nodes has movable memory only,
2014 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
2015 	 *
2016 	 * policy->nodes is intersect with node_states[N_MEMORY].
2017 	 * so if the following test fails, it implies
2018 	 * policy->nodes has movable memory only.
2019 	 */
2020 	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2021 		dynamic_policy_zone = ZONE_MOVABLE;
2022 
2023 	return zone >= dynamic_policy_zone;
2024 }
2025 
weighted_interleave_nodes(struct mempolicy * policy)2026 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
2027 {
2028 	unsigned int node;
2029 	unsigned int cpuset_mems_cookie;
2030 
2031 retry:
2032 	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
2033 	cpuset_mems_cookie = read_mems_allowed_begin();
2034 	node = current->il_prev;
2035 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
2036 		node = next_node_in(node, policy->nodes);
2037 		if (read_mems_allowed_retry(cpuset_mems_cookie))
2038 			goto retry;
2039 		if (node == MAX_NUMNODES)
2040 			return node;
2041 		current->il_prev = node;
2042 		current->il_weight = get_il_weight(node);
2043 	}
2044 	current->il_weight--;
2045 	return node;
2046 }
2047 
2048 /* Do dynamic interleaving for a process */
interleave_nodes(struct mempolicy * policy)2049 static unsigned int interleave_nodes(struct mempolicy *policy)
2050 {
2051 	unsigned int nid;
2052 	unsigned int cpuset_mems_cookie;
2053 
2054 	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
2055 	do {
2056 		cpuset_mems_cookie = read_mems_allowed_begin();
2057 		nid = next_node_in(current->il_prev, policy->nodes);
2058 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2059 
2060 	if (nid < MAX_NUMNODES)
2061 		current->il_prev = nid;
2062 	return nid;
2063 }
2064 
2065 /*
2066  * Depending on the memory policy provide a node from which to allocate the
2067  * next slab entry.
2068  */
mempolicy_slab_node(void)2069 unsigned int mempolicy_slab_node(void)
2070 {
2071 	struct mempolicy *policy;
2072 	int node = numa_mem_id();
2073 
2074 	if (!in_task())
2075 		return node;
2076 
2077 	policy = current->mempolicy;
2078 	if (!policy)
2079 		return node;
2080 
2081 	switch (policy->mode) {
2082 	case MPOL_PREFERRED:
2083 		return first_node(policy->nodes);
2084 
2085 	case MPOL_INTERLEAVE:
2086 		return interleave_nodes(policy);
2087 
2088 	case MPOL_WEIGHTED_INTERLEAVE:
2089 		return weighted_interleave_nodes(policy);
2090 
2091 	case MPOL_BIND:
2092 	case MPOL_PREFERRED_MANY:
2093 	{
2094 		struct zoneref *z;
2095 
2096 		/*
2097 		 * Follow bind policy behavior and start allocation at the
2098 		 * first node.
2099 		 */
2100 		struct zonelist *zonelist;
2101 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2102 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2103 		z = first_zones_zonelist(zonelist, highest_zoneidx,
2104 							&policy->nodes);
2105 		return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2106 	}
2107 	case MPOL_LOCAL:
2108 		return node;
2109 
2110 	default:
2111 		BUG();
2112 	}
2113 }
2114 
/*
 * Snapshot pol->nodes into the caller-provided @mask and return the number
 * of nodes set in that snapshot.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	/*
	 * The compiler barriers around the copy stabilize the nodemask
	 * locally so that it can be iterated over safely without concern
	 * for changes. Allocators validate node selection does not violate
	 * mems_allowed, so this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(*mask));
	barrier();

	return nodes_weight(*mask);
}
2128 
/*
 * Do static weighted interleaving for interleave index @ilx.
 *
 * Works on a local snapshot of pol->nodes.  @ilx is reduced modulo the sum
 * of the weights of the nodes in the snapshot, then the nodes are walked in
 * mask order, subtracting each node's weight, until the remainder falls
 * inside a node's share.  Returns numa_node_id() if the snapshot is empty.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *state;
	unsigned int weight_total = 0;
	unsigned int target;
	nodemask_t nodemask;
	u8 *table = NULL;
	u8 weight;
	int nid = 0;

	if (!read_once_policy_nodemask(pol, &nodemask))
		return numa_node_id();

	rcu_read_lock();

	/* Uninitialized wi_state means we should assume all weights are 1 */
	state = rcu_dereference(wi_state);
	if (state)
		table = state->iw_table;

	/* calculate the total weight */
	for_each_node_mask(nid, nodemask)
		weight_total += table ? table[nid] : 1;

	/* Calculate the node offset based on totals */
	target = ilx % weight_total;

	for (nid = first_node(nodemask); target;
	     nid = next_node_in(nid, nodemask)) {
		/* detect system default usage */
		weight = table ? table[nid] : 1;
		if (target < weight)
			break;
		target -= weight;
	}

	rcu_read_unlock();
	return nid;
}
2168 
2169 /*
2170  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
2171  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2172  * exceeds the number of present nodes.
2173  */
interleave_nid(struct mempolicy * pol,pgoff_t ilx)2174 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2175 {
2176 	nodemask_t nodemask;
2177 	unsigned int target, nnodes;
2178 	int i;
2179 	int nid;
2180 
2181 	nnodes = read_once_policy_nodemask(pol, &nodemask);
2182 	if (!nnodes)
2183 		return numa_node_id();
2184 	target = ilx % nnodes;
2185 	nid = first_node(nodemask);
2186 	for (i = 0; i < target; i++)
2187 		nid = next_node(nid, nodemask);
2188 	return nid;
2189 }
2190 
2191 /*
2192  * Return a nodemask representing a mempolicy for filtering nodes for
2193  * page allocation, together with preferred node id (or the input node id).
2194  */
policy_nodemask(gfp_t gfp,struct mempolicy * pol,pgoff_t ilx,int * nid)2195 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2196 				   pgoff_t ilx, int *nid)
2197 {
2198 	nodemask_t *nodemask = NULL;
2199 
2200 	switch (pol->mode) {
2201 	case MPOL_PREFERRED:
2202 		/* Override input node id */
2203 		*nid = first_node(pol->nodes);
2204 		break;
2205 	case MPOL_PREFERRED_MANY:
2206 		nodemask = &pol->nodes;
2207 		if (pol->home_node != NUMA_NO_NODE)
2208 			*nid = pol->home_node;
2209 		break;
2210 	case MPOL_BIND:
2211 		/* Restrict to nodemask (but not on lower zones) */
2212 		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2213 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2214 			nodemask = &pol->nodes;
2215 		if (pol->home_node != NUMA_NO_NODE)
2216 			*nid = pol->home_node;
2217 		/*
2218 		 * __GFP_THISNODE shouldn't even be used with the bind policy
2219 		 * because we might easily break the expectation to stay on the
2220 		 * requested node and not break the policy.
2221 		 */
2222 		WARN_ON_ONCE(gfp & __GFP_THISNODE);
2223 		break;
2224 	case MPOL_INTERLEAVE:
2225 		/* Override input node id */
2226 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2227 			interleave_nodes(pol) : interleave_nid(pol, ilx);
2228 		break;
2229 	case MPOL_WEIGHTED_INTERLEAVE:
2230 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2231 			weighted_interleave_nodes(pol) :
2232 			weighted_interleave_nid(pol, ilx);
2233 		break;
2234 	}
2235 
2236 	return nodemask;
2237 }
2238 
2239 #ifdef CONFIG_HUGETLBFS
2240 /*
2241  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2242  * @vma: virtual memory area whose policy is sought
2243  * @addr: address in @vma for shared policy lookup and interleave policy
2244  * @gfp_flags: for requested zone
2245  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2246  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2247  *
2248  * Returns a nid suitable for a huge page allocation and a pointer
2249  * to the struct mempolicy for conditional unref after allocation.
2250  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2251  * to the mempolicy's @nodemask for filtering the zonelist.
2252  */
huge_node(struct vm_area_struct * vma,unsigned long addr,gfp_t gfp_flags,struct mempolicy ** mpol,nodemask_t ** nodemask)2253 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2254 		struct mempolicy **mpol, nodemask_t **nodemask)
2255 {
2256 	pgoff_t ilx;
2257 	int nid;
2258 
2259 	nid = numa_node_id();
2260 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2261 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
2262 	return nid;
2263 }
2264 
2265 /*
2266  * init_nodemask_of_mempolicy
2267  *
2268  * If the current task's mempolicy is "default" [NULL], return 'false'
2269  * to indicate default policy.  Otherwise, extract the policy nodemask
2270  * for 'bind' or 'interleave' policy into the argument nodemask, or
2271  * initialize the argument nodemask to contain the single node for
2272  * 'preferred' or 'local' policy and return 'true' to indicate presence
2273  * of non-default mempolicy.
2274  *
2275  * We don't bother with reference counting the mempolicy [mpol_get/put]
2276  * because the current task is examining it's own mempolicy and a task's
2277  * mempolicy is only ever changed by the task itself.
2278  *
2279  * N.B., it is the caller's responsibility to free a returned nodemask.
2280  */
init_nodemask_of_mempolicy(nodemask_t * mask)2281 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2282 {
2283 	struct mempolicy *mempolicy;
2284 
2285 	if (!(mask && current->mempolicy))
2286 		return false;
2287 
2288 	task_lock(current);
2289 	mempolicy = current->mempolicy;
2290 	switch (mempolicy->mode) {
2291 	case MPOL_PREFERRED:
2292 	case MPOL_PREFERRED_MANY:
2293 	case MPOL_BIND:
2294 	case MPOL_INTERLEAVE:
2295 	case MPOL_WEIGHTED_INTERLEAVE:
2296 		*mask = mempolicy->nodes;
2297 		break;
2298 
2299 	case MPOL_LOCAL:
2300 		init_nodemask_of_node(mask, numa_node_id());
2301 		break;
2302 
2303 	default:
2304 		BUG();
2305 	}
2306 	task_unlock(current);
2307 
2308 	return true;
2309 }
2310 #endif
2311 
2312 /*
2313  * mempolicy_in_oom_domain
2314  *
2315  * If tsk's mempolicy is "bind", check for intersection between mask and
2316  * the policy nodemask. Otherwise, return true for all other policies
2317  * including "interleave", as a tsk with "interleave" policy may have
2318  * memory allocated from all nodes in system.
2319  *
2320  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2321  */
mempolicy_in_oom_domain(struct task_struct * tsk,const nodemask_t * mask)2322 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2323 					const nodemask_t *mask)
2324 {
2325 	struct mempolicy *mempolicy;
2326 	bool ret = true;
2327 
2328 	if (!mask)
2329 		return ret;
2330 
2331 	task_lock(tsk);
2332 	mempolicy = tsk->mempolicy;
2333 	if (mempolicy && mempolicy->mode == MPOL_BIND)
2334 		ret = nodes_intersects(mempolicy->nodes, *mask);
2335 	task_unlock(tsk);
2336 
2337 	return ret;
2338 }
2339 
/*
 * Allocate for an MPOL_PREFERRED_MANY policy.
 *
 * This is a two pass approach. The first pass will only try the
 * preferred nodes but skip the direct reclaim and allow the
 * allocation to fail, while the second pass will try all the
 * nodes in system.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
						int nid, nodemask_t *nodemask)
{
	gfp_t preferred_gfp = (gfp | __GFP_NOWARN) &
			      ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	struct page *page;

	/* pass 1: preferred nodes only, quiet and cheap */
	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
	if (page)
		return page;

	/* pass 2: caller's original flags, no nodemask restriction */
	return __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
}
2360 
2361 /**
2362  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2363  * @gfp: GFP flags.
2364  * @order: Order of the page allocation.
2365  * @pol: Pointer to the NUMA mempolicy.
2366  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2367  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2368  *
2369  * Return: The page on success or NULL if allocation fails.
2370  */
alloc_pages_mpol(gfp_t gfp,unsigned int order,struct mempolicy * pol,pgoff_t ilx,int nid)2371 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2372 		struct mempolicy *pol, pgoff_t ilx, int nid)
2373 {
2374 	nodemask_t *nodemask;
2375 	struct page *page;
2376 
2377 	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2378 
2379 	if (pol->mode == MPOL_PREFERRED_MANY)
2380 		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2381 
2382 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2383 	    /* filter "hugepage" allocation, unless from alloc_pages() */
2384 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2385 		/*
2386 		 * For hugepage allocation and non-interleave policy which
2387 		 * allows the current node (or other explicitly preferred
2388 		 * node) we only try to allocate from the current/preferred
2389 		 * node and don't fall back to other nodes, as the cost of
2390 		 * remote accesses would likely offset THP benefits.
2391 		 *
2392 		 * If the policy is interleave or does not allow the current
2393 		 * node in its nodemask, we allocate the standard way.
2394 		 */
2395 		if (pol->mode != MPOL_INTERLEAVE &&
2396 		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2397 		    (!nodemask || node_isset(nid, *nodemask))) {
2398 			/*
2399 			 * First, try to allocate THP only on local node, but
2400 			 * don't reclaim unnecessarily, just compact.
2401 			 */
2402 			page = __alloc_frozen_pages_noprof(
2403 				gfp | __GFP_THISNODE | __GFP_NORETRY, order,
2404 				nid, NULL);
2405 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2406 				return page;
2407 			/*
2408 			 * If hugepage allocations are configured to always
2409 			 * synchronous compact or the vma has been madvised
2410 			 * to prefer hugepage backing, retry allowing remote
2411 			 * memory with both reclaim and compact as well.
2412 			 */
2413 		}
2414 	}
2415 
2416 	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2417 
2418 	if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2419 		     pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2420 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2421 		if (static_branch_likely(&vm_numa_stat_key) &&
2422 		    page_to_nid(page) == nid) {
2423 			preempt_disable();
2424 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2425 			preempt_enable();
2426 		}
2427 	}
2428 
2429 	return page;
2430 }
2431 
/*
 * Folio flavour of alloc_pages_mpol(): allocates with __GFP_COMP added to
 * @gfp, marks the resulting page refcounted and converts it with
 * page_rmappable_folio().  Returns NULL if the allocation fails.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *page;

	page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid);
	if (page) {
		set_page_refcounted(page);
		return page_rmappable_folio(page);
	}

	return NULL;
}
2443 
2444 /**
2445  * vma_alloc_folio - Allocate a folio for a VMA.
2446  * @gfp: GFP flags.
2447  * @order: Order of the folio.
2448  * @vma: Pointer to VMA.
2449  * @addr: Virtual address of the allocation.  Must be inside @vma.
2450  *
2451  * Allocate a folio for a specific address in @vma, using the appropriate
2452  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
2453  * VMA to prevent it from going away.  Should be used for all allocations
2454  * for folios that will be mapped into user space, excepting hugetlbfs, and
2455  * excepting where direct use of folio_alloc_mpol() is more appropriate.
2456  *
2457  * Return: The folio on success or NULL if allocation fails.
2458  */
vma_alloc_folio_noprof(gfp_t gfp,int order,struct vm_area_struct * vma,unsigned long addr)2459 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
2460 		unsigned long addr)
2461 {
2462 	struct mempolicy *pol;
2463 	pgoff_t ilx;
2464 	struct folio *folio;
2465 
2466 	if (vma->vm_flags & VM_DROPPABLE)
2467 		gfp |= __GFP_NOWARN;
2468 
2469 	pol = get_vma_policy(vma, addr, order, &ilx);
2470 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2471 	mpol_cond_put(pol);
2472 	return folio;
2473 }
2474 EXPORT_SYMBOL(vma_alloc_folio_noprof);
2475 
/*
 * Allocate 1 << @order pages through alloc_pages_mpol(), using
 * NO_INTERLEAVE_INDEX and the local node as starting point.  The current
 * task's policy is used, except in interrupt context or when
 * __GFP_THISNODE is set, where the system default_policy applies.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;
	else
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());
}
2490 
2491 /**
2492  * alloc_pages - Allocate pages.
2493  * @gfp: GFP flags.
2494  * @order: Power of two of number of pages to allocate.
2495  *
2496  * Allocate 1 << @order contiguous pages.  The physical address of the
2497  * first page is naturally aligned (eg an order-3 allocation will be aligned
2498  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2499  * process is honoured when in process context.
2500  *
2501  * Context: Can be called from any context, providing the appropriate GFP
2502  * flags are used.
2503  * Return: The page on success or NULL if allocation fails.
2504  */
alloc_pages_noprof(gfp_t gfp,unsigned int order)2505 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
2506 {
2507 	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2508 
2509 	if (page)
2510 		set_page_refcounted(page);
2511 	return page;
2512 }
2513 EXPORT_SYMBOL(alloc_pages_noprof);
2514 
/*
 * Allocate a folio of the given @order: alloc_pages_noprof() with
 * __GFP_COMP added, with the result passed through page_rmappable_folio().
 */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);

	return page_rmappable_folio(page);
}
EXPORT_SYMBOL(folio_alloc_noprof);
2520 
/*
 * Bulk-allocate @nr_pages pages spread over the nodes of an MPOL_INTERLEAVE
 * policy.
 *
 * The request is split into nodes_weight(pol->nodes) chunks of
 * nr_pages / nodes pages each; the first (nr_pages % nodes) chunks get one
 * extra page.  Every chunk is requested from the node handed out by
 * interleave_nodes(), and @page_array is advanced by the number of pages
 * actually obtained.
 *
 * Returns the total number of pages stored in @page_array.  If the policy
 * nodemask is observed empty nothing is allocated and 0 is returned, which
 * matches what alloc_pages_bulk_weighted_interleave() does and avoids a
 * division by zero.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nodes;
	unsigned long nr_pages_per_node;
	int delta;
	int i;
	unsigned long nr_allocated;
	unsigned long total_allocated = 0;

	nodes = nodes_weight(pol->nodes);
	/* if the nodemask has become invalid, we cannot do anything */
	if (!nodes)
		return 0;

	nr_pages_per_node = nr_pages / nodes;
	delta = nr_pages - nodes * nr_pages_per_node;

	for (i = 0; i < nodes; i++) {
		unsigned long chunk = nr_pages_per_node;

		/* hand the remainder out one page at a time */
		if (delta) {
			chunk++;
			delta--;
		}

		nr_allocated = alloc_pages_bulk_noprof(gfp,
				interleave_nodes(pol), NULL,
				chunk, page_array);

		page_array += nr_allocated;
		total_allocated += nr_allocated;
	}

	return total_allocated;
}
2555 
/*
 * Bulk-allocate @nr_pages pages for an MPOL_WEIGHTED_INTERLEAVE policy.
 *
 * Any weight still pending on current->il_prev is consumed first.  The
 * remaining pages are then divided into whole rounds over the sum of the
 * node weights plus a partial round, so that each node sees at most one
 * bulk request.  current->il_prev / current->il_weight are updated so the
 * next weighted-interleave allocation resumes where this one stopped.
 *
 * Returns the number of pages stored in @page_array; 0 if @nr_pages is 0 or
 * the policy nodemask snapshot is empty.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *tsk = current;
	unsigned long remaining = nr_pages;
	unsigned long total = 0;
	unsigned long got;
	unsigned long want, delta;
	unsigned long rounds;
	unsigned int weight_total = 0;
	unsigned int cookie;
	u8 *weights, weight;
	u8 resume_weight;
	bool have_table;
	nodemask_t mask;
	int nnodes, node;
	int resume_node;
	int prev_node;
	int idx;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &mask);
	} while (read_mems_allowed_retry(cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = tsk->il_prev;
	weight = tsk->il_weight;
	if (weight && node_isset(node, mask)) {
		want = min(remaining, weight);
		got = __alloc_pages_bulk(gfp, node, NULL, want, page_array);
		page_array += got;
		total += got;
		/* if that's all the pages, no need to interleave */
		if (remaining <= weight) {
			tsk->il_weight -= remaining;
			return total;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		remaining -= weight;
	}
	/* clear active weight in case of an allocation failure */
	tsk->il_weight = 0;
	prev_node = node;

	/* create a local copy of node weights to operate on outside rcu */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	have_table = state != NULL;
	if (have_table)
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
	rcu_read_unlock();

	/* no weight table installed: every node weighs 1 */
	if (!have_table) {
		for (idx = 0; idx < nr_node_ids; idx++)
			weights[idx] = 1;
	}

	/* calculate total, detect system default usage */
	for_each_node_mask(node, mask)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = remaining / weight_total;
	delta = remaining % weight_total;
	resume_node = next_node_in(prev_node, mask);
	resume_weight = weights[resume_node];

	for (idx = 0; idx < nnodes; idx++) {
		node = next_node_in(prev_node, mask);
		weight = weights[node];
		want = weight * rounds;

		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			want += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			want += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}

		/* want can be 0 if an allocation fails and rounds == 0 */
		if (!want)
			break;

		got = __alloc_pages_bulk(gfp, node, NULL, want, page_array);
		page_array += got;
		total += got;
		if (total == nr_pages)
			break;

		prev_node = node;
	}

	tsk->il_prev = resume_node;
	tsk->il_weight = resume_weight;
	kfree(weights);

	return total;
}
2673 
/*
 * Bulk allocation for an MPOL_PREFERRED_MANY policy, in two passes.
 *
 * Pass one is restricted to pol->nodes and runs with __GFP_NOWARN set and
 * __GFP_DIRECT_RECLAIM / __GFP_NOFAIL cleared.  If it comes up short, pass
 * two asks for the missing pages with the caller's original @gfp, starting
 * from the local node and without a nodemask.  Returns the total number of
 * pages placed in @page_array.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t preferred_gfp = (gfp | __GFP_NOWARN) &
			      ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	unsigned long done;

	done = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
				       nr_pages, page_array);
	if (done >= nr_pages)
		return done;

	done += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
					nr_pages - done,
					page_array + done);
	return done;
}
2693 
2694 /* alloc pages bulk and mempolicy should be considered at the
2695  * same time in some situation such as vmalloc.
2696  *
2697  * It can accelerate memory allocation especially interleaving
2698  * allocate memory.
2699  */
alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,unsigned long nr_pages,struct page ** page_array)2700 unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2701 		unsigned long nr_pages, struct page **page_array)
2702 {
2703 	struct mempolicy *pol = &default_policy;
2704 	nodemask_t *nodemask;
2705 	int nid;
2706 
2707 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2708 		pol = get_task_policy(current);
2709 
2710 	if (pol->mode == MPOL_INTERLEAVE)
2711 		return alloc_pages_bulk_interleave(gfp, pol,
2712 							 nr_pages, page_array);
2713 
2714 	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2715 		return alloc_pages_bulk_weighted_interleave(
2716 				  gfp, pol, nr_pages, page_array);
2717 
2718 	if (pol->mode == MPOL_PREFERRED_MANY)
2719 		return alloc_pages_bulk_preferred_many(gfp,
2720 				numa_node_id(), pol, nr_pages, page_array);
2721 
2722 	nid = numa_node_id();
2723 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2724 	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2725 				       nr_pages, page_array);
2726 }
2727 
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2728 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2729 {
2730 	struct mempolicy *pol = mpol_dup(src->vm_policy);
2731 
2732 	if (IS_ERR(pol))
2733 		return PTR_ERR(pol);
2734 	dst->vm_policy = pol;
2735 	return 0;
2736 }
2737 
2738 /*
2739  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2740  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2741  * with the mems_allowed returned by cpuset_mems_allowed().  This
2742  * keeps mempolicies cpuset relative after its cpuset moves.  See
2743  * further kernel/cpuset.c update_nodemask().
2744  *
2745  * current's mempolicy may be rebinded by the other task(the task that changes
2746  * cpuset's mems), so we needn't do rebind work for current task.
2747  */
2748 
2749 /* Slow path of a mempolicy duplicate */
__mpol_dup(struct mempolicy * old)2750 struct mempolicy *__mpol_dup(struct mempolicy *old)
2751 {
2752 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2753 
2754 	if (!new)
2755 		return ERR_PTR(-ENOMEM);
2756 
2757 	/* task's mempolicy is protected by alloc_lock */
2758 	if (old == current->mempolicy) {
2759 		task_lock(current);
2760 		*new = *old;
2761 		task_unlock(current);
2762 	} else
2763 		*new = *old;
2764 
2765 	if (current_cpuset_is_being_rebound()) {
2766 		nodemask_t mems = cpuset_mems_allowed(current);
2767 		mpol_rebind_policy(new, &mems);
2768 	}
2769 	atomic_set(&new->refcnt, 1);
2770 	return new;
2771 }
2772 
2773 /* Slow path of a mempolicy comparison */
__mpol_equal(struct mempolicy * a,struct mempolicy * b)2774 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2775 {
2776 	if (!a || !b)
2777 		return false;
2778 	if (a->mode != b->mode)
2779 		return false;
2780 	if (a->flags != b->flags)
2781 		return false;
2782 	if (a->home_node != b->home_node)
2783 		return false;
2784 	if (mpol_store_user_nodemask(a))
2785 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2786 			return false;
2787 
2788 	switch (a->mode) {
2789 	case MPOL_BIND:
2790 	case MPOL_INTERLEAVE:
2791 	case MPOL_PREFERRED:
2792 	case MPOL_PREFERRED_MANY:
2793 	case MPOL_WEIGHTED_INTERLEAVE:
2794 		return !!nodes_equal(a->nodes, b->nodes);
2795 	case MPOL_LOCAL:
2796 		return true;
2797 	default:
2798 		BUG();
2799 		return false;
2800 	}
2801 }
2802 
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */
2811 
2812 /*
2813  * lookup first element intersecting start-end.  Caller holds sp->lock for
2814  * reading or for writing
2815  */
sp_lookup(struct shared_policy * sp,pgoff_t start,pgoff_t end)2816 static struct sp_node *sp_lookup(struct shared_policy *sp,
2817 					pgoff_t start, pgoff_t end)
2818 {
2819 	struct rb_node *n = sp->root.rb_node;
2820 
2821 	while (n) {
2822 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2823 
2824 		if (start >= p->end)
2825 			n = n->rb_right;
2826 		else if (end <= p->start)
2827 			n = n->rb_left;
2828 		else
2829 			break;
2830 	}
2831 	if (!n)
2832 		return NULL;
2833 	for (;;) {
2834 		struct sp_node *w = NULL;
2835 		struct rb_node *prev = rb_prev(n);
2836 		if (!prev)
2837 			break;
2838 		w = rb_entry(prev, struct sp_node, nd);
2839 		if (w->end <= start)
2840 			break;
2841 		n = prev;
2842 	}
2843 	return rb_entry(n, struct sp_node, nd);
2844 }
2845 
2846 /*
2847  * Insert a new shared policy into the list.  Caller holds sp->lock for
2848  * writing.
2849  */
sp_insert(struct shared_policy * sp,struct sp_node * new)2850 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2851 {
2852 	struct rb_node **p = &sp->root.rb_node;
2853 	struct rb_node *parent = NULL;
2854 	struct sp_node *nd;
2855 
2856 	while (*p) {
2857 		parent = *p;
2858 		nd = rb_entry(parent, struct sp_node, nd);
2859 		if (new->start < nd->start)
2860 			p = &(*p)->rb_left;
2861 		else if (new->end > nd->end)
2862 			p = &(*p)->rb_right;
2863 		else
2864 			BUG();
2865 	}
2866 	rb_link_node(&new->nd, parent, p);
2867 	rb_insert_color(&new->nd, &sp->root);
2868 }
2869 
2870 /* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy * sp,pgoff_t idx)2871 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2872 						pgoff_t idx)
2873 {
2874 	struct mempolicy *pol = NULL;
2875 	struct sp_node *sn;
2876 
2877 	if (!sp->root.rb_node)
2878 		return NULL;
2879 	read_lock(&sp->lock);
2880 	sn = sp_lookup(sp, idx, idx+1);
2881 	if (sn) {
2882 		mpol_get(sn->policy);
2883 		pol = sn->policy;
2884 	}
2885 	read_unlock(&sp->lock);
2886 	return pol;
2887 }
2888 
sp_free(struct sp_node * n)2889 static void sp_free(struct sp_node *n)
2890 {
2891 	mpol_put(n->policy);
2892 	kmem_cache_free(sn_cache, n);
2893 }
2894 
2895 /**
2896  * mpol_misplaced - check whether current folio node is valid in policy
2897  *
2898  * @folio: folio to be checked
2899  * @vmf: structure describing the fault
2900  * @addr: virtual address in @vma for shared policy lookup and interleave policy
2901  *
2902  * Lookup current policy node id for vma,addr and "compare to" folio's
2903  * node id.  Policy determination "mimics" alloc_page_vma().
2904  * Called from fault path where we know the vma and faulting address.
2905  *
2906  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2907  * policy, or a suitable node ID to allocate a replacement folio from.
2908  */
mpol_misplaced(struct folio * folio,struct vm_fault * vmf,unsigned long addr)2909 int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
2910 		   unsigned long addr)
2911 {
2912 	struct mempolicy *pol;
2913 	pgoff_t ilx;
2914 	struct zoneref *z;
2915 	int curnid = folio_nid(folio);
2916 	struct vm_area_struct *vma = vmf->vma;
2917 	int thiscpu = raw_smp_processor_id();
2918 	int thisnid = numa_node_id();
2919 	int polnid = NUMA_NO_NODE;
2920 	int ret = NUMA_NO_NODE;
2921 
2922 	/*
2923 	 * Make sure ptl is held so that we don't preempt and we
2924 	 * have a stable smp processor id
2925 	 */
2926 	lockdep_assert_held(vmf->ptl);
2927 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
2928 	if (!(pol->flags & MPOL_F_MOF))
2929 		goto out;
2930 
2931 	switch (pol->mode) {
2932 	case MPOL_INTERLEAVE:
2933 		polnid = interleave_nid(pol, ilx);
2934 		break;
2935 
2936 	case MPOL_WEIGHTED_INTERLEAVE:
2937 		polnid = weighted_interleave_nid(pol, ilx);
2938 		break;
2939 
2940 	case MPOL_PREFERRED:
2941 		if (node_isset(curnid, pol->nodes))
2942 			goto out;
2943 		polnid = first_node(pol->nodes);
2944 		break;
2945 
2946 	case MPOL_LOCAL:
2947 		polnid = numa_node_id();
2948 		break;
2949 
2950 	case MPOL_BIND:
2951 	case MPOL_PREFERRED_MANY:
2952 		/*
2953 		 * Even though MPOL_PREFERRED_MANY can allocate pages outside
2954 		 * policy nodemask we don't allow numa migration to nodes
2955 		 * outside policy nodemask for now. This is done so that if we
2956 		 * want demotion to slow memory to happen, before allocating
2957 		 * from some DRAM node say 'x', we will end up using a
2958 		 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
2959 		 * we should not promote to node 'x' from slow memory node.
2960 		 */
2961 		if (pol->flags & MPOL_F_MORON) {
2962 			/*
2963 			 * Optimize placement among multiple nodes
2964 			 * via NUMA balancing
2965 			 */
2966 			if (node_isset(thisnid, pol->nodes))
2967 				break;
2968 			goto out;
2969 		}
2970 
2971 		/*
2972 		 * use current page if in policy nodemask,
2973 		 * else select nearest allowed node, if any.
2974 		 * If no allowed nodes, use current [!misplaced].
2975 		 */
2976 		if (node_isset(curnid, pol->nodes))
2977 			goto out;
2978 		z = first_zones_zonelist(
2979 				node_zonelist(thisnid, GFP_HIGHUSER),
2980 				gfp_zone(GFP_HIGHUSER),
2981 				&pol->nodes);
2982 		polnid = zonelist_node_idx(z);
2983 		break;
2984 
2985 	default:
2986 		BUG();
2987 	}
2988 
2989 	/* Migrate the folio towards the node whose CPU is referencing it */
2990 	if (pol->flags & MPOL_F_MORON) {
2991 		polnid = thisnid;
2992 
2993 		if (!should_numa_migrate_memory(current, folio, curnid,
2994 						thiscpu))
2995 			goto out;
2996 	}
2997 
2998 	if (curnid != polnid)
2999 		ret = polnid;
3000 out:
3001 	mpol_cond_put(pol);
3002 
3003 	return ret;
3004 }
3005 
3006 /*
3007  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
3008  * dropped after task->mempolicy is set to NULL so that any allocation done as
3009  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3010  * policy.
3011  */
mpol_put_task_policy(struct task_struct * task)3012 void mpol_put_task_policy(struct task_struct *task)
3013 {
3014 	struct mempolicy *pol;
3015 
3016 	task_lock(task);
3017 	pol = task->mempolicy;
3018 	task->mempolicy = NULL;
3019 	task_unlock(task);
3020 	mpol_put(pol);
3021 }
3022 
sp_delete(struct shared_policy * sp,struct sp_node * n)3023 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
3024 {
3025 	rb_erase(&n->nd, &sp->root);
3026 	sp_free(n);
3027 }
3028 
sp_node_init(struct sp_node * node,unsigned long start,unsigned long end,struct mempolicy * pol)3029 static void sp_node_init(struct sp_node *node, unsigned long start,
3030 			unsigned long end, struct mempolicy *pol)
3031 {
3032 	node->start = start;
3033 	node->end = end;
3034 	node->policy = pol;
3035 }
3036 
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)3037 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3038 				struct mempolicy *pol)
3039 {
3040 	struct sp_node *n;
3041 	struct mempolicy *newpol;
3042 
3043 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3044 	if (!n)
3045 		return NULL;
3046 
3047 	newpol = mpol_dup(pol);
3048 	if (IS_ERR(newpol)) {
3049 		kmem_cache_free(sn_cache, n);
3050 		return NULL;
3051 	}
3052 	newpol->flags |= MPOL_F_SHARED;
3053 	sp_node_init(n, start, end, newpol);
3054 
3055 	return n;
3056 }
3057 
3058 /* Replace a policy range. */
shared_policy_replace(struct shared_policy * sp,pgoff_t start,pgoff_t end,struct sp_node * new)3059 static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
3060 				 pgoff_t end, struct sp_node *new)
3061 {
3062 	struct sp_node *n;
3063 	struct sp_node *n_new = NULL;
3064 	struct mempolicy *mpol_new = NULL;
3065 	int ret = 0;
3066 
3067 restart:
3068 	write_lock(&sp->lock);
3069 	n = sp_lookup(sp, start, end);
3070 	/* Take care of old policies in the same range. */
3071 	while (n && n->start < end) {
3072 		struct rb_node *next = rb_next(&n->nd);
3073 		if (n->start >= start) {
3074 			if (n->end <= end)
3075 				sp_delete(sp, n);
3076 			else
3077 				n->start = end;
3078 		} else {
3079 			/* Old policy spanning whole new range. */
3080 			if (n->end > end) {
3081 				if (!n_new)
3082 					goto alloc_new;
3083 
3084 				*mpol_new = *n->policy;
3085 				atomic_set(&mpol_new->refcnt, 1);
3086 				sp_node_init(n_new, end, n->end, mpol_new);
3087 				n->end = start;
3088 				sp_insert(sp, n_new);
3089 				n_new = NULL;
3090 				mpol_new = NULL;
3091 				break;
3092 			} else
3093 				n->end = start;
3094 		}
3095 		if (!next)
3096 			break;
3097 		n = rb_entry(next, struct sp_node, nd);
3098 	}
3099 	if (new)
3100 		sp_insert(sp, new);
3101 	write_unlock(&sp->lock);
3102 	ret = 0;
3103 
3104 err_out:
3105 	if (mpol_new)
3106 		mpol_put(mpol_new);
3107 	if (n_new)
3108 		kmem_cache_free(sn_cache, n_new);
3109 
3110 	return ret;
3111 
3112 alloc_new:
3113 	write_unlock(&sp->lock);
3114 	ret = -ENOMEM;
3115 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3116 	if (!n_new)
3117 		goto err_out;
3118 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3119 	if (!mpol_new)
3120 		goto err_out;
3121 	atomic_set(&mpol_new->refcnt, 1);
3122 	goto restart;
3123 }
3124 
3125 /**
3126  * mpol_shared_policy_init - initialize shared policy for inode
3127  * @sp: pointer to inode shared policy
3128  * @mpol:  struct mempolicy to install
3129  *
3130  * Install non-NULL @mpol in inode's shared policy rb-tree.
3131  * On entry, the current task has a reference on a non-NULL @mpol.
3132  * This must be released on exit.
3133  * This is called at get_inode() calls and we can use GFP_KERNEL.
3134  */
mpol_shared_policy_init(struct shared_policy * sp,struct mempolicy * mpol)3135 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
3136 {
3137 	int ret;
3138 
3139 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
3140 	rwlock_init(&sp->lock);
3141 
3142 	if (mpol) {
3143 		struct sp_node *sn;
3144 		struct mempolicy *npol;
3145 		NODEMASK_SCRATCH(scratch);
3146 
3147 		if (!scratch)
3148 			goto put_mpol;
3149 
3150 		/* contextualize the tmpfs mount point mempolicy to this file */
3151 		npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
3152 		if (IS_ERR(npol))
3153 			goto free_scratch; /* no valid nodemask intersection */
3154 
3155 		task_lock(current);
3156 		ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
3157 		task_unlock(current);
3158 		if (ret)
3159 			goto put_npol;
3160 
3161 		/* alloc node covering entire file; adds ref to file's npol */
3162 		sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
3163 		if (sn)
3164 			sp_insert(sp, sn);
3165 put_npol:
3166 		mpol_put(npol);	/* drop initial ref on file's npol */
3167 free_scratch:
3168 		NODEMASK_SCRATCH_FREE(scratch);
3169 put_mpol:
3170 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
3171 	}
3172 }
3173 
mpol_set_shared_policy(struct shared_policy * sp,struct vm_area_struct * vma,struct mempolicy * pol)3174 int mpol_set_shared_policy(struct shared_policy *sp,
3175 			struct vm_area_struct *vma, struct mempolicy *pol)
3176 {
3177 	int err;
3178 	struct sp_node *new = NULL;
3179 	unsigned long sz = vma_pages(vma);
3180 
3181 	if (pol) {
3182 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
3183 		if (!new)
3184 			return -ENOMEM;
3185 	}
3186 	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
3187 	if (err && new)
3188 		sp_free(new);
3189 	return err;
3190 }
3191 
3192 /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * sp)3193 void mpol_free_shared_policy(struct shared_policy *sp)
3194 {
3195 	struct sp_node *n;
3196 	struct rb_node *next;
3197 
3198 	if (!sp->root.rb_node)
3199 		return;
3200 	write_lock(&sp->lock);
3201 	next = rb_first(&sp->root);
3202 	while (next) {
3203 		n = rb_entry(next, struct sp_node, nd);
3204 		next = rb_next(&n->nd);
3205 		sp_delete(sp, n);
3206 	}
3207 	write_unlock(&sp->lock);
3208 }
3209 
3210 #ifdef CONFIG_NUMA_BALANCING
3211 static int __initdata numabalancing_override;
3212 
check_numabalancing_enable(void)3213 static void __init check_numabalancing_enable(void)
3214 {
3215 	bool numabalancing_default = false;
3216 
3217 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3218 		numabalancing_default = true;
3219 
3220 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3221 	if (numabalancing_override)
3222 		set_numabalancing_state(numabalancing_override == 1);
3223 
3224 	if (num_online_nodes() > 1 && !numabalancing_override) {
3225 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3226 			numabalancing_default ? "Enabling" : "Disabling");
3227 		set_numabalancing_state(numabalancing_default);
3228 	}
3229 }
3230 
setup_numabalancing(char * str)3231 static int __init setup_numabalancing(char *str)
3232 {
3233 	int ret = 0;
3234 	if (!str)
3235 		goto out;
3236 
3237 	if (!strcmp(str, "enable")) {
3238 		numabalancing_override = 1;
3239 		ret = 1;
3240 	} else if (!strcmp(str, "disable")) {
3241 		numabalancing_override = -1;
3242 		ret = 1;
3243 	}
3244 out:
3245 	if (!ret)
3246 		pr_warn("Unable to parse numa_balancing=\n");
3247 
3248 	return ret;
3249 }
3250 __setup("numa_balancing=", setup_numabalancing);
3251 #else
check_numabalancing_enable(void)3252 static inline void __init check_numabalancing_enable(void)
3253 {
3254 }
3255 #endif /* CONFIG_NUMA_BALANCING */
3256 
numa_policy_init(void)3257 void __init numa_policy_init(void)
3258 {
3259 	nodemask_t interleave_nodes;
3260 	unsigned long largest = 0;
3261 	int nid, prefer = 0;
3262 
3263 	policy_cache = kmem_cache_create("numa_policy",
3264 					 sizeof(struct mempolicy),
3265 					 0, SLAB_PANIC, NULL);
3266 
3267 	sn_cache = kmem_cache_create("shared_policy_node",
3268 				     sizeof(struct sp_node),
3269 				     0, SLAB_PANIC, NULL);
3270 
3271 	for_each_node(nid) {
3272 		preferred_node_policy[nid] = (struct mempolicy) {
3273 			.refcnt = ATOMIC_INIT(1),
3274 			.mode = MPOL_PREFERRED,
3275 			.flags = MPOL_F_MOF | MPOL_F_MORON,
3276 			.nodes = nodemask_of_node(nid),
3277 		};
3278 	}
3279 
3280 	/*
3281 	 * Set interleaving policy for system init. Interleaving is only
3282 	 * enabled across suitably sized nodes (default is >= 16MB), or
3283 	 * fall back to the largest node if they're all smaller.
3284 	 */
3285 	nodes_clear(interleave_nodes);
3286 	for_each_node_state(nid, N_MEMORY) {
3287 		unsigned long total_pages = node_present_pages(nid);
3288 
3289 		/* Preserve the largest node */
3290 		if (largest < total_pages) {
3291 			largest = total_pages;
3292 			prefer = nid;
3293 		}
3294 
3295 		/* Interleave this node? */
3296 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3297 			node_set(nid, interleave_nodes);
3298 	}
3299 
3300 	/* All too small, use the largest */
3301 	if (unlikely(nodes_empty(interleave_nodes)))
3302 		node_set(prefer, interleave_nodes);
3303 
3304 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3305 		pr_err("%s: interleaving failed\n", __func__);
3306 
3307 	check_numabalancing_enable();
3308 }
3309 
3310 /* Reset policy of current process to default */
numa_default_policy(void)3311 void numa_default_policy(void)
3312 {
3313 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
3314 }
3315 
3316 /*
3317  * Parse and format mempolicy from/to strings
3318  */
3319 static const char * const policy_modes[] =
3320 {
3321 	[MPOL_DEFAULT]    = "default",
3322 	[MPOL_PREFERRED]  = "prefer",
3323 	[MPOL_BIND]       = "bind",
3324 	[MPOL_INTERLEAVE] = "interleave",
3325 	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3326 	[MPOL_LOCAL]      = "local",
3327 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
3328 };
3329 
3330 #ifdef CONFIG_TMPFS
3331 /**
3332  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3333  * @str:  string containing mempolicy to parse
3334  * @mpol:  pointer to struct mempolicy pointer, returned on success.
3335  *
3336  * Format of input:
3337  *	<mode>[=<flags>][:<nodelist>]
3338  *
3339  * Return: %0 on success, else %1
3340  */
mpol_parse_str(char * str,struct mempolicy ** mpol)3341 int mpol_parse_str(char *str, struct mempolicy **mpol)
3342 {
3343 	struct mempolicy *new = NULL;
3344 	unsigned short mode_flags;
3345 	nodemask_t nodes;
3346 	char *nodelist = strchr(str, ':');
3347 	char *flags = strchr(str, '=');
3348 	int err = 1, mode;
3349 
3350 	if (flags)
3351 		*flags++ = '\0';	/* terminate mode string */
3352 
3353 	if (nodelist) {
3354 		/* NUL-terminate mode or flags string */
3355 		*nodelist++ = '\0';
3356 		if (nodelist_parse(nodelist, nodes))
3357 			goto out;
3358 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
3359 			goto out;
3360 	} else
3361 		nodes_clear(nodes);
3362 
3363 	mode = match_string(policy_modes, MPOL_MAX, str);
3364 	if (mode < 0)
3365 		goto out;
3366 
3367 	switch (mode) {
3368 	case MPOL_PREFERRED:
3369 		/*
3370 		 * Insist on a nodelist of one node only, although later
3371 		 * we use first_node(nodes) to grab a single node, so here
3372 		 * nodelist (or nodes) cannot be empty.
3373 		 */
3374 		if (nodelist) {
3375 			char *rest = nodelist;
3376 			while (isdigit(*rest))
3377 				rest++;
3378 			if (*rest)
3379 				goto out;
3380 			if (nodes_empty(nodes))
3381 				goto out;
3382 		}
3383 		break;
3384 	case MPOL_INTERLEAVE:
3385 	case MPOL_WEIGHTED_INTERLEAVE:
3386 		/*
3387 		 * Default to online nodes with memory if no nodelist
3388 		 */
3389 		if (!nodelist)
3390 			nodes = node_states[N_MEMORY];
3391 		break;
3392 	case MPOL_LOCAL:
3393 		/*
3394 		 * Don't allow a nodelist;  mpol_new() checks flags
3395 		 */
3396 		if (nodelist)
3397 			goto out;
3398 		break;
3399 	case MPOL_DEFAULT:
3400 		/*
3401 		 * Insist on a empty nodelist
3402 		 */
3403 		if (!nodelist)
3404 			err = 0;
3405 		goto out;
3406 	case MPOL_PREFERRED_MANY:
3407 	case MPOL_BIND:
3408 		/*
3409 		 * Insist on a nodelist
3410 		 */
3411 		if (!nodelist)
3412 			goto out;
3413 	}
3414 
3415 	mode_flags = 0;
3416 	if (flags) {
3417 		/*
3418 		 * Currently, we only support two mutually exclusive
3419 		 * mode flags.
3420 		 */
3421 		if (!strcmp(flags, "static"))
3422 			mode_flags |= MPOL_F_STATIC_NODES;
3423 		else if (!strcmp(flags, "relative"))
3424 			mode_flags |= MPOL_F_RELATIVE_NODES;
3425 		else
3426 			goto out;
3427 	}
3428 
3429 	new = mpol_new(mode, mode_flags, &nodes);
3430 	if (IS_ERR(new))
3431 		goto out;
3432 
3433 	/*
3434 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
3435 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3436 	 */
3437 	if (mode != MPOL_PREFERRED) {
3438 		new->nodes = nodes;
3439 	} else if (nodelist) {
3440 		nodes_clear(new->nodes);
3441 		node_set(first_node(nodes), new->nodes);
3442 	} else {
3443 		new->mode = MPOL_LOCAL;
3444 	}
3445 
3446 	/*
3447 	 * Save nodes for contextualization: this will be used to "clone"
3448 	 * the mempolicy in a specific context [cpuset] at a later time.
3449 	 */
3450 	new->w.user_nodemask = nodes;
3451 
3452 	err = 0;
3453 
3454 out:
3455 	/* Restore string for error message */
3456 	if (nodelist)
3457 		*--nodelist = ':';
3458 	if (flags)
3459 		*--flags = '=';
3460 	if (!err)
3461 		*mpol = new;
3462 	return err;
3463 }
3464 #endif /* CONFIG_TMPFS */
3465 
3466 /**
3467  * mpol_to_str - format a mempolicy structure for printing
3468  * @buffer:  to contain formatted mempolicy string
3469  * @maxlen:  length of @buffer
3470  * @pol:  pointer to mempolicy to be formatted
3471  *
3472  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3473  * Recommend a @maxlen of at least 51 for the longest mode, "weighted
3474  * interleave", plus the longest flag flags, "relative|balancing", and to
3475  * display at least a few node ids.
3476  */
mpol_to_str(char * buffer,int maxlen,struct mempolicy * pol)3477 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3478 {
3479 	char *p = buffer;
3480 	nodemask_t nodes = NODE_MASK_NONE;
3481 	unsigned short mode = MPOL_DEFAULT;
3482 	unsigned short flags = 0;
3483 
3484 	if (pol &&
3485 	    pol != &default_policy &&
3486 	    !(pol >= &preferred_node_policy[0] &&
3487 	      pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3488 		mode = pol->mode;
3489 		flags = pol->flags;
3490 	}
3491 
3492 	switch (mode) {
3493 	case MPOL_DEFAULT:
3494 	case MPOL_LOCAL:
3495 		break;
3496 	case MPOL_PREFERRED:
3497 	case MPOL_PREFERRED_MANY:
3498 	case MPOL_BIND:
3499 	case MPOL_INTERLEAVE:
3500 	case MPOL_WEIGHTED_INTERLEAVE:
3501 		nodes = pol->nodes;
3502 		break;
3503 	default:
3504 		WARN_ON_ONCE(1);
3505 		snprintf(p, maxlen, "unknown");
3506 		return;
3507 	}
3508 
3509 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3510 
3511 	if (flags & MPOL_MODE_FLAGS) {
3512 		p += snprintf(p, buffer + maxlen - p, "=");
3513 
3514 		/*
3515 		 * Static and relative are mutually exclusive.
3516 		 */
3517 		if (flags & MPOL_F_STATIC_NODES)
3518 			p += snprintf(p, buffer + maxlen - p, "static");
3519 		else if (flags & MPOL_F_RELATIVE_NODES)
3520 			p += snprintf(p, buffer + maxlen - p, "relative");
3521 
3522 		if (flags & MPOL_F_NUMA_BALANCING) {
3523 			if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3524 				p += snprintf(p, buffer + maxlen - p, "|");
3525 			p += snprintf(p, buffer + maxlen - p, "balancing");
3526 		}
3527 	}
3528 
3529 	if (!nodes_empty(nodes))
3530 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3531 			       nodemask_pr_args(&nodes));
3532 }
3533 
3534 #ifdef CONFIG_SYSFS
3535 struct iw_node_attr {
3536 	struct kobj_attribute kobj_attr;
3537 	int nid;
3538 };
3539 
3540 struct sysfs_wi_group {
3541 	struct kobject wi_kobj;
3542 	struct mutex kobj_lock;
3543 	struct iw_node_attr *nattrs[];
3544 };
3545 
3546 static struct sysfs_wi_group *wi_group;
3547 
node_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3548 static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3549 			 char *buf)
3550 {
3551 	struct iw_node_attr *node_attr;
3552 	u8 weight;
3553 
3554 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3555 	weight = get_il_weight(node_attr->nid);
3556 	return sysfs_emit(buf, "%d\n", weight);
3557 }
3558 
node_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3559 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3560 			  const char *buf, size_t count)
3561 {
3562 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3563 	struct iw_node_attr *node_attr;
3564 	u8 weight = 0;
3565 	int i;
3566 
3567 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3568 	if (count == 0 || sysfs_streq(buf, "") ||
3569 	    kstrtou8(buf, 0, &weight) || weight == 0)
3570 		return -EINVAL;
3571 
3572 	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3573 			       GFP_KERNEL);
3574 	if (!new_wi_state)
3575 		return -ENOMEM;
3576 
3577 	mutex_lock(&wi_state_lock);
3578 	old_wi_state = rcu_dereference_protected(wi_state,
3579 					lockdep_is_held(&wi_state_lock));
3580 	if (old_wi_state) {
3581 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3582 					nr_node_ids * sizeof(u8));
3583 	} else {
3584 		for (i = 0; i < nr_node_ids; i++)
3585 			new_wi_state->iw_table[i] = 1;
3586 	}
3587 	new_wi_state->iw_table[node_attr->nid] = weight;
3588 	new_wi_state->mode_auto = false;
3589 
3590 	rcu_assign_pointer(wi_state, new_wi_state);
3591 	mutex_unlock(&wi_state_lock);
3592 	if (old_wi_state) {
3593 		synchronize_rcu();
3594 		kfree(old_wi_state);
3595 	}
3596 	return count;
3597 }
3598 
weighted_interleave_auto_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3599 static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3600 		struct kobj_attribute *attr, char *buf)
3601 {
3602 	struct weighted_interleave_state *state;
3603 	bool wi_auto = true;
3604 
3605 	rcu_read_lock();
3606 	state = rcu_dereference(wi_state);
3607 	if (state)
3608 		wi_auto = state->mode_auto;
3609 	rcu_read_unlock();
3610 
3611 	return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3612 }
3613 
weighted_interleave_auto_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3614 static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3615 		struct kobj_attribute *attr, const char *buf, size_t count)
3616 {
3617 	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3618 	unsigned int *bw;
3619 	bool input;
3620 	int i;
3621 
3622 	if (kstrtobool(buf, &input))
3623 		return -EINVAL;
3624 
3625 	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3626 			       GFP_KERNEL);
3627 	if (!new_wi_state)
3628 		return -ENOMEM;
3629 	for (i = 0; i < nr_node_ids; i++)
3630 		new_wi_state->iw_table[i] = 1;
3631 
3632 	mutex_lock(&wi_state_lock);
3633 	if (!input) {
3634 		old_wi_state = rcu_dereference_protected(wi_state,
3635 					lockdep_is_held(&wi_state_lock));
3636 		if (!old_wi_state)
3637 			goto update_wi_state;
3638 		if (input == old_wi_state->mode_auto) {
3639 			mutex_unlock(&wi_state_lock);
3640 			return count;
3641 		}
3642 
3643 		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3644 					       nr_node_ids * sizeof(u8));
3645 		goto update_wi_state;
3646 	}
3647 
3648 	bw = node_bw_table;
3649 	if (!bw) {
3650 		mutex_unlock(&wi_state_lock);
3651 		kfree(new_wi_state);
3652 		return -ENODEV;
3653 	}
3654 
3655 	new_wi_state->mode_auto = true;
3656 	reduce_interleave_weights(bw, new_wi_state->iw_table);
3657 
3658 update_wi_state:
3659 	rcu_assign_pointer(wi_state, new_wi_state);
3660 	mutex_unlock(&wi_state_lock);
3661 	if (old_wi_state) {
3662 		synchronize_rcu();
3663 		kfree(old_wi_state);
3664 	}
3665 	return count;
3666 }
3667 
sysfs_wi_node_delete(int nid)3668 static void sysfs_wi_node_delete(int nid)
3669 {
3670 	struct iw_node_attr *attr;
3671 
3672 	if (nid < 0 || nid >= nr_node_ids)
3673 		return;
3674 
3675 	mutex_lock(&wi_group->kobj_lock);
3676 	attr = wi_group->nattrs[nid];
3677 	if (!attr) {
3678 		mutex_unlock(&wi_group->kobj_lock);
3679 		return;
3680 	}
3681 
3682 	wi_group->nattrs[nid] = NULL;
3683 	mutex_unlock(&wi_group->kobj_lock);
3684 
3685 	sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3686 	kfree(attr->kobj_attr.attr.name);
3687 	kfree(attr);
3688 }
3689 
sysfs_wi_node_delete_all(void)3690 static void sysfs_wi_node_delete_all(void)
3691 {
3692 	int nid;
3693 
3694 	for (nid = 0; nid < nr_node_ids; nid++)
3695 		sysfs_wi_node_delete(nid);
3696 }
3697 
wi_state_free(void)3698 static void wi_state_free(void)
3699 {
3700 	struct weighted_interleave_state *old_wi_state;
3701 
3702 	mutex_lock(&wi_state_lock);
3703 	old_wi_state = rcu_dereference_protected(wi_state,
3704 			lockdep_is_held(&wi_state_lock));
3705 	rcu_assign_pointer(wi_state, NULL);
3706 	mutex_unlock(&wi_state_lock);
3707 
3708 	if (old_wi_state) {
3709 		synchronize_rcu();
3710 		kfree(old_wi_state);
3711 	}
3712 }
3713 
3714 static struct kobj_attribute wi_auto_attr =
3715 	__ATTR(auto, 0664, weighted_interleave_auto_show,
3716 			   weighted_interleave_auto_store);
3717 
wi_cleanup(void)3718 static void wi_cleanup(void) {
3719 	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3720 	sysfs_wi_node_delete_all();
3721 	wi_state_free();
3722 }
3723 
wi_kobj_release(struct kobject * wi_kobj)3724 static void wi_kobj_release(struct kobject *wi_kobj)
3725 {
3726 	kfree(wi_group);
3727 }
3728 
3729 static const struct kobj_type wi_ktype = {
3730 	.sysfs_ops = &kobj_sysfs_ops,
3731 	.release = wi_kobj_release,
3732 };
3733 
sysfs_wi_node_add(int nid)3734 static int sysfs_wi_node_add(int nid)
3735 {
3736 	int ret;
3737 	char *name;
3738 	struct iw_node_attr *new_attr;
3739 
3740 	if (nid < 0 || nid >= nr_node_ids) {
3741 		pr_err("invalid node id: %d\n", nid);
3742 		return -EINVAL;
3743 	}
3744 
3745 	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
3746 	if (!new_attr)
3747 		return -ENOMEM;
3748 
3749 	name = kasprintf(GFP_KERNEL, "node%d", nid);
3750 	if (!name) {
3751 		kfree(new_attr);
3752 		return -ENOMEM;
3753 	}
3754 
3755 	sysfs_attr_init(&new_attr->kobj_attr.attr);
3756 	new_attr->kobj_attr.attr.name = name;
3757 	new_attr->kobj_attr.attr.mode = 0644;
3758 	new_attr->kobj_attr.show = node_show;
3759 	new_attr->kobj_attr.store = node_store;
3760 	new_attr->nid = nid;
3761 
3762 	mutex_lock(&wi_group->kobj_lock);
3763 	if (wi_group->nattrs[nid]) {
3764 		mutex_unlock(&wi_group->kobj_lock);
3765 		ret = -EEXIST;
3766 		goto out;
3767 	}
3768 
3769 	ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3770 	if (ret) {
3771 		mutex_unlock(&wi_group->kobj_lock);
3772 		goto out;
3773 	}
3774 	wi_group->nattrs[nid] = new_attr;
3775 	mutex_unlock(&wi_group->kobj_lock);
3776 	return 0;
3777 
3778 out:
3779 	kfree(new_attr->kobj_attr.attr.name);
3780 	kfree(new_attr);
3781 	return ret;
3782 }
3783 
wi_node_notifier(struct notifier_block * nb,unsigned long action,void * data)3784 static int wi_node_notifier(struct notifier_block *nb,
3785 			       unsigned long action, void *data)
3786 {
3787 	int err;
3788 	struct node_notify *nn = data;
3789 	int nid = nn->nid;
3790 
3791 	switch (action) {
3792 	case NODE_ADDED_FIRST_MEMORY:
3793 		err = sysfs_wi_node_add(nid);
3794 		if (err)
3795 			pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3796 			       nid, err);
3797 		break;
3798 	case NODE_REMOVED_LAST_MEMORY:
3799 		sysfs_wi_node_delete(nid);
3800 		break;
3801 	}
3802 
3803 	return NOTIFY_OK;
3804 }
3805 
add_weighted_interleave_group(struct kobject * mempolicy_kobj)3806 static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
3807 {
3808 	int nid, err;
3809 
3810 	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
3811 			   GFP_KERNEL);
3812 	if (!wi_group)
3813 		return -ENOMEM;
3814 	mutex_init(&wi_group->kobj_lock);
3815 
3816 	err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
3817 				   "weighted_interleave");
3818 	if (err)
3819 		goto err_put_kobj;
3820 
3821 	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3822 	if (err)
3823 		goto err_put_kobj;
3824 
3825 	for_each_online_node(nid) {
3826 		if (!node_state(nid, N_MEMORY))
3827 			continue;
3828 
3829 		err = sysfs_wi_node_add(nid);
3830 		if (err) {
3831 			pr_err("failed to add sysfs for node%d during init: %d\n",
3832 			       nid, err);
3833 			goto err_cleanup_kobj;
3834 		}
3835 	}
3836 
3837 	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
3838 	return 0;
3839 
3840 err_cleanup_kobj:
3841 	wi_cleanup();
3842 	kobject_del(&wi_group->wi_kobj);
3843 err_put_kobj:
3844 	kobject_put(&wi_group->wi_kobj);
3845 	return err;
3846 }
3847 
mempolicy_sysfs_init(void)3848 static int __init mempolicy_sysfs_init(void)
3849 {
3850 	int err;
3851 	static struct kobject *mempolicy_kobj;
3852 
3853 	mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3854 	if (!mempolicy_kobj)
3855 		return -ENOMEM;
3856 
3857 	err = add_weighted_interleave_group(mempolicy_kobj);
3858 	if (err)
3859 		goto err_kobj;
3860 
3861 	return 0;
3862 
3863 err_kobj:
3864 	kobject_del(mempolicy_kobj);
3865 	kobject_put(mempolicy_kobj);
3866 	return err;
3867 }
3868 
3869 late_initcall(mempolicy_sysfs_init);
3870 #endif /* CONFIG_SYSFS */
3871