xref: /linux/mm/util.c (revision 334fbe734e687404f346eba7d5d96ed2b44d35ab)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/mm.h>
3 #include <linux/slab.h>
4 #include <linux/string.h>
5 #include <linux/compiler.h>
6 #include <linux/export.h>
7 #include <linux/err.h>
8 #include <linux/sched.h>
9 #include <linux/sched/mm.h>
10 #include <linux/sched/signal.h>
11 #include <linux/sched/task_stack.h>
12 #include <linux/security.h>
13 #include <linux/swap.h>
14 #include <linux/swapops.h>
15 #include <linux/sysctl.h>
16 #include <linux/mman.h>
17 #include <linux/hugetlb.h>
18 #include <linux/vmalloc.h>
19 #include <linux/userfaultfd_k.h>
20 #include <linux/elf.h>
21 #include <linux/elf-randomize.h>
22 #include <linux/personality.h>
23 #include <linux/random.h>
24 #include <linux/processor.h>
25 #include <linux/sizes.h>
26 #include <linux/compat.h>
27 #include <linux/fsnotify.h>
28 #include <linux/page_idle.h>
29 
30 #include <linux/uaccess.h>
31 
32 #include <kunit/visibility.h>
33 
34 #include "internal.h"
35 #include "swap.h"
36 
/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Frees @x with kfree() unless it points into the kernel's .rodata
 * section (e.g. a string handed out by kstrdup_const()), in which
 * case it is left untouched.
 */
void kfree_const(const void *x)
{
	if (is_kernel_rodata((unsigned long)x))
		return;

	kfree(x);
}
EXPORT_SYMBOL(kfree_const);
49 
50 /**
51  * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
52  * @s: The data to copy
53  * @len: The size of the data, not including the NUL terminator
54  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
55  *
56  * Return: newly allocated copy of @s with NUL-termination or %NULL in
57  * case of error
58  */
__kmemdup_nul(const char * s,size_t len,gfp_t gfp)59 static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
60 {
61 	char *buf;
62 
63 	/* '+1' for the NUL terminator */
64 	buf = kmalloc_track_caller(len + 1, gfp);
65 	if (!buf)
66 		return NULL;
67 
68 	memcpy(buf, s, len);
69 	/* Ensure the buf is always NUL-terminated, regardless of @s. */
70 	buf[len] = '\0';
71 	return buf;
72 }
73 
74 /**
75  * kstrdup - allocate space for and copy an existing string
76  * @s: the string to duplicate
77  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
78  *
79  * Return: newly allocated copy of @s or %NULL in case of error
80  */
81 noinline
kstrdup(const char * s,gfp_t gfp)82 char *kstrdup(const char *s, gfp_t gfp)
83 {
84 	return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL;
85 }
86 EXPORT_SYMBOL(kstrdup);
87 
88 /**
89  * kstrdup_const - conditionally duplicate an existing const string
90  * @s: the string to duplicate
91  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
92  *
93  * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
94  * must not be passed to krealloc().
95  *
96  * Return: source string if it is in .rodata section otherwise
97  * fallback to kstrdup.
98  */
kstrdup_const(const char * s,gfp_t gfp)99 const char *kstrdup_const(const char *s, gfp_t gfp)
100 {
101 	if (is_kernel_rodata((unsigned long)s))
102 		return s;
103 
104 	return kstrdup(s, gfp);
105 }
106 EXPORT_SYMBOL(kstrdup_const);
107 
108 /**
109  * kstrndup - allocate space for and copy an existing string
110  * @s: the string to duplicate
111  * @max: read at most @max chars from @s
112  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
113  *
114  * Note: Use kmemdup_nul() instead if the size is known exactly.
115  *
116  * Return: newly allocated copy of @s or %NULL in case of error
117  */
kstrndup(const char * s,size_t max,gfp_t gfp)118 char *kstrndup(const char *s, size_t max, gfp_t gfp)
119 {
120 	return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL;
121 }
122 EXPORT_SYMBOL(kstrndup);
123 
124 /**
125  * kmemdup - duplicate region of memory
126  *
127  * @src: memory region to duplicate
128  * @len: memory region length
129  * @gfp: GFP mask to use
130  *
131  * Return: newly allocated copy of @src or %NULL in case of error,
132  * result is physically contiguous. Use kfree() to free.
133  */
kmemdup_noprof(const void * src,size_t len,gfp_t gfp)134 void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
135 {
136 	void *p;
137 
138 	p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
139 	if (p)
140 		memcpy(p, src, len);
141 	return p;
142 }
143 EXPORT_SYMBOL(kmemdup_noprof);
144 
145 /**
146  * kmemdup_array - duplicate a given array.
147  *
148  * @src: array to duplicate.
149  * @count: number of elements to duplicate from array.
150  * @element_size: size of each element of array.
151  * @gfp: GFP mask to use.
152  *
153  * Return: duplicated array of @src or %NULL in case of error,
154  * result is physically contiguous. Use kfree() to free.
155  */
kmemdup_array(const void * src,size_t count,size_t element_size,gfp_t gfp)156 void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
157 {
158 	return kmemdup(src, size_mul(element_size, count), gfp);
159 }
160 EXPORT_SYMBOL(kmemdup_array);
161 
162 /**
163  * kvmemdup - duplicate region of memory
164  *
165  * @src: memory region to duplicate
166  * @len: memory region length
167  * @gfp: GFP mask to use
168  *
169  * Return: newly allocated copy of @src or %NULL in case of error,
170  * result may be not physically contiguous. Use kvfree() to free.
171  */
kvmemdup(const void * src,size_t len,gfp_t gfp)172 void *kvmemdup(const void *src, size_t len, gfp_t gfp)
173 {
174 	void *p;
175 
176 	p = kvmalloc(len, gfp);
177 	if (p)
178 		memcpy(p, src, len);
179 	return p;
180 }
181 EXPORT_SYMBOL(kvmemdup);
182 
183 /**
184  * kmemdup_nul - Create a NUL-terminated string from unterminated data
185  * @s: The data to stringify
186  * @len: The size of the data
187  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
188  *
189  * Return: newly allocated copy of @s with NUL-termination or %NULL in
190  * case of error
191  */
kmemdup_nul(const char * s,size_t len,gfp_t gfp)192 char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
193 {
194 	return s ? __kmemdup_nul(s, len, gfp) : NULL;
195 }
196 EXPORT_SYMBOL(kmemdup_nul);
197 
/* Dedicated kmalloc bucket set for user-controlled-size duplications below. */
static kmem_buckets *user_buckets __ro_after_init;

/* Create the "memdup_user" buckets once during boot. */
static int __init init_user_buckets(void)
{
	user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);

	return 0;
}
subsys_initcall(init_user_buckets);
207 
208 /**
209  * memdup_user - duplicate memory region from user space
210  *
211  * @src: source address in user space
212  * @len: number of bytes to copy
213  *
214  * Return: an ERR_PTR() on failure.  Result is physically
215  * contiguous, to be freed by kfree().
216  */
memdup_user(const void __user * src,size_t len)217 void *memdup_user(const void __user *src, size_t len)
218 {
219 	void *p;
220 
221 	p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
222 	if (!p)
223 		return ERR_PTR(-ENOMEM);
224 
225 	if (copy_from_user(p, src, len)) {
226 		kfree(p);
227 		return ERR_PTR(-EFAULT);
228 	}
229 
230 	return p;
231 }
232 EXPORT_SYMBOL(memdup_user);
233 
234 /**
235  * vmemdup_user - duplicate memory region from user space
236  *
237  * @src: source address in user space
238  * @len: number of bytes to copy
239  *
240  * Return: an ERR_PTR() on failure.  Result may be not
241  * physically contiguous.  Use kvfree() to free.
242  */
vmemdup_user(const void __user * src,size_t len)243 void *vmemdup_user(const void __user *src, size_t len)
244 {
245 	void *p;
246 
247 	p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
248 	if (!p)
249 		return ERR_PTR(-ENOMEM);
250 
251 	if (copy_from_user(p, src, len)) {
252 		kvfree(p);
253 		return ERR_PTR(-EFAULT);
254 	}
255 
256 	return p;
257 }
258 EXPORT_SYMBOL(vmemdup_user);
259 
/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	/* strnlen_user() includes the terminating NUL; 0 signals a fault. */
	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	/* The string (including its NUL) did not fit within the @n limit. */
	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	/* Re-terminate: userspace may have rewritten the buffer meanwhile. */
	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);
290 
291 /**
292  * memdup_user_nul - duplicate memory region from user space and NUL-terminate
293  *
294  * @src: source address in user space
295  * @len: number of bytes to copy
296  *
297  * Return: an ERR_PTR() on failure.
298  */
memdup_user_nul(const void __user * src,size_t len)299 void *memdup_user_nul(const void __user *src, size_t len)
300 {
301 	char *p;
302 
303 	p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
304 	if (!p)
305 		return ERR_PTR(-ENOMEM);
306 
307 	if (copy_from_user(p, src, len)) {
308 		kfree(p);
309 		return ERR_PTR(-EFAULT);
310 	}
311 	p[len] = '\0';
312 
313 	return p;
314 }
315 EXPORT_SYMBOL(memdup_user_nul);
316 
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(const struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	/* True when the task's saved user stack pointer lies inside the VMA. */
	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
324 
/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	/*
	 * Take the new reference before dropping the old one, so the
	 * sequence is also correct when @file == vma->vm_file.
	 */
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);
336 
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

/*
 * Apply up to STACK_RND_MASK pages of randomness to the stack top when
 * the task has PF_RANDOMIZE set.  The offset is added for upward-growing
 * stacks and subtracted otherwise; the result is page aligned.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	return PAGE_ALIGN(stack_top) + random_variable;
#else
	return PAGE_ALIGN(stack_top) - random_variable;
#endif
}
356 
/**
 * randomize_page - Generate a random, page aligned address
 * @start:	The smallest acceptable address the caller will take.
 * @range:	The size of the area, starting at @start, within which the
 *		random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range).  On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	if (!PAGE_ALIGNED(start)) {
		/* Shrink the window by however much alignment consumed. */
		range -= PAGE_ALIGN(start) - start;
		start = PAGE_ALIGN(start);
	}

	/* Cap @range so start + range cannot wrap past ULONG_MAX. */
	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	/* Work in whole pages from here on. */
	range >>= PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << PAGE_SHIFT);
}
388 
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Default brk randomization: 32-bit tasks get a 32MB window above
 * mm->brk, 64-bit tasks a 1GB window.  Weak so architectures can
 * override it.
 */
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}
398 
/* Random, page-aligned byte offset applied to the mmap base. */
unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	/* Compat tasks draw from the (typically smaller) compat entropy pool. */
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	/* Convert the page count into a byte offset. */
	return rnd << PAGE_SHIFT;
}
412 
mmap_is_legacy(const struct rlimit * rlim_stack)413 static int mmap_is_legacy(const struct rlimit *rlim_stack)
414 {
415 	if (current->personality & ADDR_COMPAT_LAYOUT)
416 		return 1;
417 
418 	/* On parisc the stack always grows up - so a unlimited stack should
419 	 * not be an indicator to use the legacy memory layout. */
420 	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
421 		!IS_ENABLED(CONFIG_STACK_GROWSUP))
422 		return 1;
423 
424 	return sysctl_legacy_va_layout;
425 }
426 
/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP		(SZ_128M)
#define MAX_GAP		(STACK_TOP / 6 * 5)

/* Compute the top-down mmap base, below the stack gap minus @rnd. */
static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task. mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	/* Clamp the gap into [MIN_GAP, MAX_GAP] where the bounds make sense. */
	if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}
464 
/*
 * Choose between the legacy bottom-up and the default top-down mmap
 * layout for @mm, applying the per-task randomization factor to the base.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		mm_flags_clear(MMF_TOPDOWN, mm);
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		mm_flags_set(MMF_TOPDOWN, mm);
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/* Fallback for MMU architectures without their own layout selection. */
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif
490 
/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			const struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			/* The rlimit is in bytes; compare in pages. */
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		/* Decrementing below zero would indicate an accounting bug. */
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);
536 
537 /**
538  * account_locked_vm - account locked pages to an mm's locked_vm
539  * @mm:          mm to account against, may be NULL
540  * @pages:       number of pages to account
541  * @inc:         %true if @pages should be considered positive, %false if not
542  *
543  * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
544  *
545  * Return:
546  * * 0       on success, or if mm is NULL
547  * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
548  */
account_locked_vm(struct mm_struct * mm,unsigned long pages,bool inc)549 int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
550 {
551 	int ret;
552 
553 	if (pages == 0 || !mm)
554 		return 0;
555 
556 	mmap_write_lock(mm);
557 	ret = __account_locked_vm(mm, pages, inc, current,
558 				  capable(CAP_IPC_LOCK));
559 	mmap_write_unlock(mm);
560 
561 	return ret;
562 }
563 EXPORT_SYMBOL_GPL(account_locked_vm);
564 
/*
 * Map @file (or anonymous memory) into the current mm: checks security
 * and fsnotify permissions, performs do_mmap() under mmap_lock, then
 * completes userfaultfd unmap events and populates the range if needed.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret)
		ret = fsnotify_mmap_perm(file, prot, off, len);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		/* Must run after the lock is dropped. */
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}
590 
/*
 * Perform a userland memory mapping into the current process address space. See
 * the comment for do_mmap() for more details on this operation in general.
 *
 * This differs from do_mmap() in that:
 *
 * a. An offset parameter is provided rather than pgoff, which is both checked
 *    for overflow and page alignment.
 * b. mmap locking is performed on the caller's behalf.
 * c. Userfaultfd unmap events and memory population are handled.
 *
 * This means that this function performs essentially the same work as if
 * userland were invoking mmap (2).
 *
 * Returns either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	/* Reject mappings whose end would wrap around the address space. */
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	/* The file offset must be page aligned. */
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);
620 
#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
/*
 * Perform a userland memory mapping for a shadow stack into the current
 * process address space. This is intended to be used by architectures that
 * support user shadow stacks.
 */
unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len,
		unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long ret, unused;
	vm_flags_t vm_flags = VM_SHADOW_STACK;

	/* Shadow stacks are always anonymous and private. */
	flags |= MAP_ANONYMOUS | MAP_PRIVATE;
	/* A caller-supplied address is a hard request, but must not clobber. */
	if (addr)
		flags |= MAP_FIXED_NOREPLACE;

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		vm_flags |= VM_NOHUGEPAGE;

	mmap_write_lock(mm);
	ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags,
		      vm_flags, 0, &unused, NULL);
	mmap_write_unlock(mm);

	return ret;
}
#endif /* CONFIG_ARCH_HAS_USER_SHADOW_STACK */
649 
650 /**
651  * __vmalloc_array - allocate memory for a virtually contiguous array.
652  * @n: number of elements.
653  * @size: element size.
654  * @flags: the type of memory to allocate (see kmalloc).
655  */
__vmalloc_array_noprof(size_t n,size_t size,gfp_t flags)656 void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
657 {
658 	size_t bytes;
659 
660 	if (unlikely(check_mul_overflow(n, size, &bytes)))
661 		return NULL;
662 	return __vmalloc_noprof(bytes, flags);
663 }
664 EXPORT_SYMBOL(__vmalloc_array_noprof);
665 
/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Overflow-checked GFP_KERNEL convenience wrapper; use vfree() to free.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array_noprof);
676 
/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Like __vmalloc_array() but the memory is zeroed via __GFP_ZERO.
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
	return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc_noprof);
688 
/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Zeroing GFP_KERNEL convenience wrapper; use vfree() to free.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc_noprof);
699 
/*
 * Return the anon_vma encoded in @folio->mapping, or NULL if the mapping
 * is not flagged as exactly FOLIO_MAPPING_ANON.  Anonymous folios store
 * their anon_vma pointer in the mapping field with the low flag bits set.
 */
struct anon_vma *folio_anon_vma(const struct folio *folio)
{
	unsigned long mapping = (unsigned long)folio->mapping;

	if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
		return NULL;
	/* Strip the flag bit to recover the anon_vma pointer. */
	return (void *)(mapping - FOLIO_MAPPING_ANON);
}
708 
/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.
 */
struct address_space *folio_mapping(const struct folio *folio)
{
	struct address_space *mapping;

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(folio_test_slab(folio)))
		return NULL;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	/* Low flag bits set means this is not a struct address_space pointer. */
	if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
		return NULL;

	return mapping;
}
EXPORT_SYMBOL(folio_mapping);
739 
/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	/*
	 * Loop is shaped so cond_resched() only runs between pages, never
	 * after the last copy - an order-0 folio takes a single iteration
	 * with no resched point, keeping it safe in atomic context.
	 */
	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		cond_resched();
	}
}
EXPORT_SYMBOL(folio_copy);
763 
/*
 * Machine-check-aware variant of folio_copy(): copies @src to @dst page
 * by page, returning -EHWPOISON as soon as copy_mc_highpage() reports a
 * failure, or 0 on complete success.
 */
int folio_mc_copy(struct folio *dst, struct folio *src)
{
	long nr = folio_nr_pages(src);
	long i = 0;

	/* Same loop shape as folio_copy(): no resched after the final page. */
	for (;;) {
		if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
			return -EHWPOISON;
		if (++i == nr)
			break;
		cond_resched();
	}

	return 0;
}
EXPORT_SYMBOL(folio_mc_copy);
780 
/* Tunables backing the vm.* sysctls registered below. */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
787 
788 #ifdef CONFIG_SYSCTL
789 
/* vm.overcommit_ratio: writing it switches off the kbytes-based limit. */
static int overcommit_ratio_handler(const struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	int err = proc_dointvec(table, write, buffer, lenp, ppos);

	if (write && !err)
		sysctl_overcommit_kbytes = 0;
	return err;
}
800 
/* Per-CPU work item: fold this CPU's vm_committed_as delta into the total. */
static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}
805 
/* vm.overcommit_memory: policy switches need the careful sequence below. */
static int overcommit_policy_handler(const struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policy
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid possible race condition (even
	 * though user usually won't too frequently do the switching to policy
	 * OVERCOMMIT_NEVER), the switch is done in the following order:
	 *	1. changing the batch
	 *	2. sync percpu count on each CPU
	 *	3. switch the policy
	 */
	if (write) {
		/* Parse into a local so the live policy is untouched on error. */
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}
841 
/* vm.overcommit_kbytes: writing it switches off the ratio-based limit. */
static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

	if (write && !err)
		sysctl_overcommit_ratio = 0;
	return err;
}
852 
/* The vm.* sysctls implemented by this file. */
static const struct ctl_table util_sysctl_table[] = {
	{
		.procname	= "overcommit_memory",
		.data		= &sysctl_overcommit_memory,
		.maxlen		= sizeof(sysctl_overcommit_memory),
		.mode		= 0644,
		.proc_handler	= overcommit_policy_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "overcommit_ratio",
		.data		= &sysctl_overcommit_ratio,
		.maxlen		= sizeof(sysctl_overcommit_ratio),
		.mode		= 0644,
		.proc_handler	= overcommit_ratio_handler,
	},
	{
		.procname	= "overcommit_kbytes",
		.data		= &sysctl_overcommit_kbytes,
		.maxlen		= sizeof(sysctl_overcommit_kbytes),
		.mode		= 0644,
		.proc_handler	= overcommit_kbytes_handler,
	},
	{
		.procname	= "user_reserve_kbytes",
		.data		= &sysctl_user_reserve_kbytes,
		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "admin_reserve_kbytes",
		.data		= &sysctl_admin_reserve_kbytes,
		.maxlen		= sizeof(sysctl_admin_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
};
892 
/* Register the vm.* sysctl table above during boot. */
static int __init init_vm_util_sysctls(void)
{
	register_sysctl_init("vm", util_sysctl_table);
	return 0;
}
subsys_initcall(init_vm_util_sysctls);
899 #endif /* CONFIG_SYSCTL */
900 
/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	/* An explicit kbytes limit takes precedence over the ratio. */
	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	/* Swap space extends the commit limit. */
	allowed += total_swap_pages;

	return allowed;
}
917 
/*
 * Make sure vm_committed_as in one cacheline and not cacheline shared with
 * other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
923 
/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platform like a 2S/36C/72T Skylake server, in worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 *
 * Return: the current (non-negative) committed-memory sum in pages.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
942 
/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;
	unsigned long bytes_failed;

	/* Charge first; undone on the error path below. */
	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		/* Heuristic: refuse only absurdly large single requests. */
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	/* OVERCOMMIT_NEVER from here on. */
	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	bytes_failed = pages << PAGE_SHIFT;
	pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
			    __func__, current->pid, current->comm, bytes_failed);
	/* Undo the charge taken at function entry. */
	vm_unacct_memory(pages);

	return -ENOMEM;
}
1004 
1005 /**
1006  * get_cmdline() - copy the cmdline value to a buffer.
1007  * @task:     the task whose cmdline value to copy.
1008  * @buffer:   the buffer to copy to.
1009  * @buflen:   the length of the buffer. Larger cmdline values are truncated
1010  *            to this length.
1011  *
1012  * Return: the size of the cmdline field copied. Note that the copy does
1013  * not guarantee an ending NULL byte.
1014  */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;
	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	/* Snapshot the argv/envp boundaries under arg_lock. */
	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			/* A NUL exists within args: truncate there. */
			res = len;
		} else {
			/* No NUL in args: the title may run into the environment. */
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			/* Re-truncate at the first NUL of the combined copy. */
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}
1063 
memcmp_pages(struct page * page1,struct page * page2)1064 int __weak memcmp_pages(struct page *page1, struct page *page2)
1065 {
1066 	char *addr1, *addr2;
1067 	int ret;
1068 
1069 	addr1 = kmap_local_page(page1);
1070 	addr2 = kmap_local_page(page2);
1071 	ret = memcmp(addr1, addr2, PAGE_SIZE);
1072 	kunmap_local(addr2);
1073 	kunmap_local(addr1);
1074 	return ret;
1075 }
1076 
1077 #ifdef CONFIG_PRINTK
1078 /**
1079  * mem_dump_obj - Print available provenance information
1080  * @object: object for which to find provenance information.
1081  *
1082  * This function uses pr_cont(), so that the caller is expected to have
1083  * printed out whatever preamble is appropriate.  The provenance information
1084  * depends on the type of object and on how much debugging is enabled.
1085  * For example, for a slab-cache object, the slab name is printed, and,
1086  * if available, the return address and stack trace from the allocation
1087  * and last free path of that object.
1088  */
mem_dump_obj(void * object)1089 void mem_dump_obj(void *object)
1090 {
1091 	const char *type;
1092 
1093 	if (kmem_dump_obj(object))
1094 		return;
1095 
1096 	if (vmalloc_dump_obj(object))
1097 		return;
1098 
1099 	if (is_vmalloc_addr(object))
1100 		type = "vmalloc memory";
1101 	else if (virt_addr_valid(object))
1102 		type = "non-slab/vmalloc memory";
1103 	else if (object == NULL)
1104 		type = "NULL pointer";
1105 	else if (object == ZERO_SIZE_PTR)
1106 		type = "zero-size pointer";
1107 	else
1108 		type = "non-paged memory";
1109 
1110 	pr_cont(" %s\n", type);
1111 }
1112 EXPORT_SYMBOL_GPL(mem_dump_obj);
1113 #endif
1114 
1115 /*
1116  * A driver might set a page logically offline -- PageOffline() -- and
1117  * turn the page inaccessible in the hypervisor; after that, access to page
1118  * content can be fatal.
1119  *
1120  * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
1121  * pages after checking PageOffline(); however, these PFN walkers can race
1122  * with drivers that set PageOffline().
1123  *
1124  * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
1125  * synchronize with such drivers, achieving that a page cannot be set
1126  * PageOffline() while frozen.
1127  *
1128  * page_offline_begin()/page_offline_end() is used by drivers that care about
1129  * such races when setting a page PageOffline().
1130  */
1131 static DECLARE_RWSEM(page_offline_rwsem);
1132 
void page_offline_freeze(void)
{
	/* Reader side: while held, no driver can set a page PageOffline(). */
	down_read(&page_offline_rwsem);
}
1137 
void page_offline_thaw(void)
{
	/* Drop the read side taken by page_offline_freeze(). */
	up_read(&page_offline_rwsem);
}
1142 
void page_offline_begin(void)
{
	/* Writer side: drivers hold this while setting a page PageOffline(). */
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);
1148 
void page_offline_end(void)
{
	/* Drop the write side taken by page_offline_begin(). */
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
1154 
1155 #ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio)
{
	const long nr_pages = folio_nr_pages(folio);
	long idx;

	/* Generic fallback: flush every constituent page individually. */
	for (idx = 0; idx < nr_pages; idx++)
		flush_dcache_page(folio_page(folio, idx));
}
EXPORT_SYMBOL(flush_dcache_folio);
1164 #endif
1165 
1166 /**
1167  * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA.
1168  * @desc: A VMA descriptor whose fields need to be set.
1169  * @file: The file object describing the file being mmap()'d.
1170  * @vma: The VMA whose fields we wish to assign to @desc.
1171  *
1172  * This is a compatibility function to allow an mmap() hook to call
1173  * mmap_prepare() hooks when drivers nest these. This function specifically
1174  * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for
1175  * the purposes of doing this.
1176  *
1177  * Once the conversion of drivers is complete this function will no longer be
1178  * required and will be removed.
1179  */
compat_set_desc_from_vma(struct vm_area_desc * desc,const struct file * file,const struct vm_area_struct * vma)1180 void compat_set_desc_from_vma(struct vm_area_desc *desc,
1181 			      const struct file *file,
1182 			      const struct vm_area_struct *vma)
1183 {
1184 	memset(desc, 0, sizeof(*desc));
1185 
1186 	desc->mm = vma->vm_mm;
1187 	desc->file = (struct file *)file;
1188 	desc->start = vma->vm_start;
1189 	desc->end = vma->vm_end;
1190 
1191 	desc->pgoff = vma->vm_pgoff;
1192 	desc->vm_file = vma->vm_file;
1193 	desc->vma_flags = vma->flags;
1194 	desc->page_prot = vma->vm_page_prot;
1195 
1196 	/* Default. */
1197 	desc->action.type = MMAP_NOTHING;
1198 }
1199 EXPORT_SYMBOL(compat_set_desc_from_vma);
1200 
1201 /**
1202  * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows
1203  * flexibility as to how the mmap_prepare callback is invoked, which is useful
1204  * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook.
1205  * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been
1206  * executed.
1207  * @vma: The VMA to which @desc should be applied.
1208  *
1209  * The function assumes that you have obtained a VMA descriptor @desc from
1210  * compat_set_desc_from_vma(), and already executed the mmap_prepare() hook upon
1211  * it.
1212  *
1213  * It then performs any specified mmap actions, and invokes the vm_ops->mapped()
1214  * hook if one is present.
1215  *
1216  * See the description of compat_vma_mmap() for more details.
1217  *
1218  * Once the conversion of drivers is complete this function will no longer be
1219  * required and will be removed.
1220  *
1221  * Returns: 0 on success or error.
1222  */
__compat_vma_mmap(struct vm_area_desc * desc,struct vm_area_struct * vma)1223 int __compat_vma_mmap(struct vm_area_desc *desc,
1224 		      struct vm_area_struct *vma)
1225 {
1226 	int err;
1227 
1228 	/* Perform any preparatory tasks for mmap action. */
1229 	err = mmap_action_prepare(desc);
1230 	if (err)
1231 		return err;
1232 	/* Update the VMA from the descriptor. */
1233 	compat_set_vma_from_desc(vma, desc);
1234 	/* Complete any specified mmap actions. */
1235 	return mmap_action_complete(vma, &desc->action);
1236 }
1237 EXPORT_SYMBOL(__compat_vma_mmap);
1238 
1239 /**
1240  * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
1241  * existing VMA and execute any requested actions.
 * @file: The file which possesses an f_op->mmap_prepare() hook.
1243  * @vma: The VMA to apply the .mmap_prepare() hook to.
1244  *
1245  * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
1246  * stacked drivers invoke a nested mmap hook of an underlying file.
1247  *
1248  * Until all drivers are converted to use .mmap_prepare(), we must be
1249  * conservative and continue to invoke these stacked drivers using the
1250  * deprecated .mmap() hook.
1251  *
1252  * However we have a problem if the underlying file system possesses an
1253  * .mmap_prepare() hook, as we are in a different context when we invoke the
1254  * .mmap() hook, already having a VMA to deal with.
1255  *
1256  * compat_vma_mmap() is a compatibility function that takes VMA state,
1257  * establishes a struct vm_area_desc descriptor, passes to the underlying
1258  * .mmap_prepare() hook and applies any changes performed by it.
1259  *
1260  * Once the conversion of drivers is complete this function will no longer be
1261  * required and will be removed.
1262  *
1263  * Returns: 0 on success or error.
1264  */
compat_vma_mmap(struct file * file,struct vm_area_struct * vma)1265 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
1266 {
1267 	struct vm_area_desc desc;
1268 	struct mmap_action *action;
1269 	int err;
1270 
1271 	compat_set_desc_from_vma(&desc, file, vma);
1272 	err = vfs_mmap_prepare(file, &desc);
1273 	if (err)
1274 		return err;
1275 	action = &desc.action;
1276 
1277 	/* being invoked from .mmmap means we don't have to enforce this. */
1278 	action->hide_from_rmap_until_complete = false;
1279 
1280 	return __compat_vma_mmap(&desc, vma);
1281 }
1282 EXPORT_SYMBOL(compat_vma_mmap);
1283 
__vma_check_mmap_hook(struct vm_area_struct * vma)1284 int __vma_check_mmap_hook(struct vm_area_struct *vma)
1285 {
1286 	/* vm_ops->mapped is not valid if mmap() is specified. */
1287 	if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
1288 		return -EINVAL;
1289 
1290 	return 0;
1291 }
1292 EXPORT_SYMBOL(__vma_check_mmap_hook);
1293 
set_ps_flags(struct page_snapshot * ps,const struct folio * folio,const struct page * page)1294 static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
1295 			 const struct page *page)
1296 {
1297 	/*
1298 	 * Only the first page of a high-order buddy page has PageBuddy() set.
1299 	 * So we have to check manually whether this page is part of a high-
1300 	 * order buddy page.
1301 	 */
1302 	if (PageBuddy(page))
1303 		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
1304 	else if (page_count(page) == 0 && is_free_buddy_page(page))
1305 		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
1306 
1307 	if (folio_test_idle(folio))
1308 		ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
1309 }
1310 
1311 /**
1312  * snapshot_page() - Create a snapshot of a struct page
1313  * @ps: Pointer to a struct page_snapshot to store the page snapshot
1314  * @page: The page to snapshot
1315  *
1316  * Create a snapshot of the page and store both its struct page and struct
1317  * folio representations in @ps.
1318  *
1319  * A snapshot is marked as "faithful" if the compound state of @page was
1320  * stable and allowed safe reconstruction of the folio representation. In
1321  * rare cases where this is not possible (e.g. due to folio splitting),
1322  * snapshot_page() falls back to treating @page as a single page and the
1323  * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
1324  * helper can be used to check for this condition.
1325  */
void snapshot_page(struct page_snapshot *ps, const struct page *page)
{
	unsigned long info, nr_pages = 1;
	struct folio *foliop;
	int loops = 5;	/* bounded retries against concurrent folio changes */

	ps->pfn = page_to_pfn(page);
	ps->flags = PAGE_SNAPSHOT_FAITHFUL;

again:
	memset(&ps->folio_snapshot, 0, sizeof(struct folio));
	memcpy(&ps->page_snapshot, page, sizeof(*page));
	info = ps->page_snapshot.compound_info;
	if (!(info & 1)) {
		/* Bit 0 clear: @page is not encoded as a tail page. */
		ps->idx = 0;
		foliop = (struct folio *)&ps->page_snapshot;
		if (!folio_test_large(foliop)) {
			/* Order-0 page: the page snapshot alone is the folio. */
			set_ps_flags(ps, page_folio(page), page);
			memcpy(&ps->folio_snapshot, foliop,
			       sizeof(struct page));
			return;
		}
		/* Head page of a large folio: snapshot from the live folio. */
		foliop = (struct folio *)page;
	} else {
		/* See compound_head() */
		if (compound_info_has_mask()) {
			unsigned long p = (unsigned long)page;

			/* The stored info acts as a mask over the address. */
			foliop = (struct folio *)(p & info);
		} else {
			/* The stored info is the head pointer with bit 0 set. */
			foliop = (struct folio *)(info - 1);
		}

		ps->idx = folio_page_idx(foliop, page);
	}

	if (ps->idx < MAX_FOLIO_NR_PAGES) {
		/* Copy the first two struct pages, plus the third if large. */
		memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
		nr_pages = folio_nr_pages(&ps->folio_snapshot);
		if (nr_pages > 1)
			memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
			       sizeof(struct page));
		set_ps_flags(ps, foliop, page);
	}

	/*
	 * Sanity check: an out-of-range index means we raced with a change
	 * to the folio (e.g. a split) and the snapshot is inconsistent.
	 * NOTE(review): this accepts ps->idx == nr_pages; confirm whether
	 * the comparison should be ">=" instead.
	 */
	if (ps->idx > nr_pages) {
		if (loops-- > 0)
			goto again;
		/* Retries exhausted: fall back to a single-page, "unfaithful" snapshot. */
		clear_compound_head(&ps->page_snapshot);
		foliop = (struct folio *)&ps->page_snapshot;
		memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
		ps->flags = 0;
		ps->idx = 0;
	}
}
1381 
call_vma_mapped(struct vm_area_struct * vma)1382 static int call_vma_mapped(struct vm_area_struct *vma)
1383 {
1384 	const struct vm_operations_struct *vm_ops = vma->vm_ops;
1385 	void *vm_private_data = vma->vm_private_data;
1386 	int err;
1387 
1388 	if (!vm_ops || !vm_ops->mapped)
1389 		return 0;
1390 
1391 	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
1392 			     vma->vm_file, &vm_private_data);
1393 	if (err)
1394 		return err;
1395 
1396 	if (vm_private_data != vma->vm_private_data)
1397 		vma->vm_private_data = vm_private_data;
1398 	return 0;
1399 }
1400 
static int mmap_action_finish(struct vm_area_struct *vma,
			      struct mmap_action *action, int err)
{
	size_t len;

	/* Only run the post-map hooks if the action itself succeeded. */
	if (!err)
		err = call_vma_mapped(vma);
	if (!err && action->success_hook)
		err = action->success_hook(vma);

	/* do_munmap() might take rmap lock, so release if held. */
	maybe_rmap_unlock_action(vma, action);
	if (!err)
		return 0;

	/*
	 * If an error occurs, unmap the VMA altogether and return an error. We
	 * only clear the newly allocated VMA, since this function is only
	 * invoked if we do NOT merge, so we only clean up the VMA we created.
	 */
	len = vma_pages(vma) << PAGE_SHIFT;
	do_munmap(current->mm, vma->vm_start, len, NULL);
	if (action->error_hook) {
		/* We may want to filter the error. */
		err = action->error_hook(err);
		/* The caller should not clear the error. */
		VM_WARN_ON_ONCE(!err);
	}
	return err;
}
1431 
1432 #ifdef CONFIG_MMU
1433 /**
1434  * mmap_action_prepare - Perform preparatory setup for an VMA descriptor
1435  * action which need to be performed.
1436  * @desc: The VMA descriptor to prepare for its @desc->action.
1437  *
1438  * Returns: %0 on success, otherwise error.
1439  */
mmap_action_prepare(struct vm_area_desc * desc)1440 int mmap_action_prepare(struct vm_area_desc *desc)
1441 {
1442 	switch (desc->action.type) {
1443 	case MMAP_NOTHING:
1444 		return 0;
1445 	case MMAP_REMAP_PFN:
1446 		return remap_pfn_range_prepare(desc);
1447 	case MMAP_IO_REMAP_PFN:
1448 		return io_remap_pfn_range_prepare(desc);
1449 	case MMAP_SIMPLE_IO_REMAP:
1450 		return simple_ioremap_prepare(desc);
1451 	case MMAP_MAP_KERNEL_PAGES:
1452 		return map_kernel_pages_prepare(desc);
1453 	}
1454 
1455 	WARN_ON_ONCE(1);
1456 	return -EINVAL;
1457 }
1458 EXPORT_SYMBOL(mmap_action_prepare);
1459 
1460 /**
1461  * mmap_action_complete - Execute VMA descriptor action.
1462  * @vma: The VMA to perform the action upon.
1463  * @action: The action to perform.
1464  *
1465  * Similar to mmap_action_prepare().
1466  *
1467  * Return: 0 on success, or error, at which point the VMA will be unmapped.
1468  */
mmap_action_complete(struct vm_area_struct * vma,struct mmap_action * action)1469 int mmap_action_complete(struct vm_area_struct *vma,
1470 			 struct mmap_action *action)
1471 {
1472 	int err = 0;
1473 
1474 	switch (action->type) {
1475 	case MMAP_NOTHING:
1476 		break;
1477 	case MMAP_REMAP_PFN:
1478 		err = remap_pfn_range_complete(vma, action);
1479 		break;
1480 	case MMAP_MAP_KERNEL_PAGES:
1481 		err = map_kernel_pages_complete(vma, action);
1482 		break;
1483 	case MMAP_IO_REMAP_PFN:
1484 	case MMAP_SIMPLE_IO_REMAP:
1485 		/* Should have been delegated. */
1486 		WARN_ON_ONCE(1);
1487 		err = -EINVAL;
1488 		break;
1489 	}
1490 
1491 	return mmap_action_finish(vma, action, err);
1492 }
1493 EXPORT_SYMBOL(mmap_action_complete);
1494 #else
mmap_action_prepare(struct vm_area_desc * desc)1495 int mmap_action_prepare(struct vm_area_desc *desc)
1496 {
1497 	switch (desc->action.type) {
1498 	case MMAP_NOTHING:
1499 		break;
1500 	case MMAP_REMAP_PFN:
1501 	case MMAP_IO_REMAP_PFN:
1502 	case MMAP_SIMPLE_IO_REMAP:
1503 	case MMAP_MAP_KERNEL_PAGES:
1504 		WARN_ON_ONCE(1); /* nommu cannot handle these. */
1505 		break;
1506 	}
1507 
1508 	return 0;
1509 }
1510 EXPORT_SYMBOL(mmap_action_prepare);
1511 
mmap_action_complete(struct vm_area_struct * vma,struct mmap_action * action)1512 int mmap_action_complete(struct vm_area_struct *vma,
1513 			 struct mmap_action *action)
1514 {
1515 	int err = 0;
1516 
1517 	switch (action->type) {
1518 	case MMAP_NOTHING:
1519 		break;
1520 	case MMAP_REMAP_PFN:
1521 	case MMAP_IO_REMAP_PFN:
1522 	case MMAP_SIMPLE_IO_REMAP:
1523 	case MMAP_MAP_KERNEL_PAGES:
1524 		WARN_ON_ONCE(1); /* nommu cannot handle this. */
1525 
1526 		err = -EINVAL;
1527 		break;
1528 	}
1529 
1530 	return mmap_action_finish(vma, action, err);
1531 }
1532 EXPORT_SYMBOL(mmap_action_complete);
1533 #endif
1534 
1535 #ifdef CONFIG_MMU
1536 /**
1537  * folio_pte_batch - detect a PTE batch for a large folio
1538  * @folio: The large folio to detect a PTE batch for.
1539  * @ptep: Page table pointer for the first entry.
1540  * @pte: Page table entry for the first page.
1541  * @max_nr: The maximum number of table entries to consider.
1542  *
1543  * This is a simplified variant of folio_pte_batch_flags().
1544  *
1545  * Detect a PTE batch: consecutive (present) PTEs that map consecutive
1546  * pages of the same large folio in a single VMA and a single page table.
1547  *
1548  * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit and soft-dirty bit.
1550  *
1551  * ptep must map any page of the folio. max_nr must be at least one and
1552  * must be limited by the caller so scanning cannot exceed a single VMA and
1553  * a single page table.
1554  *
1555  * Return: the number of table entries in the batch.
1556  */
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
		unsigned int max_nr)
{
	/* Delegate to the flags-aware variant (NULL context, flags == 0). */
	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
}
1562 #endif /* CONFIG_MMU */
1563 
1564 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
1565 /**
1566  * page_range_contiguous - test whether the page range is contiguous
1567  * @page: the start of the page range.
1568  * @nr_pages: the number of pages in the range.
1569  *
1570  * Test whether the page range is contiguous, such that they can be iterated
1571  * naively, corresponding to iterating a contiguous PFN range.
1572  *
1573  * This function should primarily only be used for debug checks, or when
1574  * working with page ranges that are not naturally contiguous (e.g., pages
1575  * within a folio are).
1576  *
1577  * Returns true if contiguous, otherwise false.
1578  */
page_range_contiguous(const struct page * page,unsigned long nr_pages)1579 bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
1580 {
1581 	const unsigned long start_pfn = page_to_pfn(page);
1582 	const unsigned long end_pfn = start_pfn + nr_pages;
1583 	unsigned long pfn;
1584 
1585 	/*
1586 	 * The memmap is allocated per memory section, so no need to check
1587 	 * within the first section. However, we need to check each other
1588 	 * spanned memory section once, making sure the first page in a
1589 	 * section could similarly be reached by just iterating pages.
1590 	 */
1591 	for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
1592 	     pfn < end_pfn; pfn += PAGES_PER_SECTION)
1593 		if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
1594 			return false;
1595 	return true;
1596 }
1597 EXPORT_SYMBOL(page_range_contiguous);
1598 #endif
1599