1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * linux/mm/nommu.c
4 *
5 * Replacement code for mm functions to support CPUs that don't
6 * have any form of memory management unit (thus no virtual memory).
7 *
8 * See Documentation/admin-guide/mm/nommu-mmap.rst
9 *
10 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
11 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
12 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
13 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
14 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
15 */
16
17 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19 #include <linux/export.h>
20 #include <linux/mm.h>
21 #include <linux/sched/mm.h>
22 #include <linux/mman.h>
23 #include <linux/swap.h>
24 #include <linux/file.h>
25 #include <linux/highmem.h>
26 #include <linux/pagemap.h>
27 #include <linux/slab.h>
28 #include <linux/vmalloc.h>
29 #include <linux/backing-dev.h>
30 #include <linux/compiler.h>
31 #include <linux/mount.h>
32 #include <linux/personality.h>
33 #include <linux/security.h>
34 #include <linux/syscalls.h>
35 #include <linux/audit.h>
36 #include <linux/printk.h>
37
38 #include <linux/uaccess.h>
39 #include <linux/uio.h>
40 #include <asm/tlb.h>
41 #include <asm/tlbflush.h>
42 #include <asm/mmu_context.h>
43 #include "internal.h"
44
45 unsigned long highest_memmap_pfn;
46 int heap_stack_gap = 0;
47
48 atomic_long_t mmap_pages_allocated;
49
50
51 /* list of mapped, potentially shareable regions */
52 static struct kmem_cache *vm_region_jar;
53 struct rb_root nommu_region_tree = RB_ROOT;
54 DECLARE_RWSEM(nommu_region_sem);
55
56 const struct vm_operations_struct generic_file_vm_ops = {
57 };
58
59 /*
60 * Return the total memory allocated for this pointer, not
61 * just what the caller asked for.
62 *
63 * Doesn't have to be accurate, i.e. may have races.
64 */
65 unsigned int kobjsize(const void *objp)
66 {
67 struct page *page;
68
69 /*
70 * If the object we have should not have ksize performed on it,
71 * return size of 0
72 */
73 if (!objp || !virt_addr_valid(objp))
74 return 0;
75
76 page = virt_to_head_page(objp);
77
78 /*
79 * If the allocator sets PageSlab, we know the pointer came from
80 * kmalloc().
81 */
82 if (PageSlab(page))
83 return ksize(objp);
84
85 /*
86 * If it's not a compound page, see if we have a matching VMA
87 * region. This test is intentionally done in reverse order,
88 * so if there's no VMA, we still fall through and hand back
89 * PAGE_SIZE for 0-order pages.
90 */
91 if (!PageCompound(page)) {
92 struct vm_area_struct *vma;
93
94 vma = find_vma(current->mm, (unsigned long)objp);
95 if (vma)
96 return vma->vm_end - vma->vm_start;
97 }
98
99 /*
100 * The ksize() function is only guaranteed to work for pointers
101 * returned by kmalloc(). So handle arbitrary pointers here.
102 */
103 return page_size(page);
104 }
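/*
 * Illustrative example, not part of the original source: on a typical !MMU
 * build, kobjsize(kmalloc(100, GFP_KERNEL)) reports the slab bucket size
 * (e.g. 128, though the exact rounding depends on the slab configuration),
 * while a pointer into a three-page VM_MAPPED_COPY mapping in current->mm
 * reports the whole VMA size, 3 * PAGE_SIZE.
 */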
105
106 void vfree(const void *addr)
107 {
108 kfree(addr);
109 }
110 EXPORT_SYMBOL(vfree);
111
112 void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
113 {
114 /*
115 * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
116 * returns only a logical address.
117 */
118 return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
119 }
120 EXPORT_SYMBOL(__vmalloc_noprof);
121
122 void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
123 {
124 return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM);
125 }
126
127 void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
128 unsigned long start, unsigned long end, gfp_t gfp_mask,
129 pgprot_t prot, unsigned long vm_flags, int node,
130 const void *caller)
131 {
132 return __vmalloc_noprof(size, gfp_mask);
133 }
134
135 void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
136 int node, const void *caller)
137 {
138 return __vmalloc_noprof(size, gfp_mask);
139 }
140
141 static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
142 {
143 void *ret;
144
145 ret = __vmalloc(size, flags);
146 if (ret) {
147 struct vm_area_struct *vma;
148
149 mmap_write_lock(current->mm);
150 vma = find_vma(current->mm, (unsigned long)ret);
151 if (vma)
152 vm_flags_set(vma, VM_USERMAP);
153 mmap_write_unlock(current->mm);
154 }
155
156 return ret;
157 }
158
159 void *vmalloc_user_noprof(unsigned long size)
160 {
161 return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
162 }
163 EXPORT_SYMBOL(vmalloc_user_noprof);
164
165 struct page *vmalloc_to_page(const void *addr)
166 {
167 return virt_to_page(addr);
168 }
169 EXPORT_SYMBOL(vmalloc_to_page);
170
171 unsigned long vmalloc_to_pfn(const void *addr)
172 {
173 return page_to_pfn(virt_to_page(addr));
174 }
175 EXPORT_SYMBOL(vmalloc_to_pfn);
176
177 long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
178 {
179 /* Don't allow overflow */
180 if ((unsigned long) addr + count < count)
181 count = -(unsigned long) addr;
182
183 return copy_to_iter(addr, count, iter);
184 }
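/*
 * Illustrative arithmetic for the overflow clamp above (editorial sketch,
 * not part of the original code): on a 32-bit machine, addr = 0xfffff000
 * and count = 0x2000 make addr + count wrap to 0x1000, which is less than
 * count, so count is clamped to -(unsigned long)addr = 0x1000, exactly the
 * number of bytes remaining up to the top of the address space.
 */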
185
186 /*
187 * vmalloc - allocate virtually contiguous memory
188 *
189 * @size: allocation size
190 *
191 * Allocate enough pages to cover @size from the page level
192 * allocator and map them into contiguous kernel virtual space.
193 *
194 * For tight control over page level allocator and protection flags
195 * use __vmalloc() instead.
196 */
197 void *vmalloc_noprof(unsigned long size)
198 {
199 return __vmalloc_noprof(size, GFP_KERNEL);
200 }
201 EXPORT_SYMBOL(vmalloc_noprof);
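/*
 * A minimal usage sketch (illustrative, not from the original file): on
 * !MMU, vmalloc() above is just kmalloc()-backed, so the familiar pattern
 * still works, but the memory is physically contiguous and large requests
 * fail more readily. The buffer name is hypothetical.
 *
 *	void *buf = vmalloc(16 * PAGE_SIZE);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	vfree(buf);
 */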
202
203 void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
204
205 /*
206 * vzalloc - allocate virtually contiguous memory with zero fill
207 *
208 * @size: allocation size
209 *
210 * Allocate enough pages to cover @size from the page level
211 * allocator and map them into contiguous kernel virtual space.
212 * The memory allocated is set to zero.
213 *
214 * For tight control over page level allocator and protection flags
215 * use __vmalloc() instead.
216 */
217 void *vzalloc_noprof(unsigned long size)
218 {
219 return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO);
220 }
221 EXPORT_SYMBOL(vzalloc_noprof);
222
223 /**
224 * vmalloc_node - allocate memory on a specific node
225 * @size: allocation size
226 * @node: numa node
227 *
228 * Allocate enough pages to cover @size from the page level
229 * allocator and map them into contiguous kernel virtual space.
230 *
231 * For tight control over page level allocator and protection flags
232 * use __vmalloc() instead.
233 */
234 void *vmalloc_node_noprof(unsigned long size, int node)
235 {
236 return vmalloc_noprof(size);
237 }
238 EXPORT_SYMBOL(vmalloc_node_noprof);
239
240 /**
241 * vzalloc_node - allocate memory on a specific node with zero fill
242 * @size: allocation size
243 * @node: numa node
244 *
245 * Allocate enough pages to cover @size from the page level
246 * allocator and map them into contiguous kernel virtual space.
247 * The memory allocated is set to zero.
248 *
249 * For tight control over page level allocator and protection flags
250 * use __vmalloc() instead.
251 */
252 void *vzalloc_node_noprof(unsigned long size, int node)
253 {
254 return vzalloc_noprof(size);
255 }
256 EXPORT_SYMBOL(vzalloc_node_noprof);
257
258 /**
259 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
260 * @size: allocation size
261 *
262 * Allocate enough 32bit PA addressable pages to cover @size from the
263 * page level allocator and map them into contiguous kernel virtual space.
264 */
265 void *vmalloc_32_noprof(unsigned long size)
266 {
267 return __vmalloc_noprof(size, GFP_KERNEL);
268 }
269 EXPORT_SYMBOL(vmalloc_32_noprof);
270
271 /**
272 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
273 * @size: allocation size
274 *
275 * The resulting memory area is 32bit addressable and zeroed so it can be
276 * mapped to userspace without leaking data.
277 *
278 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
279 * remap_vmalloc_range() are permissible.
280 */
281 void *vmalloc_32_user_noprof(unsigned long size)
282 {
283 /*
284 * We'll have to sort out the ZONE_DMA bits for 64-bit,
285 * but for now this can simply use vmalloc_user() directly.
286 */
287 return vmalloc_user_noprof(size);
288 }
289 EXPORT_SYMBOL(vmalloc_32_user_noprof);
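/*
 * Illustrative sketch, not from the original source: a character-device
 * driver on !MMU might pair vmalloc_user() with remap_vmalloc_range()
 * (defined later in this file); the VM_USERMAP flag set by
 * __vmalloc_user_flags() is what makes that remap permissible. The names
 * dev_buf and my_dev_mmap are hypothetical.
 *
 *	static void *dev_buf;	// allocated with vmalloc_user() at probe time
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_vmalloc_range(vma, dev_buf, 0);
 *	}
 */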
290
291 void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
292 {
293 BUG();
294 return NULL;
295 }
296 EXPORT_SYMBOL(vmap);
297
298 void vunmap(const void *addr)
299 {
300 BUG();
301 }
302 EXPORT_SYMBOL(vunmap);
303
304 void *vm_map_ram(struct page **pages, unsigned int count, int node)
305 {
306 BUG();
307 return NULL;
308 }
309 EXPORT_SYMBOL(vm_map_ram);
310
311 void vm_unmap_ram(const void *mem, unsigned int count)
312 {
313 BUG();
314 }
315 EXPORT_SYMBOL(vm_unmap_ram);
316
317 void vm_unmap_aliases(void)
318 {
319 }
320 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
321
322 void free_vm_area(struct vm_struct *area)
323 {
324 BUG();
325 }
326 EXPORT_SYMBOL_GPL(free_vm_area);
327
328 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
329 struct page *page)
330 {
331 return -EINVAL;
332 }
333 EXPORT_SYMBOL(vm_insert_page);
334
335 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
336 struct page **pages, unsigned long *num)
337 {
338 return -EINVAL;
339 }
340 EXPORT_SYMBOL(vm_insert_pages);
341
342 int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
343 unsigned long num)
344 {
345 return -EINVAL;
346 }
347 EXPORT_SYMBOL(vm_map_pages);
348
349 int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
350 unsigned long num)
351 {
352 return -EINVAL;
353 }
354 EXPORT_SYMBOL(vm_map_pages_zero);
355
356 /*
357 * sys_brk() for the most part doesn't need the global kernel
358 * lock, except when an application is doing something nasty
359 * like trying to un-brk an area that has already been mapped
360 * to a regular file. In this case, the unmapping will need
361 * to invoke file system routines that need the global lock.
362 */
363 SYSCALL_DEFINE1(brk, unsigned long, brk)
364 {
365 struct mm_struct *mm = current->mm;
366
367 if (brk < mm->start_brk || brk > mm->context.end_brk)
368 return mm->brk;
369
370 if (mm->brk == brk)
371 return mm->brk;
372
373 /*
374 * Always allow shrinking brk
375 */
376 if (brk <= mm->brk) {
377 mm->brk = brk;
378 return brk;
379 }
380
381 /*
382 * Ok, looks good - let it rip.
383 */
384 flush_icache_user_range(mm->brk, brk);
385 return mm->brk = brk;
386 }
387
388 static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
389
390 static const struct ctl_table nommu_table[] = {
391 {
392 .procname = "nr_trim_pages",
393 .data = &sysctl_nr_trim_pages,
394 .maxlen = sizeof(sysctl_nr_trim_pages),
395 .mode = 0644,
396 .proc_handler = proc_dointvec_minmax,
397 .extra1 = SYSCTL_ZERO,
398 },
399 };
400
401 /*
402 * initialise the percpu counter for VM and region record slabs
403 */
404 void __init mmap_init(void)
405 {
406 int ret;
407
408 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
409 VM_BUG_ON(ret);
410 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
411 register_sysctl_init("vm", nommu_table);
412 }
413
414 /*
415 * validate the region tree
416 * - the caller must hold the region lock
417 */
418 #ifdef CONFIG_DEBUG_NOMMU_REGIONS
419 static noinline void validate_nommu_regions(void)
420 {
421 struct vm_region *region, *last;
422 struct rb_node *p, *lastp;
423
424 lastp = rb_first(&nommu_region_tree);
425 if (!lastp)
426 return;
427
428 last = rb_entry(lastp, struct vm_region, vm_rb);
429 BUG_ON(last->vm_end <= last->vm_start);
430 BUG_ON(last->vm_top < last->vm_end);
431
432 while ((p = rb_next(lastp))) {
433 region = rb_entry(p, struct vm_region, vm_rb);
434 last = rb_entry(lastp, struct vm_region, vm_rb);
435
436 BUG_ON(region->vm_end <= region->vm_start);
437 BUG_ON(region->vm_top < region->vm_end);
438 BUG_ON(region->vm_start < last->vm_top);
439
440 lastp = p;
441 }
442 }
443 #else
444 static void validate_nommu_regions(void)
445 {
446 }
447 #endif
448
449 /*
450 * add a region into the global tree
451 */
452 static void add_nommu_region(struct vm_region *region)
453 {
454 struct vm_region *pregion;
455 struct rb_node **p, *parent;
456
457 validate_nommu_regions();
458
459 parent = NULL;
460 p = &nommu_region_tree.rb_node;
461 while (*p) {
462 parent = *p;
463 pregion = rb_entry(parent, struct vm_region, vm_rb);
464 if (region->vm_start < pregion->vm_start)
465 p = &(*p)->rb_left;
466 else if (region->vm_start > pregion->vm_start)
467 p = &(*p)->rb_right;
468 else if (pregion == region)
469 return;
470 else
471 BUG();
472 }
473
474 rb_link_node(&region->vm_rb, parent, p);
475 rb_insert_color(&region->vm_rb, &nommu_region_tree);
476
477 validate_nommu_regions();
478 }
479
480 /*
481 * delete a region from the global tree
482 */
483 static void delete_nommu_region(struct vm_region *region)
484 {
485 BUG_ON(!nommu_region_tree.rb_node);
486
487 validate_nommu_regions();
488 rb_erase(&region->vm_rb, &nommu_region_tree);
489 validate_nommu_regions();
490 }
491
492 /*
493 * free a contiguous series of pages
494 */
495 static void free_page_series(unsigned long from, unsigned long to)
496 {
497 for (; from < to; from += PAGE_SIZE) {
498 struct page *page = virt_to_page((void *)from);
499
500 atomic_long_dec(&mmap_pages_allocated);
501 put_page(page);
502 }
503 }
504
505 /*
506 * release a reference to a region
507 * - the caller must hold the region semaphore for writing, which this releases
508 * - the region may not have been added to the tree yet, in which case vm_top
509 * will equal vm_start
510 */
511 static void __put_nommu_region(struct vm_region *region)
512 __releases(nommu_region_sem)
513 {
514 BUG_ON(!nommu_region_tree.rb_node);
515
516 if (--region->vm_usage == 0) {
517 if (region->vm_top > region->vm_start)
518 delete_nommu_region(region);
519 up_write(&nommu_region_sem);
520
521 if (region->vm_file)
522 fput(region->vm_file);
523
524 /* IO memory and memory shared directly out of the pagecache
525 * from ramfs/tmpfs mustn't be released here */
526 if (region->vm_flags & VM_MAPPED_COPY)
527 free_page_series(region->vm_start, region->vm_top);
528 kmem_cache_free(vm_region_jar, region);
529 } else {
530 up_write(&nommu_region_sem);
531 }
532 }
533
534 /*
535 * release a reference to a region
536 */
537 static void put_nommu_region(struct vm_region *region)
538 {
539 down_write(&nommu_region_sem);
540 __put_nommu_region(region);
541 }
542
543 static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
544 {
545 vma->vm_mm = mm;
546
547 /* add the VMA to the mapping */
548 if (vma->vm_file) {
549 struct address_space *mapping = vma->vm_file->f_mapping;
550
551 i_mmap_lock_write(mapping);
552 flush_dcache_mmap_lock(mapping);
553 vma_interval_tree_insert(vma, &mapping->i_mmap);
554 flush_dcache_mmap_unlock(mapping);
555 i_mmap_unlock_write(mapping);
556 }
557 }
558
559 static void cleanup_vma_from_mm(struct vm_area_struct *vma)
560 {
561 vma->vm_mm->map_count--;
562 /* remove the VMA from the mapping */
563 if (vma->vm_file) {
564 struct address_space *mapping;
565 mapping = vma->vm_file->f_mapping;
566
567 i_mmap_lock_write(mapping);
568 flush_dcache_mmap_lock(mapping);
569 vma_interval_tree_remove(vma, &mapping->i_mmap);
570 flush_dcache_mmap_unlock(mapping);
571 i_mmap_unlock_write(mapping);
572 }
573 }
574
575 /*
576 * delete a VMA from its owning mm_struct and address space
577 */
578 static int delete_vma_from_mm(struct vm_area_struct *vma)
579 {
580 VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
581
582 vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
583 if (vma_iter_prealloc(&vmi, NULL)) {
584 pr_warn("Allocation of vma tree for process %d failed\n",
585 current->pid);
586 return -ENOMEM;
587 }
588 cleanup_vma_from_mm(vma);
589
590 /* remove from the MM's tree and list */
591 vma_iter_clear(&vmi);
592 return 0;
593 }
594 /*
595 * destroy a VMA record
596 */
597 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
598 {
599 vma_close(vma);
600 if (vma->vm_file)
601 fput(vma->vm_file);
602 put_nommu_region(vma->vm_region);
603 vm_area_free(vma);
604 }
605
606 struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
607 unsigned long start_addr,
608 unsigned long end_addr)
609 {
610 unsigned long index = start_addr;
611
612 mmap_assert_locked(mm);
613 return mt_find(&mm->mm_mt, &index, end_addr - 1);
614 }
615 EXPORT_SYMBOL(find_vma_intersection);
616
617 /*
618 * look up the first VMA in which addr resides, NULL if none
619 * - should be called with mm->mmap_lock at least held readlocked
620 */
621 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
622 {
623 VMA_ITERATOR(vmi, mm, addr);
624
625 return vma_iter_load(&vmi);
626 }
627 EXPORT_SYMBOL(find_vma);
628
629 /*
630 * At least xtensa ends up having protection faults even with no
631 * MMU.. No stack expansion, at least.
632 */
633 struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
634 unsigned long addr, struct pt_regs *regs)
635 {
636 struct vm_area_struct *vma;
637
638 mmap_read_lock(mm);
639 vma = vma_lookup(mm, addr);
640 if (!vma)
641 mmap_read_unlock(mm);
642 return vma;
643 }
644
645 /*
646 * expand a stack to a given address
647 * - not supported under NOMMU conditions
648 */
649 int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
650 {
651 return -ENOMEM;
652 }
653
654 struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
655 {
656 mmap_read_unlock(mm);
657 return NULL;
658 }
659
660 /*
661 * look up the first VMA that exactly matches addr
662 * - should be called with mm->mmap_lock at least held readlocked
663 */
664 static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
665 unsigned long addr,
666 unsigned long len)
667 {
668 struct vm_area_struct *vma;
669 unsigned long end = addr + len;
670 VMA_ITERATOR(vmi, mm, addr);
671
672 vma = vma_iter_load(&vmi);
673 if (!vma)
674 return NULL;
675 if (vma->vm_start != addr)
676 return NULL;
677 if (vma->vm_end != end)
678 return NULL;
679
680 return vma;
681 }
682
683 /*
684 * determine whether a mapping should be permitted and, if so, what sort of
685 * mapping we're capable of supporting
686 */
687 static int validate_mmap_request(struct file *file,
688 unsigned long addr,
689 unsigned long len,
690 unsigned long prot,
691 unsigned long flags,
692 unsigned long pgoff,
693 unsigned long *_capabilities)
694 {
695 unsigned long capabilities, rlen;
696 int ret;
697
698 /* do the simple checks first */
699 if (flags & MAP_FIXED)
700 return -EINVAL;
701
702 if ((flags & MAP_TYPE) != MAP_PRIVATE &&
703 (flags & MAP_TYPE) != MAP_SHARED)
704 return -EINVAL;
705
706 if (!len)
707 return -EINVAL;
708
709 /* Careful about overflows.. */
710 rlen = PAGE_ALIGN(len);
711 if (!rlen || rlen > TASK_SIZE)
712 return -ENOMEM;
713
714 /* offset overflow? */
715 if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
716 return -EOVERFLOW;
717
718 if (file) {
719 /* files must support mmap */
720 if (!file->f_op->mmap)
721 return -ENODEV;
722
723 /* work out if what we've got could possibly be shared
724 * - we support chardevs that provide their own "memory"
725 * - we support files/blockdevs that are memory backed
726 */
727 if (file->f_op->mmap_capabilities) {
728 capabilities = file->f_op->mmap_capabilities(file);
729 } else {
730 /* no explicit capabilities set, so assume some
731 * defaults */
732 switch (file_inode(file)->i_mode & S_IFMT) {
733 case S_IFREG:
734 case S_IFBLK:
735 capabilities = NOMMU_MAP_COPY;
736 break;
737
738 case S_IFCHR:
739 capabilities =
740 NOMMU_MAP_DIRECT |
741 NOMMU_MAP_READ |
742 NOMMU_MAP_WRITE;
743 break;
744
745 default:
746 return -EINVAL;
747 }
748 }
749
750 /* eliminate any capabilities that we can't support on this
751 * device */
752 if (!file->f_op->get_unmapped_area)
753 capabilities &= ~NOMMU_MAP_DIRECT;
754 if (!(file->f_mode & FMODE_CAN_READ))
755 capabilities &= ~NOMMU_MAP_COPY;
756
757 /* The file shall have been opened with read permission. */
758 if (!(file->f_mode & FMODE_READ))
759 return -EACCES;
760
761 if (flags & MAP_SHARED) {
762 /* do checks for writing, appending and locking */
763 if ((prot & PROT_WRITE) &&
764 !(file->f_mode & FMODE_WRITE))
765 return -EACCES;
766
767 if (IS_APPEND(file_inode(file)) &&
768 (file->f_mode & FMODE_WRITE))
769 return -EACCES;
770
771 if (!(capabilities & NOMMU_MAP_DIRECT))
772 return -ENODEV;
773
774 /* we mustn't privatise shared mappings */
775 capabilities &= ~NOMMU_MAP_COPY;
776 } else {
777 /* we're going to read the file into private memory we
778 * allocate */
779 if (!(capabilities & NOMMU_MAP_COPY))
780 return -ENODEV;
781
782 /* we don't permit a private writable mapping to be
783 * shared with the backing device */
784 if (prot & PROT_WRITE)
785 capabilities &= ~NOMMU_MAP_DIRECT;
786 }
787
788 if (capabilities & NOMMU_MAP_DIRECT) {
789 if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) ||
790 ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
791 ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC))
792 ) {
793 capabilities &= ~NOMMU_MAP_DIRECT;
794 if (flags & MAP_SHARED) {
795 pr_warn("MAP_SHARED not completely supported on !MMU\n");
796 return -EINVAL;
797 }
798 }
799 }
800
801 /* handle executable mappings and implied executable
802 * mappings */
803 if (path_noexec(&file->f_path)) {
804 if (prot & PROT_EXEC)
805 return -EPERM;
806 } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
807 /* handle implication of PROT_EXEC by PROT_READ */
808 if (current->personality & READ_IMPLIES_EXEC) {
809 if (capabilities & NOMMU_MAP_EXEC)
810 prot |= PROT_EXEC;
811 }
812 } else if ((prot & PROT_READ) &&
813 (prot & PROT_EXEC) &&
814 !(capabilities & NOMMU_MAP_EXEC)
815 ) {
816 /* backing file is not executable, try to copy */
817 capabilities &= ~NOMMU_MAP_DIRECT;
818 }
819 } else {
820 /* anonymous mappings are always memory backed and can be
821 * privately mapped
822 */
823 capabilities = NOMMU_MAP_COPY;
824
825 /* handle PROT_EXEC implication by PROT_READ */
826 if ((prot & PROT_READ) &&
827 (current->personality & READ_IMPLIES_EXEC))
828 prot |= PROT_EXEC;
829 }
830
831 /* allow the security API to have its say */
832 ret = security_mmap_addr(addr);
833 if (ret < 0)
834 return ret;
835
836 /* looks okay */
837 *_capabilities = capabilities;
838 return 0;
839 }
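/*
 * Worked examples of the capability logic above (illustrative, not part of
 * the original code): a MAP_PRIVATE, PROT_READ mapping of a regular file
 * ends up with just NOMMU_MAP_COPY, so do_mmap() will copy the file data
 * into private memory; a MAP_SHARED mapping of a character device that
 * supplies both ->mmap and ->get_unmapped_area keeps NOMMU_MAP_DIRECT and
 * can be mapped in place; a MAP_SHARED mapping of a plain file whose
 * filesystem cannot hand out directly mappable memory is refused with
 * -ENODEV.
 */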
840
841 /*
842 * we've determined that we can make the mapping, now translate what we
843 * now know into VMA flags
844 */
845 static unsigned long determine_vm_flags(struct file *file,
846 unsigned long prot,
847 unsigned long flags,
848 unsigned long capabilities)
849 {
850 unsigned long vm_flags;
851
852 vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
853
854 if (!file) {
855 /*
856 * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because
857 * there is no fork().
858 */
859 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
860 } else if (flags & MAP_PRIVATE) {
861 /* MAP_PRIVATE file mapping */
862 if (capabilities & NOMMU_MAP_DIRECT)
863 vm_flags |= (capabilities & NOMMU_VMFLAGS);
864 else
865 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
866
867 if (!(prot & PROT_WRITE) && !current->ptrace)
868 /*
869 * R/O private file mapping which cannot be used to
870 * modify memory, especially also not via active ptrace
871 * (e.g., set breakpoints) or later by upgrading
872 * permissions (no mprotect()). We can try overlaying
873 * the file mapping, which will work e.g., on chardevs,
874 * ramfs/tmpfs/shmfs and romfs/cramfs.
875 */
876 vm_flags |= VM_MAYOVERLAY;
877 } else {
878 /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */
879 vm_flags |= VM_SHARED | VM_MAYSHARE |
880 (capabilities & NOMMU_VMFLAGS);
881 }
882
883 return vm_flags;
884 }
885
886 /*
887 * set up a shared mapping on a file (the driver or filesystem provides and
888 * pins the storage)
889 */
890 static int do_mmap_shared_file(struct vm_area_struct *vma)
891 {
892 int ret;
893
894 ret = mmap_file(vma->vm_file, vma);
895 if (ret == 0) {
896 vma->vm_region->vm_top = vma->vm_region->vm_end;
897 return 0;
898 }
899 if (ret != -ENOSYS)
900 return ret;
901
902 /* getting -ENOSYS indicates that direct mmap isn't possible (as
903 * opposed to tried but failed) so we can only give a suitable error as
904 * it's not possible to make a private copy if MAP_SHARED was given */
905 return -ENODEV;
906 }
907
908 /*
909 * set up a private mapping or an anonymous shared mapping
910 */
911 static int do_mmap_private(struct vm_area_struct *vma,
912 struct vm_region *region,
913 unsigned long len,
914 unsigned long capabilities)
915 {
916 unsigned long total, point;
917 void *base;
918 int ret, order;
919
920 /*
921 * Invoke the file's mapping function so that it can keep track of
922 * shared mappings on devices or memory. VM_MAYOVERLAY will be set if
923 * it may attempt to share, which will make is_nommu_shared_mapping()
924 * happy.
925 */
926 if (capabilities & NOMMU_MAP_DIRECT) {
927 ret = mmap_file(vma->vm_file, vma);
928 /* shouldn't return success if we're not sharing */
929 if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
930 ret = -ENOSYS;
931 if (ret == 0) {
932 vma->vm_region->vm_top = vma->vm_region->vm_end;
933 return 0;
934 }
935 if (ret != -ENOSYS)
936 return ret;
937
938 /* getting an ENOSYS error indicates that direct mmap isn't
939 * possible (as opposed to tried but failed) so we'll try to
940 * make a private copy of the data and map that instead */
941 }
942
943
944 /* allocate some memory to hold the mapping
945 * - note that this may not return a page-aligned address if the object
946 * we're allocating is smaller than a page
947 */
948 order = get_order(len);
949 total = 1 << order;
950 point = len >> PAGE_SHIFT;
951
952 /* we don't want to allocate a power-of-2 sized page set */
953 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
954 total = point;
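/*
 * Illustrative numbers, not part of the original code: for a three-page
 * request, get_order() gives 2, so total = 4 while point = 3; with the
 * typical default of sysctl_nr_trim_pages = 1, total - point >= 1 and the
 * request is trimmed back to exactly 3 pages before alloc_pages_exact()
 * below.
 */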
955
956 base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
957 if (!base)
958 goto enomem;
959
960 atomic_long_add(total, &mmap_pages_allocated);
961
962 vm_flags_set(vma, VM_MAPPED_COPY);
963 region->vm_flags = vma->vm_flags;
964 region->vm_start = (unsigned long) base;
965 region->vm_end = region->vm_start + len;
966 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
967
968 vma->vm_start = region->vm_start;
969 vma->vm_end = region->vm_start + len;
970
971 if (vma->vm_file) {
972 /* read the contents of a file into the copy */
973 loff_t fpos;
974
975 fpos = vma->vm_pgoff;
976 fpos <<= PAGE_SHIFT;
977
978 ret = kernel_read(vma->vm_file, base, len, &fpos);
979 if (ret < 0)
980 goto error_free;
981
982 /* clear the last little bit */
983 if (ret < len)
984 memset(base + ret, 0, len - ret);
985
986 } else {
987 vma_set_anonymous(vma);
988 }
989
990 return 0;
991
992 error_free:
993 free_page_series(region->vm_start, region->vm_top);
994 region->vm_start = vma->vm_start = 0;
995 region->vm_end = vma->vm_end = 0;
996 region->vm_top = 0;
997 return ret;
998
999 enomem:
1000 pr_err("Allocation of length %lu from process %d (%s) failed\n",
1001 len, current->pid, current->comm);
1002 show_mem();
1003 return -ENOMEM;
1004 }
1005
1006 /*
1007 * handle mapping creation for uClinux
1008 */
1009 unsigned long do_mmap(struct file *file,
1010 unsigned long addr,
1011 unsigned long len,
1012 unsigned long prot,
1013 unsigned long flags,
1014 vm_flags_t vm_flags,
1015 unsigned long pgoff,
1016 unsigned long *populate,
1017 struct list_head *uf)
1018 {
1019 struct vm_area_struct *vma;
1020 struct vm_region *region;
1021 struct rb_node *rb;
1022 unsigned long capabilities, result;
1023 int ret;
1024 VMA_ITERATOR(vmi, current->mm, 0);
1025
1026 *populate = 0;
1027
1028 /* decide whether we should attempt the mapping, and if so what sort of
1029 * mapping */
1030 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1031 &capabilities);
1032 if (ret < 0)
1033 return ret;
1034
1035 /* we ignore the address hint */
1036 addr = 0;
1037 len = PAGE_ALIGN(len);
1038
1039 /* we've determined that we can make the mapping, now translate what we
1040 * now know into VMA flags */
1041 vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
1042
1043
1044 /* we're going to need to record the mapping */
1045 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
1046 if (!region)
1047 goto error_getting_region;
1048
1049 vma = vm_area_alloc(current->mm);
1050 if (!vma)
1051 goto error_getting_vma;
1052
1053 region->vm_usage = 1;
1054 region->vm_flags = vm_flags;
1055 region->vm_pgoff = pgoff;
1056
1057 vm_flags_init(vma, vm_flags);
1058 vma->vm_pgoff = pgoff;
1059
1060 if (file) {
1061 region->vm_file = get_file(file);
1062 vma->vm_file = get_file(file);
1063 }
1064
1065 down_write(&nommu_region_sem);
1066
1067 /* if we want to share, we need to check for regions created by other
1068 * mmap() calls that overlap with our proposed mapping
1069 * - we can only share with a superset match on most regular files
1070 * - shared mappings on character devices and memory backed files are
1071 * permitted to overlap inexactly as far as we are concerned; in these
1072 * cases, sharing is handled in the driver or filesystem rather than
1073 * here
1074 */
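/*
 * Concrete illustration (editorial sketch, not from the original source):
 * if one process has a MAP_SHARED mapping of pages 0-3 of a file on a
 * memory-backed filesystem and a second mmap() asks for pages 1-2 of the
 * same file, the new request is a subset, so the existing vm_region is
 * reused and its vm_usage count is bumped. A request that only partially
 * overlapped (say pages 2-5) would not share: it is either placed
 * independently, if it can still be mapped directly, or refused as a
 * sharing violation.
 */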
1075 if (is_nommu_shared_mapping(vm_flags)) {
1076 struct vm_region *pregion;
1077 unsigned long pglen, rpglen, pgend, rpgend, start;
1078
1079 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1080 pgend = pgoff + pglen;
1081
1082 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
1083 pregion = rb_entry(rb, struct vm_region, vm_rb);
1084
1085 if (!is_nommu_shared_mapping(pregion->vm_flags))
1086 continue;
1087
1088 /* search for overlapping mappings on the same file */
1089 if (file_inode(pregion->vm_file) !=
1090 file_inode(file))
1091 continue;
1092
1093 if (pregion->vm_pgoff >= pgend)
1094 continue;
1095
1096 rpglen = pregion->vm_end - pregion->vm_start;
1097 rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1098 rpgend = pregion->vm_pgoff + rpglen;
1099 if (pgoff >= rpgend)
1100 continue;
1101
1102 /* handle inexactly overlapping matches between
1103 * mappings */
1104 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1105 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1106 /* new mapping is not a subset of the region */
1107 if (!(capabilities & NOMMU_MAP_DIRECT))
1108 goto sharing_violation;
1109 continue;
1110 }
1111
1112 /* we've found a region we can share */
1113 pregion->vm_usage++;
1114 vma->vm_region = pregion;
1115 start = pregion->vm_start;
1116 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
1117 vma->vm_start = start;
1118 vma->vm_end = start + len;
1119
1120 if (pregion->vm_flags & VM_MAPPED_COPY)
1121 vm_flags_set(vma, VM_MAPPED_COPY);
1122 else {
1123 ret = do_mmap_shared_file(vma);
1124 if (ret < 0) {
1125 vma->vm_region = NULL;
1126 vma->vm_start = 0;
1127 vma->vm_end = 0;
1128 pregion->vm_usage--;
1129 pregion = NULL;
1130 goto error_just_free;
1131 }
1132 }
1133 fput(region->vm_file);
1134 kmem_cache_free(vm_region_jar, region);
1135 region = pregion;
1136 result = start;
1137 goto share;
1138 }
1139
1140 /* obtain the address at which to make a shared mapping
1141 * - this is the hook for quasi-memory character devices to
1142 * tell us the location of a shared mapping
1143 */
1144 if (capabilities & NOMMU_MAP_DIRECT) {
1145 addr = file->f_op->get_unmapped_area(file, addr, len,
1146 pgoff, flags);
1147 if (IS_ERR_VALUE(addr)) {
1148 ret = addr;
1149 if (ret != -ENOSYS)
1150 goto error_just_free;
1151
1152 /* the driver refused to tell us where to site
1153 * the mapping so we'll have to attempt to copy
1154 * it */
1155 ret = -ENODEV;
1156 if (!(capabilities & NOMMU_MAP_COPY))
1157 goto error_just_free;
1158
1159 capabilities &= ~NOMMU_MAP_DIRECT;
1160 } else {
1161 vma->vm_start = region->vm_start = addr;
1162 vma->vm_end = region->vm_end = addr + len;
1163 }
1164 }
1165 }
1166
1167 vma->vm_region = region;
1168
1169 /* set up the mapping
1170 * - the region is filled in if NOMMU_MAP_DIRECT is still set
1171 */
1172 if (file && vma->vm_flags & VM_SHARED)
1173 ret = do_mmap_shared_file(vma);
1174 else
1175 ret = do_mmap_private(vma, region, len, capabilities);
1176 if (ret < 0)
1177 goto error_just_free;
1178 add_nommu_region(region);
1179
1180 /* clear anonymous mappings that don't ask for uninitialized data */
1181 if (!vma->vm_file &&
1182 (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
1183 !(flags & MAP_UNINITIALIZED)))
1184 memset((void *)region->vm_start, 0,
1185 region->vm_end - region->vm_start);
1186
1187 /* okay... we have a mapping; now we have to register it */
1188 result = vma->vm_start;
1189
1190 current->mm->total_vm += len >> PAGE_SHIFT;
1191
1192 share:
1193 BUG_ON(!vma->vm_region);
1194 vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
1195 if (vma_iter_prealloc(&vmi, vma))
1196 goto error_just_free;
1197
1198 setup_vma_to_mm(vma, current->mm);
1199 current->mm->map_count++;
1200 /* add the VMA to the tree */
1201 vma_iter_store_new(&vmi, vma);
1202
1203 /* we flush the region from the icache only when the first executable
1204 * mapping of it is made */
1205 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1206 flush_icache_user_range(region->vm_start, region->vm_end);
1207 region->vm_icache_flushed = true;
1208 }
1209
1210 up_write(&nommu_region_sem);
1211
1212 return result;
1213
1214 error_just_free:
1215 up_write(&nommu_region_sem);
1216 error:
1217 vma_iter_free(&vmi);
1218 if (region->vm_file)
1219 fput(region->vm_file);
1220 kmem_cache_free(vm_region_jar, region);
1221 if (vma->vm_file)
1222 fput(vma->vm_file);
1223 vm_area_free(vma);
1224 return ret;
1225
1226 sharing_violation:
1227 up_write(&nommu_region_sem);
1228 pr_warn("Attempt to share mismatched mappings\n");
1229 ret = -EINVAL;
1230 goto error;
1231
1232 error_getting_vma:
1233 kmem_cache_free(vm_region_jar, region);
1234 pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
1235 len, current->pid);
1236 show_mem();
1237 return -ENOMEM;
1238
1239 error_getting_region:
1240 pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
1241 len, current->pid);
1242 show_mem();
1243 return -ENOMEM;
1244 }
1245
1246 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
1247 unsigned long prot, unsigned long flags,
1248 unsigned long fd, unsigned long pgoff)
1249 {
1250 struct file *file = NULL;
1251 unsigned long retval = -EBADF;
1252
1253 audit_mmap_fd(fd, flags);
1254 if (!(flags & MAP_ANONYMOUS)) {
1255 file = fget(fd);
1256 if (!file)
1257 goto out;
1258 }
1259
1260 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1261
1262 if (file)
1263 fput(file);
1264 out:
1265 return retval;
1266 }
1267
1268 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1269 unsigned long, prot, unsigned long, flags,
1270 unsigned long, fd, unsigned long, pgoff)
1271 {
1272 return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
1273 }
1274
1275 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1276 struct mmap_arg_struct {
1277 unsigned long addr;
1278 unsigned long len;
1279 unsigned long prot;
1280 unsigned long flags;
1281 unsigned long fd;
1282 unsigned long offset;
1283 };
1284
1285 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1286 {
1287 struct mmap_arg_struct a;
1288
1289 if (copy_from_user(&a, arg, sizeof(a)))
1290 return -EFAULT;
1291 if (offset_in_page(a.offset))
1292 return -EINVAL;
1293
1294 return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1295 a.offset >> PAGE_SHIFT);
1296 }
1297 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1298
1299 /*
1300 * split a vma into two pieces at address 'addr', a new vma is allocated either
1301 * for the first part or the tail.
1302 */
1303 static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
1304 unsigned long addr, int new_below)
1305 {
1306 struct vm_area_struct *new;
1307 struct vm_region *region;
1308 unsigned long npages;
1309 struct mm_struct *mm;
1310
1311 /* we're only permitted to split anonymous regions (these should have
1312 * only a single usage on the region) */
1313 if (vma->vm_file)
1314 return -ENOMEM;
1315
1316 mm = vma->vm_mm;
1317 if (mm->map_count >= sysctl_max_map_count)
1318 return -ENOMEM;
1319
1320 region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1321 if (!region)
1322 return -ENOMEM;
1323
1324 new = vm_area_dup(vma);
1325 if (!new)
1326 goto err_vma_dup;
1327
1328 /* most fields are the same, copy all, and then fixup */
1329 *region = *vma->vm_region;
1330 new->vm_region = region;
1331
1332 npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1333
1334 if (new_below) {
1335 region->vm_top = region->vm_end = new->vm_end = addr;
1336 } else {
1337 region->vm_start = new->vm_start = addr;
1338 region->vm_pgoff = new->vm_pgoff += npages;
1339 }
1340
1341 vma_iter_config(vmi, new->vm_start, new->vm_end);
1342 if (vma_iter_prealloc(vmi, vma)) {
1343 pr_warn("Allocation of vma tree for process %d failed\n",
1344 current->pid);
1345 goto err_vmi_preallocate;
1346 }
1347
1348 if (new->vm_ops && new->vm_ops->open)
1349 new->vm_ops->open(new);
1350
1351 down_write(&nommu_region_sem);
1352 delete_nommu_region(vma->vm_region);
1353 if (new_below) {
1354 vma->vm_region->vm_start = vma->vm_start = addr;
1355 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1356 } else {
1357 vma->vm_region->vm_end = vma->vm_end = addr;
1358 vma->vm_region->vm_top = addr;
1359 }
1360 add_nommu_region(vma->vm_region);
1361 add_nommu_region(new->vm_region);
1362 up_write(&nommu_region_sem);
1363
1364 setup_vma_to_mm(vma, mm);
1365 setup_vma_to_mm(new, mm);
1366 vma_iter_store_new(vmi, new);
1367 mm->map_count++;
1368 return 0;
1369
1370 err_vmi_preallocate:
1371 vm_area_free(new);
1372 err_vma_dup:
1373 kmem_cache_free(vm_region_jar, region);
1374 return -ENOMEM;
1375 }
1376
1377 /*
1378 * shrink a VMA by removing the specified chunk from either the beginning or
1379 * the end
1380 */
1381 static int vmi_shrink_vma(struct vma_iterator *vmi,
1382 struct vm_area_struct *vma,
1383 unsigned long from, unsigned long to)
1384 {
1385 struct vm_region *region;
1386
1387 /* adjust the VMA's pointers, which may reposition it in the MM's tree
1388 * and list */
1389 if (from > vma->vm_start) {
1390 if (vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL))
1391 return -ENOMEM;
1392 vma->vm_end = from;
1393 } else {
1394 if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL))
1395 return -ENOMEM;
1396 vma->vm_start = to;
1397 }
1398
1399 /* cut the backing region down to size */
1400 region = vma->vm_region;
1401 BUG_ON(region->vm_usage != 1);
1402
1403 down_write(&nommu_region_sem);
1404 delete_nommu_region(region);
1405 if (from > region->vm_start) {
1406 to = region->vm_top;
1407 region->vm_top = region->vm_end = from;
1408 } else {
1409 region->vm_start = to;
1410 }
1411 add_nommu_region(region);
1412 up_write(&nommu_region_sem);
1413
1414 free_page_series(from, to);
1415 return 0;
1416 }
1417
1418 /*
1419 * release a mapping
1420 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1421 * VMA, though it need not cover the whole VMA
1422 */
1423 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
1424 {
1425 VMA_ITERATOR(vmi, mm, start);
1426 struct vm_area_struct *vma;
1427 unsigned long end;
1428 int ret = 0;
1429
1430 len = PAGE_ALIGN(len);
1431 if (len == 0)
1432 return -EINVAL;
1433
1434 end = start + len;
1435
1436 /* find the first potentially overlapping VMA */
1437 vma = vma_find(&vmi, end);
1438 if (!vma) {
1439 static int limit;
1440 if (limit < 5) {
1441 pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
1442 current->pid, current->comm,
1443 start, start + len - 1);
1444 limit++;
1445 }
1446 return -EINVAL;
1447 }
1448
1449 /* we're allowed to split an anonymous VMA but not a file-backed one */
1450 if (vma->vm_file) {
1451 do {
1452 if (start > vma->vm_start)
1453 return -EINVAL;
1454 if (end == vma->vm_end)
1455 goto erase_whole_vma;
1456 vma = vma_find(&vmi, end);
1457 } while (vma);
1458 return -EINVAL;
1459 } else {
1460 /* the chunk must be a subset of the VMA found */
1461 if (start == vma->vm_start && end == vma->vm_end)
1462 goto erase_whole_vma;
1463 if (start < vma->vm_start || end > vma->vm_end)
1464 return -EINVAL;
1465 if (offset_in_page(start))
1466 return -EINVAL;
1467 if (end != vma->vm_end && offset_in_page(end))
1468 return -EINVAL;
1469 if (start != vma->vm_start && end != vma->vm_end) {
1470 ret = split_vma(&vmi, vma, start, 1);
1471 if (ret < 0)
1472 return ret;
1473 }
1474 return vmi_shrink_vma(&vmi, vma, start, end);
1475 }
1476
1477 erase_whole_vma:
1478 if (delete_vma_from_mm(vma))
1479 ret = -ENOMEM;
1480 else
1481 delete_vma(mm, vma);
1482 return ret;
1483 }
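/*
 * Illustrative outcomes of the rules above, not part of the original code:
 * unmapping the middle page of a three-page anonymous private mapping first
 * split_vma()s at the start of the hole and then shrinks the remaining VMA,
 * whereas unmapping anything smaller than a whole VMA of a file-backed
 * mapping fails with -EINVAL.
 */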
1484
1485 int vm_munmap(unsigned long addr, size_t len)
1486 {
1487 struct mm_struct *mm = current->mm;
1488 int ret;
1489
1490 mmap_write_lock(mm);
1491 ret = do_munmap(mm, addr, len, NULL);
1492 mmap_write_unlock(mm);
1493 return ret;
1494 }
1495 EXPORT_SYMBOL(vm_munmap);
1496
1497 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1498 {
1499 return vm_munmap(addr, len);
1500 }
1501
1502 /*
1503 * release all the mappings made in a process's VM space
1504 */
1505 void exit_mmap(struct mm_struct *mm)
1506 {
1507 VMA_ITERATOR(vmi, mm, 0);
1508 struct vm_area_struct *vma;
1509
1510 if (!mm)
1511 return;
1512
1513 mm->total_vm = 0;
1514
1515 /*
1516 * Lock the mm to avoid assert complaining even though this is the only
1517 * user of the mm
1518 */
1519 mmap_write_lock(mm);
1520 for_each_vma(vmi, vma) {
1521 cleanup_vma_from_mm(vma);
1522 delete_vma(mm, vma);
1523 cond_resched();
1524 }
1525 __mt_destroy(&mm->mm_mt);
1526 mmap_write_unlock(mm);
1527 }
1528
1529 /*
1530 * expand (or shrink) an existing mapping, potentially moving it at the same
1531 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1532 *
1533 * under NOMMU conditions, we only permit changing a mapping's size, and only
1534 * as long as it stays within the region allocated by do_mmap_private() and the
1535 * block is not shareable
1536 *
1537 * MREMAP_FIXED is not supported under NOMMU conditions
1538 */
1539 static unsigned long do_mremap(unsigned long addr,
1540 unsigned long old_len, unsigned long new_len,
1541 unsigned long flags, unsigned long new_addr)
1542 {
1543 struct vm_area_struct *vma;
1544
1545 /* insanity checks first */
1546 old_len = PAGE_ALIGN(old_len);
1547 new_len = PAGE_ALIGN(new_len);
1548 if (old_len == 0 || new_len == 0)
1549 return (unsigned long) -EINVAL;
1550
1551 if (offset_in_page(addr))
1552 return -EINVAL;
1553
1554 if (flags & MREMAP_FIXED && new_addr != addr)
1555 return (unsigned long) -EINVAL;
1556
1557 vma = find_vma_exact(current->mm, addr, old_len);
1558 if (!vma)
1559 return (unsigned long) -EINVAL;
1560
1561 if (vma->vm_end != vma->vm_start + old_len)
1562 return (unsigned long) -EFAULT;
1563
1564 if (is_nommu_shared_mapping(vma->vm_flags))
1565 return (unsigned long) -EPERM;
1566
1567 if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1568 return (unsigned long) -ENOMEM;
1569
1570 /* all checks complete - do it */
1571 vma->vm_end = vma->vm_start + new_len;
1572 return vma->vm_start;
1573 }
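/*
 * Illustrative outcomes (editorial sketch, not part of the original code):
 * shrinking a private anonymous mapping in place succeeds and returns its
 * unchanged start address; asking for more than the backing region covers
 * returns -ENOMEM; MREMAP_FIXED with a different new_addr returns -EINVAL;
 * and any shareable mapping returns -EPERM.
 */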
1574
1575 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1576 unsigned long, new_len, unsigned long, flags,
1577 unsigned long, new_addr)
1578 {
1579 unsigned long ret;
1580
1581 mmap_write_lock(current->mm);
1582 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1583 mmap_write_unlock(current->mm);
1584 return ret;
1585 }
1586
1587 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1588 unsigned long pfn, unsigned long size, pgprot_t prot)
1589 {
1590 if (addr != (pfn << PAGE_SHIFT))
1591 return -EINVAL;
1592
1593 vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
1594 return 0;
1595 }
1596 EXPORT_SYMBOL(remap_pfn_range);
1597
1598 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1599 {
1600 unsigned long pfn = start >> PAGE_SHIFT;
1601 unsigned long vm_len = vma->vm_end - vma->vm_start;
1602
1603 pfn += vma->vm_pgoff;
1604 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1605 }
1606 EXPORT_SYMBOL(vm_iomap_memory);
1607
1608 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1609 unsigned long pgoff)
1610 {
1611 unsigned int size = vma->vm_end - vma->vm_start;
1612
1613 if (!(vma->vm_flags & VM_USERMAP))
1614 return -EINVAL;
1615
1616 vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1617 vma->vm_end = vma->vm_start + size;
1618
1619 return 0;
1620 }
1621 EXPORT_SYMBOL(remap_vmalloc_range);
1622
1623 vm_fault_t filemap_fault(struct vm_fault *vmf)
1624 {
1625 BUG();
1626 return 0;
1627 }
1628 EXPORT_SYMBOL(filemap_fault);
1629
1630 vm_fault_t filemap_map_pages(struct vm_fault *vmf,
1631 pgoff_t start_pgoff, pgoff_t end_pgoff)
1632 {
1633 BUG();
1634 return 0;
1635 }
1636 EXPORT_SYMBOL(filemap_map_pages);
1637
1638 static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
1639 void *buf, int len, unsigned int gup_flags)
1640 {
1641 struct vm_area_struct *vma;
1642 int write = gup_flags & FOLL_WRITE;
1643
1644 if (mmap_read_lock_killable(mm))
1645 return 0;
1646
1647 /* the access must start within one of the target process's mappings */
1648 vma = find_vma(mm, addr);
1649 if (vma) {
1650 /* don't overrun this mapping */
1651 if (addr + len >= vma->vm_end)
1652 len = vma->vm_end - addr;
1653
1654 /* only read or write mappings where it is permitted */
1655 if (write && vma->vm_flags & VM_MAYWRITE)
1656 copy_to_user_page(vma, NULL, addr,
1657 (void *) addr, buf, len);
1658 else if (!write && vma->vm_flags & VM_MAYREAD)
1659 copy_from_user_page(vma, NULL, addr,
1660 buf, (void *) addr, len);
1661 else
1662 len = 0;
1663 } else {
1664 len = 0;
1665 }
1666
1667 mmap_read_unlock(mm);
1668
1669 return len;
1670 }
1671
1672 /**
1673 * access_remote_vm - access another process' address space
1674 * @mm: the mm_struct of the target address space
1675 * @addr: start address to access
1676 * @buf: source or destination buffer
1677 * @len: number of bytes to transfer
1678 * @gup_flags: flags modifying lookup behaviour
1679 *
1680 * The caller must hold a reference on @mm.
1681 */
1682 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
1683 void *buf, int len, unsigned int gup_flags)
1684 {
1685 return __access_remote_vm(mm, addr, buf, len, gup_flags);
1686 }
1687
1688 /*
1689 * Access another process' address space.
1690 * - source/target buffer must be kernel space
1691 */
1692 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
1693 unsigned int gup_flags)
1694 {
1695 struct mm_struct *mm;
1696
1697 if (addr + len < addr)
1698 return 0;
1699
1700 mm = get_task_mm(tsk);
1701 if (!mm)
1702 return 0;
1703
1704 len = __access_remote_vm(mm, addr, buf, len, gup_flags);
1705
1706 mmput(mm);
1707 return len;
1708 }
1709 EXPORT_SYMBOL_GPL(access_process_vm);
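/*
 * Usage note (illustrative, not from the original source): this is the kind
 * of helper ptrace() relies on to peek and poke another task's memory; on
 * !MMU the "remote" address is directly dereferenceable, so the code above
 * only bounds-checks against the target VMA and copies via
 * copy_to_user_page()/copy_from_user_page(). The task and addr below are
 * hypothetical.
 *
 *	char val;
 *	if (access_process_vm(task, addr, &val, 1, FOLL_FORCE) != 1)
 *		return -EIO;
 */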
1710
1711 #ifdef CONFIG_BPF_SYSCALL
1712 /*
1713 * Copy a string from another process's address space as given in mm.
1714 * If there is any error return -EFAULT.
1715 */
1716 static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
1717 void *buf, int len)
1718 {
1719 unsigned long addr_end;
1720 struct vm_area_struct *vma;
1721 int ret = -EFAULT;
1722
1723 *(char *)buf = '\0';
1724
1725 if (mmap_read_lock_killable(mm))
1726 return ret;
1727
1728 /* the access must start within one of the target process's mappings */
1729 vma = find_vma(mm, addr);
1730 if (!vma)
1731 goto out;
1732
1733 if (check_add_overflow(addr, len, &addr_end))
1734 goto out;
1735
1736 /* don't overrun this mapping */
1737 if (addr_end > vma->vm_end)
1738 len = vma->vm_end - addr;
1739
1740 /* only read mappings where it is permitted */
1741 if (vma->vm_flags & VM_MAYREAD) {
1742 ret = strscpy(buf, (char *)addr, len);
1743 if (ret < 0)
1744 ret = len - 1;
1745 }
1746
1747 out:
1748 mmap_read_unlock(mm);
1749 return ret;
1750 }
1751
1752 /**
1753 * copy_remote_vm_str - copy a string from another process's address space.
1754 * @tsk: the task of the target address space
1755 * @addr: start address to read from
1756 * @buf: destination buffer
1757 * @len: number of bytes to copy
1758 * @gup_flags: flags modifying lookup behaviour (unused)
1759 *
1760 * The caller must hold a reference on @mm.
1761 *
1762 * Return: number of bytes copied from @addr (source) to @buf (destination);
1763 * not including the trailing NUL. Always guaranteed to leave NUL-terminated
1764 * buffer. On any error, return -EFAULT.
1765 */
1766 int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
1767 void *buf, int len, unsigned int gup_flags)
1768 {
1769 struct mm_struct *mm;
1770 int ret;
1771
1772 if (unlikely(len == 0))
1773 return 0;
1774
1775 mm = get_task_mm(tsk);
1776 if (!mm) {
1777 *(char *)buf = '\0';
1778 return -EFAULT;
1779 }
1780
1781 ret = __copy_remote_vm_str(mm, addr, buf, len);
1782
1783 mmput(mm);
1784
1785 return ret;
1786 }
1787 EXPORT_SYMBOL_GPL(copy_remote_vm_str);
1788 #endif /* CONFIG_BPF_SYSCALL */
1789
1790 /**
1791 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1792 * @inode: The inode to check
1793 * @size: The current filesize of the inode
1794 * @newsize: The proposed filesize of the inode
1795 *
1796 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1797 * make sure that any outstanding VMAs aren't broken and then shrink the
1798 * vm_regions that extend beyond so that do_mmap() doesn't
1799 * automatically grant mappings that are too large.
1800 */
1801 int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1802 size_t newsize)
1803 {
1804 struct vm_area_struct *vma;
1805 struct vm_region *region;
1806 pgoff_t low, high;
1807 size_t r_size, r_top;
1808
1809 low = newsize >> PAGE_SHIFT;
1810 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1811
1812 down_write(&nommu_region_sem);
1813 i_mmap_lock_read(inode->i_mapping);
1814
1815 /* search for VMAs that fall within the dead zone */
1816 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
1817 /* found one - only interested if it's shared out of the page
1818 * cache */
1819 if (vma->vm_flags & VM_SHARED) {
1820 i_mmap_unlock_read(inode->i_mapping);
1821 up_write(&nommu_region_sem);
1822 return -ETXTBSY; /* not quite true, but near enough */
1823 }
1824 }
1825
1826 /* reduce any regions that overlap the dead zone - if in existence,
1827 * these will be pointed to by VMAs that don't overlap the dead zone
1828 *
1829 * we don't check for any regions that start beyond the EOF as there
1830 * shouldn't be any
1831 */
1832 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
1833 if (!(vma->vm_flags & VM_SHARED))
1834 continue;
1835
1836 region = vma->vm_region;
1837 r_size = region->vm_top - region->vm_start;
1838 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1839
1840 if (r_top > newsize) {
1841 region->vm_top -= r_top - newsize;
1842 if (region->vm_end > region->vm_top)
1843 region->vm_end = region->vm_top;
1844 }
1845 }
1846
1847 i_mmap_unlock_read(inode->i_mapping);
1848 up_write(&nommu_region_sem);
1849 return 0;
1850 }
1851
1852 /*
1853 * Initialise sysctl_user_reserve_kbytes.
1854 *
1855 * This is intended to prevent a user from starting a single memory hogging
1856 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
1857 * mode.
1858 *
1859 * The default value is min(3% of free memory, 128MB)
1860 * 128MB is enough to recover with sshd/login, bash, and top/kill.
1861 */
1862 static int __meminit init_user_reserve(void)
1863 {
1864 unsigned long free_kbytes;
1865
1866 free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
1867
1868 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
1869 return 0;
1870 }
1871 subsys_initcall(init_user_reserve);
1872
1873 /*
1874 * Initialise sysctl_admin_reserve_kbytes.
1875 *
1876 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
1877 * to log in and kill a memory hogging process.
1878 *
1879 * Systems with more than 256MB will reserve 8MB, enough to recover
1880 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
1881 * only reserve 3% of free pages by default.
1882 */
1883 static int __meminit init_admin_reserve(void)
1884 {
1885 unsigned long free_kbytes;
1886
1887 free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
1888
1889 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
1890 return 0;
1891 }
1892 subsys_initcall(init_admin_reserve);
1893