1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Slab allocator functions that are independent of the allocator strategy
4 *
5 * (C) 2012 Christoph Lameter <cl@gentwo.org>
6 */
7 #include <linux/slab.h>
8
9 #include <linux/mm.h>
10 #include <linux/poison.h>
11 #include <linux/interrupt.h>
12 #include <linux/memory.h>
13 #include <linux/cache.h>
14 #include <linux/compiler.h>
15 #include <linux/kfence.h>
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/seq_file.h>
20 #include <linux/dma-mapping.h>
21 #include <linux/swiotlb.h>
22 #include <linux/proc_fs.h>
23 #include <linux/debugfs.h>
24 #include <linux/kmemleak.h>
25 #include <linux/kasan.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28 #include <asm/page.h>
29 #include <linux/memcontrol.h>
30 #include <linux/stackdepot.h>
31 #include <trace/events/rcu.h>
32
33 #include "../kernel/rcu/rcu.h"
34 #include "internal.h"
35 #include "slab.h"
36
37 #define CREATE_TRACE_POINTS
38 #include <trace/events/kmem.h>
39
40 enum slab_state slab_state;
41 LIST_HEAD(slab_caches);
42 DEFINE_MUTEX(slab_mutex);
43 struct kmem_cache *kmem_cache;
44
45 /*
46 * Set of flags that will prevent slab merging.
47 * Any flag that adds per-object metadata should be included,
48 * since slab merging can update s->inuse that affects the metadata layout.
49 */
50 #define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
51 SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
52 SLAB_OBJ_EXT_IN_OBJ)
53
54 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
55 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
56
57 /*
58 * Merge control. If this is set then no merging of slab caches will occur.
59 */
60 static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
61
/* "slab_nomerge"/"slub_nomerge" boot parameter: disable slab cache merging. */
static int __init setup_slab_nomerge(char *str)
{
	slab_nomerge = true;
	return 1;	/* 1 == parameter consumed */
}
67
/* "slab_merge"/"slub_merge" boot parameter: (re-)enable slab cache merging. */
static int __init setup_slab_merge(char *str)
{
	slab_nomerge = false;
	return 1;	/* 1 == parameter consumed */
}
73
74 __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
75 __setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
76
77 __setup("slab_nomerge", setup_slab_nomerge);
78 __setup("slab_merge", setup_slab_merge);
79
/*
 * Determine the size of a slab object
 */
unsigned int kmem_cache_size(struct kmem_cache *s)
{
	/* The size the cache was created with (s->object_size, not s->size). */
	return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);
88
89 #ifdef CONFIG_DEBUG_VM
90
kmem_cache_is_duplicate_name(const char * name)91 static bool kmem_cache_is_duplicate_name(const char *name)
92 {
93 struct kmem_cache *s;
94
95 list_for_each_entry(s, &slab_caches, list) {
96 if (!strcmp(s->name, name))
97 return true;
98 }
99
100 return false;
101 }
102
kmem_cache_sanity_check(const char * name,unsigned int size)103 static int kmem_cache_sanity_check(const char *name, unsigned int size)
104 {
105 if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
106 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
107 return -EINVAL;
108 }
109
110 /* Duplicate names will confuse slabtop, et al */
111 WARN(kmem_cache_is_duplicate_name(name),
112 "kmem_cache of name '%s' already exists\n", name);
113
114 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
115 return 0;
116 }
117 #else
/* !CONFIG_DEBUG_VM: skip the sanity checks entirely. */
static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	return 0;
}
122 #endif
123
124 /*
125 * Figure out what the alignment of the objects will be given a set of
126 * flags, a user specified alignment and the size of the objects.
127 */
calculate_alignment(slab_flags_t flags,unsigned int align,unsigned int size)128 static unsigned int calculate_alignment(slab_flags_t flags,
129 unsigned int align, unsigned int size)
130 {
131 /*
132 * If the user wants hardware cache aligned objects then follow that
133 * suggestion if the object is sufficiently large.
134 *
135 * The hardware cache alignment cannot override the specified
136 * alignment though. If that is greater then use it.
137 */
138 if (flags & SLAB_HWCACHE_ALIGN) {
139 unsigned int ralign;
140
141 ralign = cache_line_size();
142 while (size <= ralign / 2)
143 ralign /= 2;
144 align = max(align, ralign);
145 }
146
147 align = max(align, arch_slab_minalign());
148
149 return ALIGN(align, sizeof(void *));
150 }
151
152 /*
153 * Find a mergeable slab cache
154 */
slab_unmergeable(struct kmem_cache * s)155 int slab_unmergeable(struct kmem_cache *s)
156 {
157 if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
158 return 1;
159
160 if (s->ctor)
161 return 1;
162
163 #ifdef CONFIG_HARDENED_USERCOPY
164 if (s->usersize)
165 return 1;
166 #endif
167
168 /*
169 * We may have set a slab to be unmergeable during bootstrap.
170 */
171 if (s->refcount < 0)
172 return 1;
173
174 return 0;
175 }
176
slab_args_unmergeable(struct kmem_cache_args * args,slab_flags_t flags)177 bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags)
178 {
179 if (slab_nomerge)
180 return true;
181
182 if (args->ctor)
183 return true;
184
185 if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize)
186 return true;
187
188 if (flags & SLAB_NEVER_MERGE)
189 return true;
190
191 return false;
192 }
193
/*
 * Find a mergeable slab cache
 */
static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags,
		const char *name, struct kmem_cache_args *args)
{
	struct kmem_cache *s;
	unsigned int align;

	flags = kmem_cache_flags(flags, name);
	if (slab_args_unmergeable(args, flags))
		return NULL;

	/* Compute the effective (aligned) object size to match against. */
	size = ALIGN(size, sizeof(void *));
	align = calculate_alignment(flags, args->align, size);
	size = ALIGN(size, align);

	/* list_add() prepends, so walking in reverse visits oldest caches first. */
	list_for_each_entry_reverse(s, &slab_caches, list) {
		if (slab_unmergeable(s))
			continue;

		/* The candidate's slot must be able to hold our objects. */
		if (size > s->size)
			continue;

		/* Layout/accounting-relevant flags must match exactly. */
		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)
			continue;

		/* Don't accept more than a pointer's worth of wasted space. */
		if (s->size - size >= sizeof(void *))
			continue;

		return s;
	}
	return NULL;
}
231
/*
 * Allocate and initialize a new kmem_cache, registering it on slab_caches.
 * Returns the cache or an ERR_PTR(). Called under slab_mutex.
 */
static struct kmem_cache *create_cache(const char *name,
				       unsigned int object_size,
				       struct kmem_cache_args *args,
				       slab_flags_t flags)
{
	struct kmem_cache *s;
	int err;

	/* If a custom freelist pointer is requested make sure it's sane. */
	if (args->use_freeptr_offset) {
		bool bad = args->freeptr_offset >= object_size ||
			   (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) ||
			   !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t));

		if (bad)
			return ERR_PTR(-EINVAL);
	}

	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		return ERR_PTR(-ENOMEM);

	err = do_kmem_cache_create(s, name, object_size, args, flags);
	if (err) {
		kmem_cache_free(kmem_cache, s);
		return ERR_PTR(err);
	}

	s->refcount = 1;
	list_add(&s->list, &slab_caches);
	return s;
}
265
266 static struct kmem_cache *
__kmem_cache_alias(const char * name,unsigned int size,slab_flags_t flags,struct kmem_cache_args * args)267 __kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags,
268 struct kmem_cache_args *args)
269 {
270 struct kmem_cache *s;
271
272 s = find_mergeable(size, flags, name, args);
273 if (s) {
274 if (sysfs_slab_alias(s, name))
275 pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
276 name);
277
278 s->refcount++;
279
280 /*
281 * Adjust the object sizes so that we clear
282 * the complete object on kzalloc.
283 */
284 s->object_size = max(s->object_size, size);
285 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
286 }
287
288 return s;
289 }
290
291 /**
292 * __kmem_cache_create_args - Create a kmem cache.
293 * @name: A string which is used in /proc/slabinfo to identify this cache.
294 * @object_size: The size of objects to be created in this cache.
295 * @args: Additional arguments for the cache creation (see
296 * &struct kmem_cache_args).
297 * @flags: See the descriptions of individual flags. The common ones are listed
298 * in the description below.
299 *
300 * Not to be called directly, use the kmem_cache_create() wrapper with the same
301 * parameters.
302 *
303 * Commonly used @flags:
304 *
305 * &SLAB_ACCOUNT - Account allocations to memcg.
306 *
307 * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
308 *
309 * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
310 *
311 * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
312 * by a grace period - see the full description before using.
313 *
314 * Context: Cannot be called within a interrupt, but can be interrupted.
315 *
316 * Return: a pointer to the cache on success, NULL on failure.
317 */
__kmem_cache_create_args(const char * name,unsigned int object_size,struct kmem_cache_args * args,slab_flags_t flags)318 struct kmem_cache *__kmem_cache_create_args(const char *name,
319 unsigned int object_size,
320 struct kmem_cache_args *args,
321 slab_flags_t flags)
322 {
323 struct kmem_cache *s = NULL;
324 const char *cache_name;
325 int err;
326
327 #ifdef CONFIG_SLUB_DEBUG
328 /*
329 * If no slab_debug was enabled globally, the static key is not yet
330 * enabled by setup_slub_debug(). Enable it if the cache is being
331 * created with any of the debugging flags passed explicitly.
332 * It's also possible that this is the first cache created with
333 * SLAB_STORE_USER and we should init stack_depot for it.
334 */
335 if (flags & SLAB_DEBUG_FLAGS)
336 static_branch_enable(&slub_debug_enabled);
337 if (flags & SLAB_STORE_USER)
338 stack_depot_init();
339 #else
340 flags &= ~SLAB_DEBUG_FLAGS;
341 #endif
342
343 /*
344 * Caches with specific capacity are special enough. It's simpler to
345 * make them unmergeable.
346 */
347 if (args->sheaf_capacity)
348 flags |= SLAB_NO_MERGE;
349
350 mutex_lock(&slab_mutex);
351
352 err = kmem_cache_sanity_check(name, object_size);
353 if (err) {
354 goto out_unlock;
355 }
356
357 if (flags & ~SLAB_FLAGS_PERMITTED) {
358 err = -EINVAL;
359 goto out_unlock;
360 }
361
362 /* Fail closed on bad usersize of useroffset values. */
363 if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
364 WARN_ON(!args->usersize && args->useroffset) ||
365 WARN_ON(object_size < args->usersize ||
366 object_size - args->usersize < args->useroffset))
367 args->usersize = args->useroffset = 0;
368
369 s = __kmem_cache_alias(name, object_size, flags, args);
370 if (s)
371 goto out_unlock;
372
373 cache_name = kstrdup_const(name, GFP_KERNEL);
374 if (!cache_name) {
375 err = -ENOMEM;
376 goto out_unlock;
377 }
378
379 args->align = calculate_alignment(flags, args->align, object_size);
380 s = create_cache(cache_name, object_size, args, flags);
381 if (IS_ERR(s)) {
382 err = PTR_ERR(s);
383 kfree_const(cache_name);
384 }
385
386 out_unlock:
387 mutex_unlock(&slab_mutex);
388
389 if (err) {
390 if (flags & SLAB_PANIC)
391 panic("%s: Failed to create slab '%s'. Error %d\n",
392 __func__, name, err);
393 else {
394 pr_warn("%s(%s) failed with error %d\n",
395 __func__, name, err);
396 dump_stack();
397 }
398 return NULL;
399 }
400 return s;
401 }
402 EXPORT_SYMBOL(__kmem_cache_create_args);
403
404 static struct kmem_cache *kmem_buckets_cache __ro_after_init;
405
/**
 * kmem_buckets_create - Create a set of caches that handle dynamic sized
 *			 allocations via kmem_buckets_alloc()
 * @name: A prefix string which is used in /proc/slabinfo to identify this
 *	  cache. The individual caches will have their sizes as the suffix.
 * @flags: SLAB flags (see kmem_cache_create() for details).
 * @useroffset: Starting offset within an allocation that may be copied
 *		to/from userspace.
 * @usersize: How many bytes, starting at @useroffset, may be copied
 *		to/from userspace.
 * @ctor: A constructor for the objects, run when new allocations are made.
 *
 * Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure. When
 * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
 * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
 * (i.e. callers only need to check for NULL on failure.)
 */
kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
				  unsigned int useroffset,
				  unsigned int usersize,
				  void (*ctor)(void *))
{
	/* Bitmap of bucket indices created here, used for cleanup on failure. */
	unsigned long mask = 0;
	unsigned int idx;
	kmem_buckets *b;

	/* The single-word @mask above must be able to cover every bucket. */
	BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);

	/*
	 * When the separate buckets API is not built in, just return
	 * a non-NULL value for the kmem_buckets pointer, which will be
	 * unused when performing allocations.
	 */
	if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
		return ZERO_SIZE_PTR;

	if (WARN_ON(!kmem_buckets_cache))
		return NULL;

	b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
	if (WARN_ON(!b))
		return NULL;

	/* Bucket caches must stay distinct from the shared kmalloc caches. */
	flags |= SLAB_NO_MERGE;

	/* Mirror the KMALLOC_NORMAL size ladder with per-site caches. */
	for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
		char *short_size, *cache_name;
		unsigned int cache_useroffset, cache_usersize;
		unsigned int size, aligned_idx;

		if (!kmalloc_caches[KMALLOC_NORMAL][idx])
			continue;

		size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
		if (!size)
			continue;

		/* Reuse the "-<size>" suffix of the matching kmalloc cache name. */
		short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
		if (WARN_ON(!short_size))
			goto fail;

		/* Clamp the usercopy window to this bucket's object size. */
		if (useroffset >= size) {
			cache_useroffset = 0;
			cache_usersize = 0;
		} else {
			cache_useroffset = useroffset;
			cache_usersize = min(size - cache_useroffset, usersize);
		}

		aligned_idx = __kmalloc_index(size, false);
		if (!(*b)[aligned_idx]) {
			cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
			if (WARN_ON(!cache_name))
				goto fail;
			(*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
					0, flags, cache_useroffset,
					cache_usersize, ctor);
			kfree(cache_name);
			if (WARN_ON(!(*b)[aligned_idx]))
				goto fail;
			set_bit(aligned_idx, &mask);
		}
		/* Indices sharing a size class point at the same cache. */
		if (idx != aligned_idx)
			(*b)[idx] = (*b)[aligned_idx];
	}

	return b;

fail:
	/* Destroy only the caches this call actually created (per @mask). */
	for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
		kmem_cache_destroy((*b)[idx]);
	kmem_cache_free(kmem_buckets_cache, b);

	return NULL;
}
EXPORT_SYMBOL(kmem_buckets_create);
504
/*
 * For a given kmem_cache, kmem_cache_destroy() should only be called
 * once or there will be a use-after-free problem. The actual deletion
 * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
 * protection. So they are now done without holding those locks.
 */
static void kmem_cache_release(struct kmem_cache *s)
{
	kfence_shutdown_cache(s);
	/* Once sysfs is fully up, releasing goes through the sysfs path. */
	if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
		sysfs_slab_release(s);
	else
		slab_kmem_cache_release(s);
}
519
/* Final teardown: allocator-private state, the name, then the struct itself. */
void slab_kmem_cache_release(struct kmem_cache *s)
{
	__kmem_cache_release(s);
	kfree_const(s->name);	/* name came from kstrdup_const() */
	kmem_cache_free(kmem_cache, s);
}
526
/*
 * Destroy @s once no users remain. NULL is tolerated;
 * kasan_check_byte() catches use of an already-freed cache.
 */
void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	if (unlikely(!s) || !kasan_check_byte(s))
		return;

	/* in-flight kfree_rcu()'s may include objects from our cache */
	kvfree_rcu_barrier_on_cache(s);

	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
		/*
		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
		 * defer their freeing with call_rcu().
		 * Wait for such call_rcu() invocations here before actually
		 * destroying the cache.
		 *
		 * It doesn't matter that we haven't looked at the slab refcount
		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
		 * the refcount should be 1 here.
		 */
		rcu_barrier();
	}

	/* Wait for deferred work from kmalloc/kfree_nolock() */
	defer_free_barrier();

	cpus_read_lock();
	mutex_lock(&slab_mutex);

	/* Merged (aliased) caches are shared; only the last user destroys. */
	s->refcount--;
	if (s->refcount) {
		mutex_unlock(&slab_mutex);
		cpus_read_unlock();
		return;
	}

	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	err = __kmem_cache_shutdown(s);
	if (!slab_in_kunit_test())
		WARN(err, "%s %s: Slab cache still has objects when called from %pS",
		     __func__, s->name, (void *)_RET_IP_);

	list_del(&s->list);

	mutex_unlock(&slab_mutex);
	cpus_read_unlock();

	/* sysfs/debugfs teardown needs neither slab_mutex nor cpu hotplug lock. */
	if (slab_state >= FULL)
		sysfs_slab_unlink(s);
	debugfs_slab_release(s);

	/* Shutdown failed (objects remain): leak the cache rather than free it. */
	if (err)
		return;

	/* Let pending grace periods elapse before freeing the slab pages. */
	if (s->flags & SLAB_TYPESAFE_BY_RCU)
		rcu_barrier();

	kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
592
/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 *
 * Return: %0 if all slabs were released, non-zero otherwise
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
	/* Release KASAN-quarantined objects first so their slabs can empty. */
	kasan_cache_shrink(cachep);

	return __kmem_cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
609
/* True once the allocator is usable (kmalloc array set up; see slab_state). */
bool slab_is_available(void)
{
	return slab_state >= UP;
}
614
615 #ifdef CONFIG_PRINTK
/*
 * Fill @kpp with provenance info for @object: KFENCE gets first claim,
 * otherwise fall back to the slab allocator's own lookup.
 */
static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	if (__kfence_obj_info(kpp, object, slab))
		return;
	__kmem_obj_info(kpp, object, slab);
}
622
/**
 * kmem_dump_obj - Print available slab provenance information
 * @object: slab object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate. The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For a slab-cache object, the fact that it is a slab object is printed,
 * and, if available, the slab name, return address, and stack trace from
 * the allocation and last free path of that object.
 *
 * Return: %true if the pointer is to a not-yet-freed object from
 * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
 * is to an already-freed object, and %false otherwise.
 */
bool kmem_dump_obj(void *object)
{
	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
	int i;
	struct slab *slab;
	unsigned long ptroffset;
	struct kmem_obj_info kp = { };

	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
		return false;
	slab = virt_to_slab(object);
	if (!slab)
		return false;

	kmem_obj_info(&kp, object, slab);
	if (kp.kp_slab_cache)
		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
	else
		pr_cont(" slab%s", cp);
	if (is_kfence_address(object))
		pr_cont(" (kfence)");
	if (kp.kp_objp)
		pr_cont(" start %px", kp.kp_objp);
	if (kp.kp_data_offset)
		pr_cont(" data offset %lu", kp.kp_data_offset);
	if (kp.kp_objp) {
		/* Distance of @object into the object's data area. */
		ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
		pr_cont(" pointer offset %lu", ptroffset);
	}
	if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
		pr_cont(" size %u", kp.kp_slab_cache->object_size);
	if (kp.kp_ret)
		pr_cont(" allocated at %pS\n", kp.kp_ret);
	else
		pr_cont("\n");
	/* Allocation stack trace, when recorded (NULL-terminated). */
	for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
		if (!kp.kp_stack[i])
			break;
		pr_info(" %pS\n", kp.kp_stack[i]);
	}

	/* Last-free stack trace, when recorded. */
	if (kp.kp_free_stack[0])
		pr_cont(" Free path:\n");

	for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
		if (!kp.kp_free_stack[i])
			break;
		pr_info(" %pS\n", kp.kp_free_stack[i]);
	}

	return true;
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
692 #endif
693
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	int err;
	unsigned int align = ARCH_KMALLOC_MINALIGN;
	struct kmem_cache_args kmem_args = {};

	/*
	 * kmalloc caches guarantee alignment of at least the largest
	 * power-of-two divisor of the size. For power-of-two sizes,
	 * it is the size itself.
	 */
	if (flags & SLAB_KMALLOC)
		align = max(align, 1U << (ffs(size) - 1));
	kmem_args.align = calculate_alignment(flags, align, size);

#ifdef CONFIG_HARDENED_USERCOPY
	kmem_args.useroffset = useroffset;
	kmem_args.usersize = usersize;
#endif

	err = do_kmem_cache_create(s, name, size, &kmem_args, flags);

	/* Boot caches are essential; there is no way to recover here. */
	if (err)
		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
		      name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}
725
create_kmalloc_cache(const char * name,unsigned int size,slab_flags_t flags)726 static struct kmem_cache *__init create_kmalloc_cache(const char *name,
727 unsigned int size,
728 slab_flags_t flags)
729 {
730 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
731
732 if (!s)
733 panic("Out of memory when creating slab %s\n", name);
734
735 create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
736 list_add(&s->list, &slab_caches);
737 s->refcount = 1;
738 return s;
739 }
740
741 kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
742 { /* initialization for https://llvm.org/pr42570 */ };
743 EXPORT_SYMBOL(kmalloc_caches);
744
745 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
746 unsigned long random_kmalloc_seed __ro_after_init;
747 EXPORT_SYMBOL(random_kmalloc_seed);
748 #endif
749
750 /*
751 * Conversion table for small slabs sizes / 8 to the index in the
752 * kmalloc array. This is necessary for slabs < 192 since we have non power
753 * of two cache sizes there. The size of larger slabs can be determined using
754 * fls.
755 */
756 u8 kmalloc_size_index[24] __ro_after_init = {
757 3, /* 8 */
758 4, /* 16 */
759 5, /* 24 */
760 5, /* 32 */
761 6, /* 40 */
762 6, /* 48 */
763 6, /* 56 */
764 6, /* 64 */
765 1, /* 72 */
766 1, /* 80 */
767 1, /* 88 */
768 1, /* 96 */
769 7, /* 104 */
770 7, /* 112 */
771 7, /* 120 */
772 7, /* 128 */
773 2, /* 136 */
774 2, /* 144 */
775 2, /* 152 */
776 2, /* 160 */
777 2, /* 168 */
778 2, /* 176 */
779 2, /* 184 */
780 2 /* 192 */
781 };
782
/*
 * Round @size up to the allocation size kmalloc() would actually provide.
 */
size_t kmalloc_size_roundup(size_t size)
{
	/*
	 * Return 'size' unchanged for 0 - kmalloc() returns ZERO_SIZE_PTR -
	 * and for very large sizes, where kmalloc() may simply fail.
	 */
	if (!size || size > KMALLOC_MAX_SIZE)
		return size;

	/*
	 * Sizes served by the kmalloc slab caches: ask the matching cache
	 * for its object size. The flags don't matter since size_index is
	 * common to all. Neither does the caller for just getting
	 * ->object_size.
	 */
	if (size <= KMALLOC_MAX_CACHE_SIZE)
		return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;

	/* Above the smaller buckets, size is a multiple of page size. */
	return PAGE_SIZE << get_order(size);
}
EXPORT_SYMBOL(kmalloc_size_roundup);
805
806 #ifdef CONFIG_ZONE_DMA
807 #define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
808 #else
809 #define KMALLOC_DMA_NAME(sz)
810 #endif
811
812 #ifdef CONFIG_MEMCG
813 #define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
814 #else
815 #define KMALLOC_CGROUP_NAME(sz)
816 #endif
817
818 #ifndef CONFIG_SLUB_TINY
819 #define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
820 #else
821 #define KMALLOC_RCL_NAME(sz)
822 #endif
823
824 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
825 #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
826 #define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
827 #define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz,
828 #define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz,
829 #define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz,
830 #define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz,
831 #define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz,
832 #define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz,
833 #define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz,
834 #define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz,
835 #define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz,
836 #define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
837 #define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
838 #define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
839 #define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
840 #define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
841 #define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
842 #else // CONFIG_RANDOM_KMALLOC_CACHES
843 #define KMALLOC_RANDOM_NAME(N, sz)
844 #endif
845
846 #define INIT_KMALLOC_INFO(__size, __short_size) \
847 { \
848 .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
849 KMALLOC_RCL_NAME(__short_size) \
850 KMALLOC_CGROUP_NAME(__short_size) \
851 KMALLOC_DMA_NAME(__short_size) \
852 KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \
853 .size = __size, \
854 }
855
856 /*
857 * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
858 * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
859 * kmalloc-2M.
860 */
861 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
862 INIT_KMALLOC_INFO(0, 0),
863 INIT_KMALLOC_INFO(96, 96),
864 INIT_KMALLOC_INFO(192, 192),
865 INIT_KMALLOC_INFO(8, 8),
866 INIT_KMALLOC_INFO(16, 16),
867 INIT_KMALLOC_INFO(32, 32),
868 INIT_KMALLOC_INFO(64, 64),
869 INIT_KMALLOC_INFO(128, 128),
870 INIT_KMALLOC_INFO(256, 256),
871 INIT_KMALLOC_INFO(512, 512),
872 INIT_KMALLOC_INFO(1024, 1k),
873 INIT_KMALLOC_INFO(2048, 2k),
874 INIT_KMALLOC_INFO(4096, 4k),
875 INIT_KMALLOC_INFO(8192, 8k),
876 INIT_KMALLOC_INFO(16384, 16k),
877 INIT_KMALLOC_INFO(32768, 32k),
878 INIT_KMALLOC_INFO(65536, 64k),
879 INIT_KMALLOC_INFO(131072, 128k),
880 INIT_KMALLOC_INFO(262144, 256k),
881 INIT_KMALLOC_INFO(524288, 512k),
882 INIT_KMALLOC_INFO(1048576, 1M),
883 INIT_KMALLOC_INFO(2097152, 2M)
884 };
885
/*
 * Patch up the size_index table if we have strange large alignment
 * requirements for the kmalloc array. This is only the case for
 * MIPS it seems. The standard arches will not generate any code here.
 *
 * Largest permitted alignment is 256 bytes due to the way we
 * handle the index determination for the smaller caches.
 *
 * Make sure that nothing crazy happens if someone starts tinkering
 * around with ARCH_KMALLOC_MINALIGN
 */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int i;

	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		!is_power_of_2(KMALLOC_MIN_SIZE));

	/* Redirect every size below the minimum to the smallest usable cache. */
	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		unsigned int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(kmalloc_size_index))
			break;
		kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte sized cache is not used if the alignment
		 * is 64 byte.
		 */
		for (i = 64 + 8; i <= 96; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 7;

	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		for (i = 128 + 8; i <= 192; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 8;
	}
}
932
__kmalloc_minalign(void)933 static unsigned int __kmalloc_minalign(void)
934 {
935 unsigned int minalign = dma_get_cache_alignment();
936
937 if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
938 is_swiotlb_allocated())
939 minalign = ARCH_KMALLOC_MINALIGN;
940
941 return max(minalign, arch_slab_minalign());
942 }
943
/* Instantiate the kmalloc cache for size index @idx of kmalloc type @type. */
static void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
{
	slab_flags_t flags = 0;
	unsigned int minalign = __kmalloc_minalign();
	unsigned int aligned_size = kmalloc_info[idx].size;
	int aligned_idx = idx;

	if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
		flags |= SLAB_RECLAIM_ACCOUNT;
	} else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
		/* With kmem accounting disabled, alias the normal cache. */
		if (mem_cgroup_kmem_disabled()) {
			kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
			return;
		}
		flags |= SLAB_ACCOUNT;
	} else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
		flags |= SLAB_CACHE_DMA;
	}

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
	/* Randomized caches must stay distinct to be effective. */
	if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
		flags |= SLAB_NO_MERGE;
#endif

	/*
	 * If CONFIG_MEMCG is enabled, disable cache merging for
	 * KMALLOC_NORMAL caches.
	 */
	if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
		flags |= SLAB_NO_MERGE;

	/* A larger minimum alignment may round us up into a bigger bucket. */
	if (minalign > ARCH_KMALLOC_MINALIGN) {
		aligned_size = ALIGN(aligned_size, minalign);
		aligned_idx = __kmalloc_index(aligned_size, false);
	}

	if (!kmalloc_caches[type][aligned_idx])
		kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
					kmalloc_info[aligned_idx].name[type],
					aligned_size, flags);
	/* Smaller, rounded-up indices share the aligned bucket's cache. */
	if (idx != aligned_idx)
		kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
}
988
/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 */
void __init create_kmalloc_caches(void)
{
	int i;
	enum kmalloc_cache_type type;

	/*
	 * Including KMALLOC_CGROUP if CONFIG_MEMCG defined
	 */
	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
		/* Caches that are NOT of the two-to-the-power-of size. */
		if (KMALLOC_MIN_SIZE <= 32)
			new_kmalloc_cache(1, type);	/* kmalloc-96 */
		if (KMALLOC_MIN_SIZE <= 64)
			new_kmalloc_cache(2, type);	/* kmalloc-192 */

		/* Caches that are of the two-to-the-power-of size. */
		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
			new_kmalloc_cache(i, type);
	}
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
	random_kmalloc_seed = get_random_u64();
#endif

	/* Kmalloc array is now usable */
	slab_state = UP;

	/* Backing cache for kmem_buckets_create(), if that API is built in. */
	if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
		kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
						       sizeof(kmem_buckets),
						       0, SLAB_NO_MERGE, NULL);
}
1025
/*
 * Strip GFP bits that are invalid for slab allocations and loudly
 * report the offending call site so it can be fixed.
 */
gfp_t kmalloc_fix_flags(gfp_t flags)
{
	gfp_t bad = flags & GFP_SLAB_BUG_MASK;
	gfp_t fixed = flags & ~GFP_SLAB_BUG_MASK;

	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
			bad, &bad, fixed, &fixed);
	dump_stack();

	return fixed;
}
1037
1038 #ifdef CONFIG_SLAB_FREELIST_RANDOM
1039 /* Randomize a generic freelist */
/* Fill @list with the identity permutation 0..count-1, then shuffle it. */
static void freelist_randomize(unsigned int *list,
			       unsigned int count)
{
	unsigned int idx;

	for (idx = 0; idx < count; idx++)
		list[idx] = idx;

	/* Fisher-Yates shuffle */
	for (idx = count - 1; idx > 0; idx--) {
		unsigned int target = get_random_u32_below(idx + 1);

		swap(list[idx], list[target]);
	}
}
1055
1056 /* Create a random sequence per cache */
/* Create a random sequence per cache */
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
			    gfp_t gfp)
{
	/* A sequence already exists, or there is nothing to shuffle. */
	if (cachep->random_seq || count < 2)
		return 0;

	cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
	if (!cachep->random_seq)
		return -ENOMEM;

	freelist_randomize(cachep->random_seq, count);
	return 0;
}
1071
1072 /* Destroy the per-cache random freelist sequence */
/* Destroy the per-cache random freelist sequence */
void cache_random_seq_destroy(struct kmem_cache *cachep)
{
	/*
	 * kfree(NULL) is a no-op; clearing the pointer lets a later
	 * cache_random_seq_create() allocate a fresh sequence.
	 */
	kfree(cachep->random_seq);
	cachep->random_seq = NULL;
}
1078 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
1079
1080 #ifdef CONFIG_SLUB_DEBUG
1081 #define SLABINFO_RIGHTS (0400)
1082
/* Emit the version line and column legend at the top of /proc/slabinfo. */
static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
	seq_puts(m, "slabinfo - version: 2.1\n");
	seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
	seq_putc(m, '\n');
}
1095
/*
 * seq_file ->start: take slab_mutex for the whole traversal; it is
 * released by slab_stop().
 */
static void *slab_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&slab_mutex);
	return seq_list_start(&slab_caches, *pos);
}
1101
/* seq_file ->next: advance to the next cache on slab_caches. */
static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &slab_caches, pos);
}
1106
/* seq_file ->stop: drop slab_mutex taken in slab_start(). */
static void slab_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&slab_mutex);
}
1111
/* Print one /proc/slabinfo row for cache @s from get_slabinfo() counters. */
static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
	struct slabinfo sinfo;

	memset(&sinfo, 0, sizeof(sinfo));
	get_slabinfo(s, &sinfo);

	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
		   s->name, sinfo.active_objs, sinfo.num_objs, s->size,
		   sinfo.objects_per_slab, (1 << sinfo.cache_order));

	seq_printf(m, " : tunables %4u %4u %4u",
		   sinfo.limit, sinfo.batchcount, sinfo.shared);
	seq_printf(m, " : slabdata %6lu %6lu %6lu",
		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
	seq_putc(m, '\n');
}
1129
/*
 * seq_file ->show: emit the header once (before the first list entry),
 * then one row per cache.
 */
static int slab_show(struct seq_file *m, void *p)
{
	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);

	if (p == slab_caches.next)
		print_slabinfo_header(m);
	cache_show(s, m);
	return 0;
}
1139
/*
 * Dump used/total sizes of all unreclaimable (!SLAB_RECLAIM_ACCOUNT)
 * caches, typically invoked from the OOM path.
 */
void dump_unreclaimable_slab(void)
{
	struct kmem_cache *s;
	struct slabinfo sinfo;

	/*
	 * Here acquiring slab_mutex is risky since we don't prefer to get
	 * sleep in oom path. But, without mutex hold, it may introduce a
	 * risk of crash.
	 * Use mutex_trylock to protect the list traverse, dump nothing
	 * without acquiring the mutex.
	 */
	if (!mutex_trylock(&slab_mutex)) {
		pr_warn("excessive unreclaimable slab but cannot dump stats\n");
		return;
	}

	pr_info("Unreclaimable slab info:\n");
	pr_info("Name Used Total\n");

	list_for_each_entry(s, &slab_caches, list) {
		/* Reclaimable caches are not the OOM culprit here. */
		if (s->flags & SLAB_RECLAIM_ACCOUNT)
			continue;

		get_slabinfo(s, &sinfo);

		/* Skip empty caches to keep the dump short. */
		if (sinfo.num_objs > 0)
			pr_info("%-17s %10luKB %10luKB\n", s->name,
				(sinfo.active_objs * s->size) / 1024,
				(sinfo.num_objs * s->size) / 1024);
	}
	mutex_unlock(&slab_mutex);
}
1173
1174 /*
1175 * slabinfo_op - iterator that generates /proc/slabinfo
1176 *
1177 * Output layout:
1178 * cache-name
1179 * num-active-objs
1180 * total-objs
1181 * object size
1182 * num-active-slabs
1183 * total-slabs
1184 * num-pages-per-slab
1185 * + further values on SMP and with statistics enabled
1186 */
static const struct seq_operations slabinfo_op = {
	.start = slab_start,	/* takes slab_mutex */
	.next = slab_next,
	.stop = slab_stop,	/* drops slab_mutex */
	.show = slab_show,
};
1193
/* open() handler for /proc/slabinfo: attach the seq_file iterator. */
static int slabinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &slabinfo_op);
}
1198
/* /proc/slabinfo file operations, backed by the seq_file iterator above. */
static const struct proc_ops slabinfo_proc_ops = {
	.proc_flags = PROC_ENTRY_PERMANENT,
	.proc_open = slabinfo_open,
	.proc_read = seq_read,
	.proc_lseek = seq_lseek,
	.proc_release = seq_release,
};
1206
/* Register /proc/slabinfo (mode SLABINFO_RIGHTS, i.e. root-readable only). */
static int __init slab_proc_init(void)
{
	proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
	return 0;
}
module_init(slab_proc_init);
1213
1214 #endif /* CONFIG_SLUB_DEBUG */
1215
1216 /**
1217 * kfree_sensitive - Clear sensitive information in memory before freeing
1218 * @p: object to free memory of
1219 *
1220 * The memory of the object @p points to is zeroed before freed.
1221 * If @p is %NULL, kfree_sensitive() does nothing.
1222 *
1223 * Note: this function zeroes the whole allocated buffer which can be a good
1224 * deal bigger than the requested buffer size passed to kmalloc(). So be
1225 * careful when using this function in performance sensitive code.
1226 */
kfree_sensitive(const void * p)1227 void kfree_sensitive(const void *p)
1228 {
1229 size_t ks;
1230 void *mem = (void *)p;
1231
1232 ks = ksize(mem);
1233 if (ks) {
1234 kasan_unpoison_range(mem, ks);
1235 memzero_explicit(mem, ks);
1236 }
1237 kfree(mem);
1238 }
1239 EXPORT_SYMBOL(kfree_sensitive);
1240
1241 #ifdef CONFIG_BPF_SYSCALL
1242 #include <linux/btf.h>
1243
1244 __bpf_kfunc_start_defs();
1245
bpf_get_kmem_cache(u64 addr)1246 __bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
1247 {
1248 struct slab *slab;
1249
1250 if (!virt_addr_valid((void *)(long)addr))
1251 return NULL;
1252
1253 slab = virt_to_slab((void *)(long)addr);
1254 return slab ? slab->slab_cache : NULL;
1255 }
1256
1257 __bpf_kfunc_end_defs();
1258 #endif /* CONFIG_BPF_SYSCALL */
1259
1260 /* Tracepoints definitions. */
1261 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
1262 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
1263 EXPORT_TRACEPOINT_SYMBOL(kfree);
1264 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1265
1266 #ifndef CONFIG_KVFREE_RCU_BATCHED
1267
kvfree_call_rcu(struct rcu_head * head,void * ptr)1268 void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1269 {
1270 if (head) {
1271 kasan_record_aux_stack(ptr);
1272 call_rcu(head, kvfree_rcu_cb);
1273 return;
1274 }
1275
1276 // kvfree_rcu(one_arg) call.
1277 might_sleep();
1278 synchronize_rcu();
1279 kvfree(ptr);
1280 }
1281 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
1282
/* Without CONFIG_KVFREE_RCU_BATCHED there is no batching state to set up. */
void __init kvfree_rcu_init(void)
{
}
1286
1287 #else /* CONFIG_KVFREE_RCU_BATCHED */
1288
1289 /*
1290 * This rcu parameter is runtime-read-only. It reflects
1291 * a minimum allowed number of objects which can be cached
1292 * per-CPU. Object size is equal to one page. This value
1293 * can be changed at boot time.
1294 */
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);

// A page shrinker can ask for pages to be freed to make them
// available for other parts of the system. This usually happens
// under low memory conditions, and in that case we should also
// defer page-cache filling for a short time period.
//
// The default value is 5 seconds, which is long enough to reduce
// interference with the shrinker while it asks other systems to
// drain their caches.
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);

/* Workqueue on which monitor, reclaim and page-cache works are queued. */
static struct workqueue_struct *rcu_reclaim_wq;

/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
/* Number of kfree_rcu_cpu_work batches maintained per CPU. */
#define KFREE_N_BATCHES 2
/* Bulk channels per CPU: index 0 for slab pointers, 1 for vmalloc. */
#define FREE_N_CHANNELS 2
1315
1316 /**
1317 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
1318 * @list: List node. All blocks are linked between each other
1319 * @gp_snap: Snapshot of RCU state for objects placed to this bulk
1320 * @nr_records: Number of active pointers in the array
1321 * @records: Array of the kvfree_rcu() pointers
1322 */
struct kvfree_rcu_bulk_data {
	struct list_head list;
	struct rcu_gp_oldstate gp_snap;
	unsigned long nr_records;
	/* Flexible array; sized so the whole struct fills one page. */
	void *records[] __counted_by(nr_records);
};

/*
 * This macro defines how many entries the "records" array
 * will contain. It is based on the fact that the size of
 * kvfree_rcu_bulk_data structure becomes exactly one page.
 */
#define KVFREE_BULK_MAX_ENTR \
	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
1337
1338 /**
1339 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
1340 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
1341 * @head_free: List of kfree_rcu() objects waiting for a grace period
1342 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
1343 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
1344 * @krcp: Pointer to @kfree_rcu_cpu structure
1345 */
1346
struct kfree_rcu_cpu_work {
	struct rcu_work rcu_work;
	struct rcu_head *head_free;
	struct rcu_gp_oldstate head_free_gp_snap;
	/* [0]: slab-pointer blocks, [1]: vmalloc-pointer blocks */
	struct list_head bulk_head_free[FREE_N_CHANNELS];
	struct kfree_rcu_cpu *krcp;
};
1354
1355 /**
1356 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
1357 * @head: List of kfree_rcu() objects not yet waiting for a grace period
1358 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
1359 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
1360 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
1361 * @lock: Synchronize access to this structure
1362 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
1363 * @initialized: The @rcu_work fields have been initialized
1364 * @head_count: Number of objects in rcu_head singular list
1365 * @bulk_count: Number of objects in bulk-list
1366 * @bkvcache:
1367 * A simple cache list that contains objects for reuse purpose.
1368 * In order to save some per-cpu space the list is singular.
1369 * Even though it is lockless an access has to be protected by the
1370 * per-cpu lock.
1371 * @page_cache_work: A work to refill the cache when it is empty
1372 * @backoff_page_cache_fill: Delay cache refills
1373 * @work_in_progress: Indicates that page_cache_work is running
1374 * @hrtimer: A hrtimer for scheduling a page_cache_work
1375 * @nr_bkv_objs: number of allocated objects at @bkvcache.
1376 *
1377 * This is a per-CPU structure. The reason that it is not included in
1378 * the rcu_data structure is to permit this code to be extracted from
1379 * the RCU files. Such extraction could allow further optimization of
1380 * the interactions with the slab allocators.
1381 */
struct kfree_rcu_cpu {
	// Objects queued on a linked list
	// through their rcu_head structures.
	struct rcu_head *head;
	unsigned long head_gp_snap;
	atomic_t head_count;

	// Objects queued on a bulk-list.
	struct list_head bulk_head[FREE_N_CHANNELS];
	atomic_t bulk_count[FREE_N_CHANNELS];

	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
	raw_spinlock_t lock;
	struct delayed_work monitor_work;
	bool initialized;

	struct delayed_work page_cache_work;
	atomic_t backoff_page_cache_fill;
	atomic_t work_in_progress;
	struct hrtimer hrtimer;

	// Cache of free page-sized blocks; accessed under ->lock.
	struct llist_head bkvcache;
	int nr_bkv_objs;
};

/* Per-CPU batching state; only the lock is initialized statically. */
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
};
1410
/*
 * Drop debug-objects tracking for every pointer recorded in @bhead just
 * before the block is bulk-freed. Compiles to nothing without
 * CONFIG_DEBUG_OBJECTS_RCU_HEAD.
 */
static __always_inline void
debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
	int i;

	for (i = 0; i < bhead->nr_records; i++)
		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
1421
/*
 * Disable interrupts and lock this CPU's krc structure. Interrupts are
 * disabled before this_cpu_ptr() so the task cannot migrate between
 * picking the per-CPU structure and acquiring its lock.
 */
static inline struct kfree_rcu_cpu *
krc_this_cpu_lock(unsigned long *flags)
{
	struct kfree_rcu_cpu *krcp;

	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
	krcp = this_cpu_ptr(&krc);
	raw_spin_lock(&krcp->lock);

	return krcp;
}
1433
/* Counterpart of krc_this_cpu_lock(): drop the lock and restore IRQ state. */
static inline void
krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
{
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
1439
/*
 * Pop one page-sized block from the per-CPU cache, or NULL when empty.
 * Caller holds krcp->lock; WRITE_ONCE() pairs with the lockless
 * READ_ONCE(krcp->nr_bkv_objs) in fill_page_cache_func().
 */
static inline struct kvfree_rcu_bulk_data *
get_cached_bnode(struct kfree_rcu_cpu *krcp)
{
	if (!krcp->nr_bkv_objs)
		return NULL;

	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
	return (struct kvfree_rcu_bulk_data *)
		llist_del_first(&krcp->bkvcache);
}
1450
/*
 * Return a block to the per-CPU cache. Returns false (caller must free
 * the page) once the cache holds rcu_min_cached_objs entries. Caller
 * holds krcp->lock.
 */
static inline bool
put_cached_bnode(struct kfree_rcu_cpu *krcp,
		 struct kvfree_rcu_bulk_data *bnode)
{
	// Check the limit.
	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
		return false;

	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
	return true;
}
1463
/*
 * Detach and free every page in this CPU's block cache; returns the
 * number of pages freed. The list is detached under the lock, the pages
 * are freed outside it.
 */
static int
drain_page_cache(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;
	struct llist_node *page_list, *pos, *n;
	int freed = 0;

	/* Caching disabled: nothing can be queued, nothing to drain. */
	if (!rcu_min_cached_objs)
		return 0;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	page_list = llist_del_all(&krcp->bkvcache);
	WRITE_ONCE(krcp->nr_bkv_objs, 0);
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	llist_for_each_safe(pos, n, page_list) {
		free_page((unsigned long)pos);
		freed++;
	}

	return freed;
}
1486
/*
 * Free all objects recorded in @bnode, then recycle the block back into
 * the per-CPU page cache (or free the page when the cache is full).
 * Channel @idx 0 holds slab pointers (kfree_bulk()), other channels
 * vmalloc pointers (vfree()). The WARN fires if the block's grace
 * period has not completed; in that case the records are skipped
 * rather than freed prematurely.
 */
static void
kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
		struct kvfree_rcu_bulk_data *bnode, int idx)
{
	unsigned long flags;
	int i;

	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
		debug_rcu_bhead_unqueue(bnode);
		rcu_lock_acquire(&rcu_callback_map);
		if (idx == 0) { // kmalloc() / kfree().
			trace_rcu_invoke_kfree_bulk_callback(
				"slab", bnode->nr_records,
				bnode->records);

			kfree_bulk(bnode->nr_records, bnode->records);
		} else { // vmalloc() / vfree().
			for (i = 0; i < bnode->nr_records; i++) {
				trace_rcu_invoke_kvfree_callback(
					"slab", bnode->records[i], 0);

				vfree(bnode->records[i]);
			}
		}
		rcu_lock_release(&rcu_callback_map);
	}

	raw_spin_lock_irqsave(&krcp->lock, flags);
	/* Try to recycle the page; NULLed bnode means it was cached. */
	if (put_cached_bnode(krcp, bnode))
		bnode = NULL;
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	if (bnode)
		free_page((unsigned long) bnode);

	cond_resched_tasks_rcu_qs();
}
1524
/*
 * Free a "Channel 3" list of objects linked through their embedded
 * rcu_head structures. head->func holds the start of the enclosing
 * object (stored by kvfree_call_rcu()), so the rcu_head offset is
 * recovered by pointer subtraction.
 */
static void
kvfree_rcu_list(struct rcu_head *head)
{
	struct rcu_head *next;

	for (; head; head = next) {
		void *ptr = (void *) head->func;
		unsigned long offset = (void *) head - ptr;

		/* Save ->next before ptr (which contains head) is freed. */
		next = head->next;
		debug_rcu_head_unqueue((struct rcu_head *)ptr);
		rcu_lock_acquire(&rcu_callback_map);
		trace_rcu_invoke_kvfree_callback("slab", head, offset);

		kvfree(ptr);

		rcu_lock_release(&rcu_callback_map);
		cond_resched_tasks_rcu_qs();
	}
}
1545
1546 /*
1547 * This function is invoked in workqueue context after a grace period.
1548 * It frees all the objects queued on ->bulk_head_free or ->head_free.
1549 */
/*
 * Workqueue handler run after a grace period: detach the batch under
 * the lock, then free channels 1/2 (bulk blocks) and channel 3 (the
 * rcu_head list) outside it.
 */
static void kfree_rcu_work(struct work_struct *work)
{
	unsigned long flags;
	struct kvfree_rcu_bulk_data *bnode, *n;
	struct list_head bulk_head[FREE_N_CHANNELS];
	struct rcu_head *head;
	struct kfree_rcu_cpu *krcp;
	struct kfree_rcu_cpu_work *krwp;
	struct rcu_gp_oldstate head_gp_snap;
	int i;

	krwp = container_of(to_rcu_work(work),
			    struct kfree_rcu_cpu_work, rcu_work);
	krcp = krwp->krcp;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	// Channels 1 and 2.
	for (i = 0; i < FREE_N_CHANNELS; i++)
		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);

	// Channel 3.
	head = krwp->head_free;
	krwp->head_free = NULL;
	head_gp_snap = krwp->head_free_gp_snap;
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	// Handle the first two channels.
	for (i = 0; i < FREE_N_CHANNELS; i++) {
		// Start from the tail page, so a GP is likely passed for it.
		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
			kvfree_rcu_bulk(krcp, bnode, i);
	}

	/*
	 * This is used when the "bulk" path can not be used for the
	 * double-argument of kvfree_rcu(). This happens when the
	 * page-cache is empty, which means that objects are instead
	 * queued on a linked list through their rcu_head structures.
	 * This list is named "Channel 3".
	 */
	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
		kvfree_rcu_list(head);
}
1593
kfree_rcu_sheaf(void * obj)1594 static bool kfree_rcu_sheaf(void *obj)
1595 {
1596 struct kmem_cache *s;
1597 struct slab *slab;
1598
1599 if (is_vmalloc_addr(obj))
1600 return false;
1601
1602 slab = virt_to_slab(obj);
1603 if (unlikely(!slab))
1604 return false;
1605
1606 s = slab->slab_cache;
1607 if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()))
1608 return __kfree_rcu_sheaf(s, obj);
1609
1610 return false;
1611 }
1612
1613 static bool
need_offload_krc(struct kfree_rcu_cpu * krcp)1614 need_offload_krc(struct kfree_rcu_cpu *krcp)
1615 {
1616 int i;
1617
1618 for (i = 0; i < FREE_N_CHANNELS; i++)
1619 if (!list_empty(&krcp->bulk_head[i]))
1620 return true;
1621
1622 return !!READ_ONCE(krcp->head);
1623 }
1624
1625 static bool
need_wait_for_krwp_work(struct kfree_rcu_cpu_work * krwp)1626 need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
1627 {
1628 int i;
1629
1630 for (i = 0; i < FREE_N_CHANNELS; i++)
1631 if (!list_empty(&krwp->bulk_head_free[i]))
1632 return true;
1633
1634 return !!krwp->head_free;
1635 }
1636
krc_count(struct kfree_rcu_cpu * krcp)1637 static int krc_count(struct kfree_rcu_cpu *krcp)
1638 {
1639 int sum = atomic_read(&krcp->head_count);
1640 int i;
1641
1642 for (i = 0; i < FREE_N_CHANNELS; i++)
1643 sum += atomic_read(&krcp->bulk_count[i]);
1644
1645 return sum;
1646 }
1647
/*
 * (Re)arm the monitor work: a 1-jiffy delay when at least
 * KVFREE_BULK_MAX_ENTR objects are queued, otherwise the normal
 * KFREE_DRAIN_JIFFIES interval. Pending work is only ever expedited,
 * never postponed. Caller holds krcp->lock.
 */
static void
__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	long delay, delay_left;

	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
	if (delayed_work_pending(&krcp->monitor_work)) {
		delay_left = krcp->monitor_work.timer.expires - jiffies;
		/* Only shorten the pending timeout, never extend it. */
		if (delay < delay_left)
			mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
		return;
	}
	queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
}
1662
/* Locked wrapper around __schedule_delayed_monitor_work(). */
static void
schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	__schedule_delayed_monitor_work(krcp);
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
1672
/*
 * Free everything whose grace period has already elapsed, without
 * queueing a batch: detach ready bulk blocks and, if ready, the whole
 * rcu_head list under the lock, then free them outside it.
 */
static void
kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
{
	struct list_head bulk_ready[FREE_N_CHANNELS];
	struct kvfree_rcu_bulk_data *bnode, *n;
	struct rcu_head *head_ready = NULL;
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	for (i = 0; i < FREE_N_CHANNELS; i++) {
		INIT_LIST_HEAD(&bulk_ready[i]);

		/*
		 * Walk from the tail (oldest blocks); stop at the first
		 * block whose grace period has not completed, since newer
		 * blocks cannot be ready either.
		 */
		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
			if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
				break;

			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
			list_move(&bnode->list, &bulk_ready[i]);
		}
	}

	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
		head_ready = krcp->head;
		atomic_set(&krcp->head_count, 0);
		WRITE_ONCE(krcp->head, NULL);
	}
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	for (i = 0; i < FREE_N_CHANNELS; i++) {
		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
			kvfree_rcu_bulk(krcp, bnode, i);
	}

	if (head_ready)
		kvfree_rcu_list(head_ready);
}
1710
1711 /*
1712 * Return: %true if a work is queued, %false otherwise.
1713 */
/*
 * Move this CPU's queued objects into a free kfree_rcu_cpu_work batch
 * and queue the batch behind a grace period.
 */
static bool
kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;
	bool queued = false;
	int i, j;

	raw_spin_lock_irqsave(&krcp->lock, flags);

	// Attempt to start a new batch.
	for (i = 0; i < KFREE_N_BATCHES; i++) {
		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);

		// Try to detach bulk_head or head and attach it, only when
		// all channels are free. Any channel is not free means at krwp
		// there is on-going rcu work to handle krwp's free business.
		if (need_wait_for_krwp_work(krwp))
			continue;

		// kvfree_rcu_drain_ready() might handle this krcp, if so give up.
		if (need_offload_krc(krcp)) {
			// Channel 1 corresponds to the SLAB-pointer bulk path.
			// Channel 2 corresponds to vmalloc-pointer bulk path.
			for (j = 0; j < FREE_N_CHANNELS; j++) {
				if (list_empty(&krwp->bulk_head_free[j])) {
					atomic_set(&krcp->bulk_count[j], 0);
					list_replace_init(&krcp->bulk_head[j],
							  &krwp->bulk_head_free[j]);
				}
			}

			// Channel 3 corresponds to both SLAB and vmalloc
			// objects queued on the linked list.
			if (!krwp->head_free) {
				krwp->head_free = krcp->head;
				get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
				atomic_set(&krcp->head_count, 0);
				WRITE_ONCE(krcp->head, NULL);
			}

			// One work is per one batch, so there are three
			// "free channels", the batch can handle. Break
			// the loop since it is done with this CPU thus
			// queuing an RCU work is _always_ success here.
			queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
			WARN_ON_ONCE(!queued);
			break;
		}
	}

	raw_spin_unlock_irqrestore(&krcp->lock, flags);
	return queued;
}
1767
1768 /*
1769 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
1770 */
/*
 * Periodic monitor: first free anything already past its grace period,
 * then queue the remainder as a new batch, rearming itself while work
 * is still outstanding.
 */
static void kfree_rcu_monitor(struct work_struct *work)
{
	struct kfree_rcu_cpu *krcp = container_of(work,
		struct kfree_rcu_cpu, monitor_work.work);

	// Drain ready for reclaim.
	kvfree_rcu_drain_ready(krcp);

	// Queue a batch for a rest.
	kvfree_rcu_queue_batch(krcp);

	// If there is nothing to detach, it means that our job is
	// successfully done here. In case of having at least one
	// of the channels that is still busy we should rearm the
	// work to repeat an attempt. Because previous batches are
	// still in progress.
	if (need_offload_krc(krcp))
		schedule_delayed_monitor_work(krcp);
}
1790
/*
 * Refill this CPU's block cache up to rcu_min_cached_objs pages, or
 * just one page when in shrinker-induced backoff. Stops early on
 * allocation failure or when the cache refuses further blocks.
 */
static void fill_page_cache_func(struct work_struct *work)
{
	struct kvfree_rcu_bulk_data *bnode;
	struct kfree_rcu_cpu *krcp =
		container_of(work, struct kfree_rcu_cpu,
			page_cache_work.work);
	unsigned long flags;
	int nr_pages;
	bool pushed;
	int i;

	nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
		1 : rcu_min_cached_objs;

	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
		/* Light-weight allocation: no OOM, no reserves, no warnings. */
		bnode = (struct kvfree_rcu_bulk_data *)
			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);

		if (!bnode)
			break;

		raw_spin_lock_irqsave(&krcp->lock, flags);
		pushed = put_cached_bnode(krcp, bnode);
		raw_spin_unlock_irqrestore(&krcp->lock, flags);

		/* Cache already at its limit: drop the page and stop. */
		if (!pushed) {
			free_page((unsigned long) bnode);
			break;
		}
	}

	atomic_set(&krcp->work_in_progress, 0);
	atomic_set(&krcp->backoff_page_cache_fill, 0);
}
1825
1826 // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
1827 // state specified by flags. If can_alloc is true, the caller must
1828 // be schedulable and not be holding any locks or mutexes that might be
1829 // acquired by the memory allocator or anything that it might invoke.
1830 // Returns true if ptr was successfully recorded, else the caller must
1831 // use a fallback.
static inline bool
add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
	unsigned long *flags, void *ptr, bool can_alloc)
{
	struct kvfree_rcu_bulk_data *bnode;
	int idx;

	*krcp = krc_this_cpu_lock(flags);
	if (unlikely(!(*krcp)->initialized))
		return false;

	/* Channel 0: slab pointers; channel 1: vmalloc pointers. */
	idx = !!is_vmalloc_addr(ptr);
	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
		struct kvfree_rcu_bulk_data, list);

	/* Check if a new block is required. */
	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
		bnode = get_cached_bnode(*krcp);
		if (!bnode && can_alloc) {
			/*
			 * Drop the lock to allocate; note we may land on a
			 * different CPU's krcp after reacquiring, which is
			 * fine since *krcp is re-read by krc_this_cpu_lock()
			 * on the caller side only via the returned pointer.
			 */
			krc_this_cpu_unlock(*krcp, *flags);

			// __GFP_NORETRY - allows a light-weight direct reclaim
			// what is OK from minimizing of fallback hitting point of
			// view. Apart of that it forbids any OOM invoking what is
			// also beneficial since we are about to release memory soon.
			//
			// __GFP_NOMEMALLOC - prevents from consuming of all the
			// memory reserves. Please note we have a fallback path.
			//
			// __GFP_NOWARN - it is supposed that an allocation can
			// be failed under low memory or high memory pressure
			// scenarios.
			bnode = (struct kvfree_rcu_bulk_data *)
				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
		}

		if (!bnode)
			return false;

		// Initialize the new block and attach it.
		bnode->nr_records = 0;
		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
	}

	// Finally insert and update the GP for this page.
	bnode->nr_records++;
	bnode->records[bnode->nr_records - 1] = ptr;
	get_state_synchronize_rcu_full(&bnode->gp_snap);
	atomic_inc(&(*krcp)->bulk_count[idx]);

	return true;
}
1885
/* hrtimer callback: kick the page-cache refill work immediately. */
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
	struct kfree_rcu_cpu *krcp =
		container_of(t, struct kfree_rcu_cpu, hrtimer);

	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
	return HRTIMER_NORESTART;
}
1895
/*
 * Kick off a page-cache refill, at most one in flight per CPU
 * (guarded by work_in_progress). In shrinker backoff the refill is
 * delayed by rcu_delay_page_cache_fill_msec; otherwise it is triggered
 * immediately via a zero-length hrtimer.
 */
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
	// If cache disabled, bail out.
	if (!rcu_min_cached_objs)
		return;

	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
			!atomic_xchg(&krcp->work_in_progress, 1)) {
		if (atomic_read(&krcp->backoff_page_cache_fill)) {
			queue_delayed_work(rcu_reclaim_wq,
				&krcp->page_cache_work,
					msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
		} else {
			hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
				      HRTIMER_MODE_REL);
			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
		}
	}
}
1916
/*
 * Called during boot once the RCU scheduler is running: arm the monitor
 * work on every CPU that already has deferred objects queued (the
 * monitor cannot be scheduled before RCU_SCHEDULER_RUNNING).
 */
void __init kfree_rcu_scheduler_running(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		if (need_offload_krc(krcp))
			schedule_delayed_monitor_work(krcp);
	}
}
1928
1929 /*
1930 * Queue a request for lazy invocation of the appropriate free routine
1931 * after a grace period. Please note that three paths are maintained,
1932 * two for the common case using arrays of pointers and a third one that
1933 * is used only when the main paths cannot be used, for example, due to
1934 * memory pressure.
1935 *
1936 * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
1937 * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
1938 * be free'd in workqueue context. This allows us to: batch requests together to
1939 * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
1940 */
void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
	unsigned long flags;
	struct kfree_rcu_cpu *krcp;
	bool success;

	/*
	 * Please note there is a limitation for the head-less
	 * variant, that is why there is a clear rule for such
	 * objects: it can be used from might_sleep() context
	 * only. For other places please embed an rcu_head to
	 * your data.
	 */
	if (!head)
		might_sleep();

	/* Fast path: hand local-node slab objects to the sheaf layer. */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
		return;

	// Queue the object but don't yet schedule the batch.
	if (debug_rcu_head_queue(ptr)) {
		// Probable double kfree_rcu(), just leak.
		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
			  __func__, head);

		// Mark as success and leave.
		return;
	}

	kasan_record_aux_stack(ptr);
	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
	if (!success) {
		run_page_cache_worker(krcp);

		if (head == NULL)
			// Inline if kvfree_rcu(one_arg) call.
			goto unlock_return;

		/*
		 * Bulk path unavailable: queue on "Channel 3", storing the
		 * object start in ->func so kvfree_rcu_list() can recover
		 * the rcu_head offset later.
		 */
		head->func = ptr;
		head->next = krcp->head;
		WRITE_ONCE(krcp->head, head);
		atomic_inc(&krcp->head_count);

		// Take a snapshot for this krcp.
		krcp->head_gp_snap = get_state_synchronize_rcu();
		success = true;
	}

	/*
	 * The kvfree_rcu() caller considers the pointer freed at this point
	 * and likely removes any references to it. Since the actual slab
	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
	 * this object (no scanning or false positives reporting).
	 */
	kmemleak_ignore(ptr);

	// Set timer to drain after KFREE_DRAIN_JIFFIES.
	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
		__schedule_delayed_monitor_work(krcp);

unlock_return:
	krc_this_cpu_unlock(krcp, flags);

	/*
	 * Inline kvfree() after synchronize_rcu(). We can do
	 * it from might_sleep() context only, so the current
	 * CPU can pass the QS state.
	 */
	if (!success) {
		debug_rcu_head_unqueue((struct rcu_head *) ptr);
		synchronize_rcu();
		kvfree(ptr);
	}
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
2016
/*
 * Drain the batched kvfree_rcu() machinery on every possible CPU: detach
 * queued objects into RCU batches, then wait for all batch works and the
 * per-CPU monitor work to complete.
 */
static inline void __kvfree_rcu_barrier(void)
{
	struct kfree_rcu_cpu_work *krwp;
	struct kfree_rcu_cpu *krcp;
	bool queued;
	int i, cpu;

	/*
	 * Firstly we detach objects and queue them over an RCU-batch
	 * for all CPUs. Finally queued works are flushed for each CPU.
	 *
	 * Please note. If there are outstanding batches for a particular
	 * CPU, those have to be finished first following by queuing a new.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * Check if this CPU has any objects which have been queued for a
		 * new GP completion. If not(means nothing to detach), we are done
		 * with it. If any batch is pending/running for this "krcp", below
		 * per-cpu flush_rcu_work() waits its completion(see last step).
		 */
		if (!need_offload_krc(krcp))
			continue;

		while (1) {
			/*
			 * If we are not able to queue a new RCU work it means:
			 * - batches for this CPU are still in flight which should
			 *   be flushed first and then repeat;
			 * - no objects to detach, because of concurrency.
			 */
			queued = kvfree_rcu_queue_batch(krcp);

			/*
			 * Bail out, if there is no need to offload this "krcp"
			 * anymore. As noted earlier it can run concurrently.
			 */
			if (queued || !need_offload_krc(krcp))
				break;

			/* There are ongoing batches. */
			for (i = 0; i < KFREE_N_BATCHES; i++) {
				krwp = &(krcp->krw_arr[i]);
				flush_rcu_work(&krwp->rcu_work);
			}
		}
	}

	/*
	 * Now we guarantee that all objects are flushed.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * A monitor work can drain ready to reclaim objects
		 * directly. Wait its completion if running or pending.
		 */
		cancel_delayed_work_sync(&krcp->monitor_work);

		for (i = 0; i < KFREE_N_BATCHES; i++) {
			krwp = &(krcp->krw_arr[i]);
			flush_rcu_work(&krwp->rcu_work);
		}
	}
}
2085
2086 /**
2087 * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
2088 *
2089 * Note that a single argument of kvfree_rcu() call has a slow path that
2090 * triggers synchronize_rcu() following by freeing a pointer. It is done
2091 * before the return from the function. Therefore for any single-argument
2092 * call that will result in a kfree() to a cache that is to be destroyed
2093 * during module exit, it is developer's responsibility to ensure that all
2094 * such calls have returned before the call to kmem_cache_destroy().
2095 */
kvfree_rcu_barrier(void)2096 void kvfree_rcu_barrier(void)
2097 {
2098 flush_all_rcu_sheaves();
2099 __kvfree_rcu_barrier();
2100 }
2101 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
2102
2103 /**
2104 * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
2105 * specific slab cache.
2106 * @s: slab cache to wait for
2107 *
2108 * See the description of kvfree_rcu_barrier() for details.
2109 */
kvfree_rcu_barrier_on_cache(struct kmem_cache * s)2110 void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
2111 {
2112 if (cache_has_sheaves(s)) {
2113 flush_rcu_sheaves_on_cache(s);
2114 rcu_barrier();
2115 }
2116
2117 /*
2118 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
2119 * on a specific slab cache.
2120 */
2121 __kvfree_rcu_barrier();
2122 }
2123 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
2124
2125 static unsigned long
kfree_rcu_shrink_count(struct shrinker * shrink,struct shrink_control * sc)2126 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2127 {
2128 int cpu;
2129 unsigned long count = 0;
2130
2131 /* Snapshot count of all CPUs */
2132 for_each_possible_cpu(cpu) {
2133 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2134
2135 count += krc_count(krcp);
2136 count += READ_ONCE(krcp->nr_bkv_objs);
2137 atomic_set(&krcp->backoff_page_cache_fill, 1);
2138 }
2139
2140 return count == 0 ? SHRINK_EMPTY : count;
2141 }
2142
2143 static unsigned long
kfree_rcu_shrink_scan(struct shrinker * shrink,struct shrink_control * sc)2144 kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2145 {
2146 int cpu, freed = 0;
2147
2148 for_each_possible_cpu(cpu) {
2149 int count;
2150 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2151
2152 count = krc_count(krcp);
2153 count += drain_page_cache(krcp);
2154 kfree_rcu_monitor(&krcp->monitor_work.work);
2155
2156 sc->nr_to_scan -= count;
2157 freed += count;
2158
2159 if (sc->nr_to_scan <= 0)
2160 break;
2161 }
2162
2163 return freed == 0 ? SHRINK_STOP : freed;
2164 }
2165
kvfree_rcu_init(void)2166 void __init kvfree_rcu_init(void)
2167 {
2168 int cpu;
2169 int i, j;
2170 struct shrinker *kfree_rcu_shrinker;
2171
2172 rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
2173 WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
2174 WARN_ON(!rcu_reclaim_wq);
2175
2176 /* Clamp it to [0:100] seconds interval. */
2177 if (rcu_delay_page_cache_fill_msec < 0 ||
2178 rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
2179
2180 rcu_delay_page_cache_fill_msec =
2181 clamp(rcu_delay_page_cache_fill_msec, 0,
2182 (int) (100 * MSEC_PER_SEC));
2183
2184 pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
2185 rcu_delay_page_cache_fill_msec);
2186 }
2187
2188 for_each_possible_cpu(cpu) {
2189 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2190
2191 for (i = 0; i < KFREE_N_BATCHES; i++) {
2192 INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
2193 krcp->krw_arr[i].krcp = krcp;
2194
2195 for (j = 0; j < FREE_N_CHANNELS; j++)
2196 INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
2197 }
2198
2199 for (i = 0; i < FREE_N_CHANNELS; i++)
2200 INIT_LIST_HEAD(&krcp->bulk_head[i]);
2201
2202 INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
2203 INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
2204 krcp->initialized = true;
2205 }
2206
2207 kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
2208 if (!kfree_rcu_shrinker) {
2209 pr_err("Failed to allocate kfree_rcu() shrinker!\n");
2210 return;
2211 }
2212
2213 kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
2214 kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
2215
2216 shrinker_register(kfree_rcu_shrinker);
2217 }
2218
2219 #endif /* CONFIG_KVFREE_RCU_BATCHED */
2220