1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3 * Arena-based task data scheduler. This is a variation of scx_simple
4 * that uses a combined allocator and indexing structure to organize
5 * task data. Task context allocation is done when a task enters the
6 * scheduler, while freeing is done when it exits. Task contexts are
7 * retrieved from task-local storage, pointing to the allocated memory.
8 *
 * The main purpose of this scheduler is to demonstrate arena memory
10 * management.
11 *
12 * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
13 * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
14 * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
15 *
16 */
17 #include <scx/common.bpf.h>
18 #include <scx/bpf_arena_common.bpf.h>
19
20 #include "scx_sdt.h"
21
22 char _license[] SEC("license") = "GPL";
23
24 UEI_DEFINE(uei);
25
/*
 * Backing arena for all allocator metadata and per-task data. The size and
 * base address of the mmap() region are smaller on arm64 than on other
 * architectures.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
	__uint(max_entries, 1 << 16); /* number of pages */
	__ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
	__uint(max_entries, 1 << 20); /* number of pages */
	__ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");
37
38 #define SHARED_DSQ 0
39
/*
 * For each metric, define a helper that increments the counter in a task's
 * arena-resident scx_stats, plus a global __u64 counter the per-task values
 * are folded into (see scx_stat_global_update()).
 */
#define DEFINE_SDT_STAT(metric)					\
static inline void						\
stat_inc_##metric(struct scx_stats __arena *stats)		\
{								\
	cast_kern(stats);					\
	stats->metric += 1;					\
}								\
__u64 stat_##metric;						\

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);
54
/*
 * Necessary for cond_break/can_loop's semantics. According to kernel commit
 * 011832b, the loop counter variable must be seen as imprecise and bounded
 * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
 * makes it precise and prevents may_goto from helping with converging the
 * loop. For these loops we must initialize the loop counter from a variable
 * whose value the verifier cannot reason about when checking the program, so
 * that the loop counter's value is imprecise.
 */
static __u64 zero = 0;
65
66 /*
67 * XXX Hack to get the verifier to find the arena for sdt_exit_task.
68 * As of 6.12-rc5, The verifier associates arenas with programs by
69 * checking LD.IMM instruction operands for an arena and populating
70 * the program state with the first instance it finds. This requires
71 * accessing our global arena variable, but scx methods do not necessarily
72 * do so while still using pointers from that arena. Insert a bpf_printk
73 * statement that triggers at most once to generate an LD.IMM instruction
74 * to access the arena and help the verifier.
75 */
/* Set after the bpf_printk below fires; keeps the hack to a single trigger. */
static volatile bool scx_arena_verify_once;

/* Touch the arena map once so the verifier associates it with this program. */
__hidden void scx_arena_subprog_init(void)
{
	if (scx_arena_verify_once)
		return;

	bpf_printk("%s: arena pointer %p", __func__, &arena);
	scx_arena_verify_once = true;
}
86
87
/* Serializes allocator index and pool mutation. */
private(LOCK) struct bpf_spin_lock alloc_lock;
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;

/* Allocation pools backing the index descriptors and chunks. */
struct sdt_pool desc_pool;
struct sdt_pool chunk_pool;

/* Protected by alloc_lock. */
struct scx_alloc_stats alloc_stats;
97
98
99 /* Allocate element from the pool. Must be called with a then pool lock held. */
100 static
scx_alloc_from_pool(struct sdt_pool * pool)101 void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
102 {
103 __u64 elem_size, max_elems;
104 void __arena *slab;
105 void __arena *ptr;
106
107 elem_size = pool->elem_size;
108 max_elems = pool->max_elems;
109
110 /* If the chunk is spent, get a new one. */
111 if (pool->idx >= max_elems) {
112 slab = bpf_arena_alloc_pages(&arena, NULL,
113 div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
114 if (!slab)
115 return NULL;
116
117 pool->slab = slab;
118 pool->idx = 0;
119 }
120
121 ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
122 pool->idx += 1;
123
124 return ptr;
125 }
126
127 /* Alloc desc and associated chunk. Called with the allocator spinlock held. */
scx_alloc_chunk(void)128 static sdt_desc_t *scx_alloc_chunk(void)
129 {
130 struct sdt_chunk __arena *chunk;
131 sdt_desc_t *desc;
132 sdt_desc_t *out;
133
134 chunk = scx_alloc_from_pool(&chunk_pool);
135 if (!chunk)
136 return NULL;
137
138 desc = scx_alloc_from_pool(&desc_pool);
139 if (!desc) {
140 /*
141 * Effectively frees the previous chunk allocation.
142 * Index cannot be 0, so decrementing is always
143 * valid.
144 */
145 chunk_pool.idx -= 1;
146 return NULL;
147 }
148
149 out = desc;
150
151 desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
152 desc->chunk = chunk;
153
154 alloc_stats.chunk_allocs += 1;
155
156 return out;
157 }
158
/*
 * Configure @pool to hand out @data_size-byte elements from slabs of
 * @nr_pages pages. The element size must be word aligned and at least one
 * backing page is required; returns -EINVAL otherwise.
 */
static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
{
	if (unlikely(data_size % 8 != 0 || nr_pages == 0))
		return -EINVAL;

	pool->elem_size = data_size;
	pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;

	/*
	 * Start out "exhausted" so the first scx_alloc_from_pool() call
	 * populates the slab lazily.
	 */
	pool->idx = pool->max_elems;

	return 0;
}
174
/* Initialize both the base pool allocators and the root chunk of the index. */
__hidden int
scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
{
	size_t min_chunk_size;
	int ret;

	_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
		"chunk size must fit into a page");

	/* Metadata pools use single-page slabs. */
	ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
	if (ret != 0)
		return ret;

	ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
	if (ret != 0)
		return ret;

	/* Wrap data into a descriptor and word align. */
	data_size += sizeof(struct sdt_data);
	data_size = round_up(data_size, 8);

	/*
	 * Ensure we allocate large enough slabs from the arena to avoid
	 * excessive internal fragmentation when carving elements out of them.
	 */
	min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
	ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
	if (ret != 0)
		return ret;

	/* The root chunk anchors the multi-level index. */
	bpf_spin_lock(&alloc_lock);
	alloc->root = scx_alloc_chunk();
	bpf_spin_unlock(&alloc_lock);
	if (!alloc->root)
		return -ENOMEM;

	return 0;
}
214
215 static
set_idx_state(sdt_desc_t * desc,__u64 pos,bool state)216 int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
217 {
218 __u64 __arena *allocated = desc->allocated;
219 __u64 bit;
220
221 if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
222 return -EINVAL;
223
224 bit = (__u64)1 << (pos % 64);
225
226 if (state)
227 allocated[pos / 64] |= bit;
228 else
229 allocated[pos / 64] &= ~bit;
230
231 return 0;
232 }
233
/*
 * Walk from the leaf level towards the root, clearing the allocation bit
 * at each level and bumping the per-descriptor free count. Propagation
 * stops at the first ancestor that already had a free slot, since higher
 * levels already advertise availability.
 */
static __noinline
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
{
	sdt_desc_t *desc;
	__u64 u, level;
	int ret;

	/* Counter starts from the `zero` variable to stay verifier-imprecise. */
	for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
		level = SDT_TASK_LEVELS - 1 - u;

		/* Only propagate upwards if we are the parent's only free chunk. */
		desc = lv_desc[level];

		ret = set_idx_state(desc, lv_pos[level], false);
		if (unlikely(ret != 0))
			return ret;

		desc->nr_free += 1;
		if (desc->nr_free > 1)
			return 0;
	}

	return 0;
}
258
/*
 * Free the allocated struct with the given index. Called with the
 * allocator lock taken. The index encodes the slot position at every
 * level of the chunk tree, SDT_TASK_ENTS_PER_PAGE_SHIFT bits per level.
 */
__hidden
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
{
	const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	struct sdt_data __arena *data;
	__u64 level, shift, pos;
	__u64 lv_pos[SDT_TASK_LEVELS];
	int ret;
	int i;

	if (!alloc)
		return 0;

	desc = alloc->root;
	if (unlikely(!desc))
		return -EINVAL;

	/* To appease the verifier. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		lv_desc[level] = NULL;
		lv_pos[level] = 0;
	}

	/* Find the leaf node containing the index, recording the path down. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
		pos = (idx >> shift) & mask;

		lv_desc[level] = desc;
		lv_pos[level] = pos;

		if (level == SDT_TASK_LEVELS - 1)
			break;

		chunk = desc->chunk;

		desc_children = (sdt_desc_t * __arena *)chunk->descs;
		desc = desc_children[pos];

		if (unlikely(!desc))
			return -EINVAL;
	}

	chunk = desc->chunk;

	pos = idx & mask;
	data = chunk->data[pos];
	if (likely(data)) {
		/*
		 * Bump the ID's generation field and reset the rest of the
		 * header; the data area itself is kept for reuse.
		 */
		*data = (struct sdt_data) {
			.tid.genn = data->tid.genn + 1,
		};

		/* Zero out one word at a time. */
		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
			data->payload[i] = 0;
		}
	}

	/* Mark the slot free at every level of the recorded path. */
	ret = mark_nodes_avail(lv_desc, lv_pos);
	if (unlikely(ret != 0))
		return ret;

	alloc_stats.active_allocs -= 1;
	alloc_stats.free_ops += 1;

	return 0;
}
334
/*
 * Index of the least significant set bit of @word (0-based), computed by
 * halving the search window from 32 bits down to 1. NB: unlike POSIX
 * ffs(), this returns 0 for an input of 1 and, degenerately, 63 when
 * @word is 0, so callers must not pass 0.
 */
static inline
int ffs(__u64 word)
{
	unsigned int num = 0;
	unsigned int shift;

	for (shift = 32; shift != 0; shift >>= 1) {
		__u64 low_mask = ((__u64)1 << shift) - 1;

		if ((word & low_mask) == 0) {
			num += shift;
			word >>= shift;
		}
	}

	return num;
}
372
373
374 /* find the first empty slot */
375 __hidden
chunk_find_empty(sdt_desc_t __arg_arena * desc)376 __u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
377 {
378 __u64 freeslots;
379 __u64 i;
380
381 for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
382 freeslots = ~desc->allocated[i];
383 if (freeslots == (__u64)0)
384 continue;
385
386 return (i * 64) + ffs(freeslots);
387 }
388
389 return SDT_TASK_ENTS_PER_CHUNK;
390 }
391
/*
 * Find and return an available idx on the allocator, allocating internal
 * index nodes on the way down as needed. Returns the leaf descriptor and
 * stores the composed index in *idxp; NULL on failure.
 * Called with the allocator spinlock held.
 */
static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *tmp;
	__u64 lv_pos[SDT_TASK_LEVELS];
	__u64 u, pos, level;
	__u64 idx = 0;
	int ret;

	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		pos = chunk_find_empty(desc);

		/* If we error out, something has gone very wrong. */
		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
			return NULL;

		/* Chunk full: no free slot anywhere under this subtree. */
		if (pos == SDT_TASK_ENTS_PER_CHUNK)
			return NULL;

		/* Compose the index one level's worth of bits at a time. */
		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
		idx |= pos;

		/* Log the levels to complete allocation. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		/* The rest of the loop is for internal node traversal. */
		if (level == SDT_TASK_LEVELS - 1)
			break;

		/* Allocate an internal node if necessary. */
		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;

		desc = desc_children[pos];
		if (!desc) {
			desc = scx_alloc_chunk();
			if (!desc)
				return NULL;

			desc_children[pos] = desc;
		}
	}

	/*
	 * Finding the descriptor along with any internal node
	 * allocations was successful. Update all levels with
	 * the new allocation.
	 */
	bpf_for(u, 0, SDT_TASK_LEVELS) {
		level = SDT_TASK_LEVELS - 1 - u;
		tmp = lv_desc[level];

		ret = set_idx_state(tmp, lv_pos[level], true);
		/*
		 * NOTE(review): a set_idx_state() failure here is swallowed
		 * and allocation proceeds — confirm that is intended.
		 */
		if (ret != 0)
			break;

		tmp->nr_free -= 1;
		/* Stop once an ancestor still advertises free slots. */
		if (tmp->nr_free > 0)
			break;
	}

	*idxp = idx;

	return desc;
}
464
/*
 * Allocate an element from @alloc: find a free index, lazily populate the
 * leaf data slot from the element pool, and stamp the element with its
 * index. Returns the arena pointer to the element or NULL on failure.
 */
__hidden
void __arena *scx_alloc(struct scx_allocator *alloc)
{
	struct sdt_data __arena *data = NULL;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	__u64 idx, pos;

	if (!alloc)
		return NULL;

	bpf_spin_lock(&alloc_lock);

	/* We unlock if we encounter an error in the function. */
	desc = desc_find_empty(alloc->root, &idx);
	if (unlikely(desc == NULL)) {
		bpf_spin_unlock(&alloc_lock);
		return NULL;
	}

	chunk = desc->chunk;

	/* Populate the leaf node if necessary. */
	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
	data = chunk->data[pos];
	if (!data) {
		data = scx_alloc_from_pool(&alloc->pool);
		if (!data) {
			/* Roll back the index reservation made above. */
			scx_alloc_free_idx(alloc, idx);
			bpf_spin_unlock(&alloc_lock);
			return NULL;
		}
	}

	/* Cache the element so the slot can be reused without reallocating. */
	chunk->data[pos] = data;

	/* The data counts as a chunk */
	alloc_stats.data_allocs += 1;
	alloc_stats.alloc_ops += 1;
	alloc_stats.active_allocs += 1;

	data->tid.idx = idx;

	bpf_spin_unlock(&alloc_lock);

	return data;
}
512
/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
	union sdt_id tid;
	__u64 tptr;
	struct sdt_data __arena *data;
};

/* Task-local storage: one scx_task_map_val per task in the scheduler. */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");

/* Allocator instance handing out the per-task arena data areas. */
static struct scx_allocator scx_task_allocator;
531
532 __hidden
scx_task_alloc(struct task_struct * p)533 void __arena *scx_task_alloc(struct task_struct *p)
534 {
535 struct sdt_data __arena *data = NULL;
536 struct scx_task_map_val *mval;
537
538 mval = bpf_task_storage_get(&scx_task_map, p, 0,
539 BPF_LOCAL_STORAGE_GET_F_CREATE);
540 if (!mval)
541 return NULL;
542
543 data = scx_alloc(&scx_task_allocator);
544 if (unlikely(!data))
545 return NULL;
546
547 mval->tid = data->tid;
548 mval->tptr = (__u64) p;
549 mval->data = data;
550
551 return (void __arena *)data->payload;
552 }
553
/* Initialize the per-task allocator with @data_size bytes of payload. */
__hidden
int scx_task_init(__u64 data_size)
{
	return scx_alloc_init(&scx_task_allocator, data_size);
}
559
560 __hidden
scx_task_data(struct task_struct * p)561 void __arena *scx_task_data(struct task_struct *p)
562 {
563 struct sdt_data __arena *data;
564 struct scx_task_map_val *mval;
565
566 scx_arena_subprog_init();
567
568 mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
569 if (!mval)
570 return NULL;
571
572 data = mval->data;
573
574 return (void __arena *)data->payload;
575 }
576
/*
 * Release the arena allocation attached to task @p and delete its
 * task-local storage entry. No-op if the task has no storage entry.
 */
__hidden
void scx_task_free(struct task_struct *p)
{
	struct scx_task_map_val *mval;

	/* Ensure the verifier associates this program with the arena. */
	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return;

	bpf_spin_lock(&alloc_lock);
	scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
	bpf_spin_unlock(&alloc_lock);

	bpf_task_storage_delete(&scx_task_map, p);
}
594
/* Fold a task's arena-resident counters into the global stat counters. */
static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
	cast_kern(stats);
	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
	__sync_fetch_and_add(&stat_init, stats->init);
	__sync_fetch_and_add(&stat_exit, stats->exit);
	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}
605
BPF_STRUCT_OPS(sdt_select_cpu,struct task_struct * p,s32 prev_cpu,u64 wake_flags)606 s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
607 {
608 struct scx_stats __arena *stats;
609 bool is_idle = false;
610 s32 cpu;
611
612 stats = scx_task_data(p);
613 if (!stats) {
614 scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
615 return 0;
616 }
617
618 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
619 if (is_idle) {
620 stat_inc_select_idle_cpu(stats);
621 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
622 } else {
623 stat_inc_select_busy_cpu(stats);
624 }
625
626 return cpu;
627 }
628
/* Count the enqueue and push the task onto the shared DSQ. */
void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct scx_stats __arena *stats = scx_task_data(p);

	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_enqueue(stats);
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}
643
/* Refill this CPU's local DSQ from the shared queue. */
void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}
648
BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task,struct task_struct * p,struct scx_init_task_args * args)649 s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
650 struct scx_init_task_args *args)
651 {
652 struct scx_stats __arena *stats;
653
654 stats = scx_task_alloc(p);
655 if (!stats) {
656 scx_bpf_error("arena allocator out of memory");
657 return -ENOMEM;
658 }
659
660 stats->pid = p->pid;
661
662 stat_inc_init(stats);
663
664 return 0;
665 }
666
/*
 * Fold the departing task's counters into the global totals, then free
 * its arena allocation and storage entry.
 */
void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
		    struct scx_exit_task_args *args)
{
	struct scx_stats __arena *stats = scx_task_data(p);

	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_exit(stats);
	scx_stat_global_update(stats);

	scx_task_free(p);
}
683
BPF_STRUCT_OPS_SLEEPABLE(sdt_init)684 s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
685 {
686 int ret;
687
688 ret = scx_task_init(sizeof(struct scx_stats));
689 if (ret < 0) {
690 scx_bpf_error("%s: failed with %d", __func__, ret);
691 return ret;
692 }
693
694 ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
695 if (ret) {
696 scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
697 return ret;
698 }
699
700 return 0;
701 }
702
/* Record exit info so userspace can report why the scheduler unloaded. */
void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
707
/* sched_ext operations table wiring the callbacks above into the "sdt" scheduler. */
SCX_OPS_DEFINE(sdt_ops,
	       .select_cpu = (void *)sdt_select_cpu,
	       .enqueue = (void *)sdt_enqueue,
	       .dispatch = (void *)sdt_dispatch,
	       .init_task = (void *)sdt_init_task,
	       .exit_task = (void *)sdt_exit_task,
	       .init = (void *)sdt_init,
	       .exit = (void *)sdt_exit,
	       .name = "sdt");
717