1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3 * A simple five-level FIFO queue scheduler.
4 *
5 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
6 * assigned to one depending on its compound weight. Each CPU round robins
7 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
8 * queue0, 2 from queue1, 4 from queue2 and so on.
9 *
10 * This scheduler demonstrates:
11 *
12 * - BPF-side queueing using PIDs.
13 * - Sleepable per-task storage allocation using ops.prep_enable().
14 * - Core-sched support.
15 *
16 * This scheduler is primarily for demonstration and testing of sched_ext
17 * features and unlikely to be useful for actual workloads.
18 *
19 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
20 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
21 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
22 */
23 #include <scx/common.bpf.h>
24
/*
 * Compile-time constants. SHARED_DSQ, HIGHPRI_DSQ and LOWPRI_DSQ are the
 * custom DSQ IDs created in qmap_init().
 */
enum consts {
	ONE_SEC_IN_NS = 1000000000,
	ONE_MSEC_IN_NS = 1000000,
	LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS,	/* lowpri_timerfn() period */
	SHARED_DSQ = 0,		/* main dispatch / overflow DSQ */
	HIGHPRI_DSQ = 1,	/* expedited tasks, see dispatch_highpri() */
	LOWPRI_DSQ = 2,		/* weight < 2 user tasks, drained by lowpri_timerfn() */
	HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
};
34
char _license[] SEC("license") = "GPL";

/*
 * Tunables filled in by the userspace loader before attach; "const volatile"
 * makes them constants from the verifier's perspective after load.
 */
const volatile u64 slice_ns;		/* slice length used for DSQ insertions */
const volatile u32 stall_user_nth;	/* silently drop every Nth user-task enqueue */
const volatile u32 stall_kernel_nth;	/* silently drop every Nth kthread enqueue */
const volatile u32 dsp_inf_loop_after;	/* after N dispatches, loop on kthreadd; see qmap_dispatch() */
const volatile u32 dsp_batch;		/* max tasks dispatched per qmap_dispatch() call */
const volatile bool highpri_boosting;	/* enable HIGHPRI_DSQ expediting; see dispatch_highpri() */
const volatile bool print_dsqs_and_events; /* verbose monitor_timerfn() output */
const volatile bool print_msgs;		/* bpf_printk() on cgroup/CPU events */
const volatile u64 sub_cgroup_id;	/* non-zero when running as a sub-scheduler */
const volatile s32 disallow_tgid;	/* tgid rejected in qmap_init_task() */
const volatile bool suppress_dump;	/* skip the qmap_dump*() handlers */
const volatile bool always_enq_immed;	/* see pick_direct_dispatch_cpu() */
const volatile u32 immed_stress_nth;	/* IMMED stress-test period, see qmap_enqueue() */

u64 nr_highpri_queued;	/* highpri tasks currently queued; see qmap_enqueue()/qmap_dispatch() */
u32 test_error_cnt;	/* when set, trigger scx_bpf_error() after N enqueues */

#define MAX_SUB_SCHEDS 8
u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];	/* attached sub-scheduler cgroup IDs */

UEI_DEFINE(uei);	/* exit info recorded in qmap_exit() */
58
/*
 * Five FIFOs of PIDs (pushed in qmap_enqueue(), popped in qmap_dispatch())
 * plus dump_store, a scratch queue used by qmap_dump() to iterate
 * non-destructively.
 */
struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
queue1 SEC(".maps"),
queue2 SEC(".maps"),
queue3 SEC(".maps"),
queue4 SEC(".maps"),
dump_store SEC(".maps");

/* Map-in-map so the FIFOs can be indexed by the weight-derived queue index. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};
84
/*
 * CPU performance target for each queue index. qmap_tick() maps the CPU's
 * running average task weight to a queue index and requests the matching
 * cpuperf level.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};
96
/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a
 * task's seq and the associated queue's head seq is called the queue distance
 * and used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool force_local;	/* Dispatch directly to local_dsq */
	bool highpri;		/* weight >= HIGHPRI_WEIGHT at enqueue; see dispatch_highpri() */
	u64 core_sched_seq;	/* per-queue tail seq taken at enqueue time */
};
114
/* Task-local storage backing struct task_ctx; created in qmap_init_task(). */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/* Per-CPU dispatch state; see qmap_dispatch() and qmap_tick(). */
struct cpu_ctx {
	u64 dsp_idx;	/* dispatch index */
	u64 dsp_cnt;	/* remaining count */
	u32 avg_weight;		/* running average of tick'd task weights */
	u32 cpuperf_target;	/* last cpuperf level requested for this CPU */
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");
135
/* Statistics — bumped with atomic adds; presumably read by the userspace loader */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;			/* observed levels, see monitor_cpuperf() */
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;	/* requested targets */
142
/*
 * Pick a CPU for direct dispatch: @prev_cpu if it's idle, otherwise any idle
 * CPU @p is allowed to run on. Returns -1 when no idle CPU is available.
 */
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 idle_cpu;

	/* a pinned task has nowhere else to go unless IMMED stress-testing */
	if (p->nr_cpus_allowed == 1 && !always_enq_immed)
		return prev_cpu;

	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	idle_cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	return idle_cpu >= 0 ? idle_cpu : -1;
}
159
lookup_task_ctx(struct task_struct * p)160 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
161 {
162 struct task_ctx *tctx;
163
164 if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
165 scx_bpf_error("task_ctx lookup failed");
166 return NULL;
167 }
168 return tctx;
169 }
170
BPF_STRUCT_OPS(qmap_select_cpu,struct task_struct * p,s32 prev_cpu,u64 wake_flags)171 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
172 s32 prev_cpu, u64 wake_flags)
173 {
174 struct task_ctx *tctx;
175 s32 cpu;
176
177 if (!(tctx = lookup_task_ctx(p)))
178 return -ESRCH;
179
180 if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
181 return prev_cpu;
182
183 cpu = pick_direct_dispatch_cpu(p, prev_cpu);
184
185 if (cpu >= 0) {
186 tctx->force_local = true;
187 return cpu;
188 } else {
189 return prev_cpu;
190 }
191 }
192
/* Coarsely map a compound weight to one of the five FIFO indices. */
static int weight_to_idx(u32 weight)
{
	if (weight > 50) {
		if (weight >= 400)
			return 4;
		return weight >= 200 ? 3 : 2;
	}
	return weight > 25 ? 1 : 0;
}
207
/*
 * ops.enqueue(): route @p to a local DSQ, LOWPRI_DSQ, SHARED_DSQ or one of
 * the five PID FIFOs depending on enqueue flags, weight and the test knobs.
 * Also implements the stall and error-injection test hooks.
 */
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (enq_flags & SCX_ENQ_REENQ) {
		__sync_fetch_and_add(&nr_reenqueued, 1);
		if (scx_bpf_task_cpu(p) == 0)
			__sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
	}

	/* stall testing: drop every Nth enqueue so the task is never queued */
	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	/* error injection: fire once when the countdown reaches zero */
	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
	 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
	 * and exercise the kernel IMMED reenqueue trigger paths.
	 */
	if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
		static u32 immed_stress_cnt;

		if (!(++immed_stress_cnt % immed_stress_nth)) {
			tctx->force_local = false;
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
					   slice_ns, enq_flags);
			return;
		}
	}

	/*
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* see lowpri_timerfn() */
	if (__COMPAT_has_generic_reenq() &&
	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}
318
319 /*
320 * The BPF queue map doesn't support removal and sched_ext can handle spurious
321 * dispatches. qmap_dequeue() is only used to collect statistics.
322 */
BPF_STRUCT_OPS(qmap_dequeue,struct task_struct * p,u64 deq_flags)323 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
324 {
325 __sync_fetch_and_add(&nr_dequeued, 1);
326 if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
327 __sync_fetch_and_add(&nr_core_sched_execed, 1);
328 }
329
update_core_sched_head_seq(struct task_struct * p)330 static void update_core_sched_head_seq(struct task_struct *p)
331 {
332 int idx = weight_to_idx(p->scx.weight);
333 struct task_ctx *tctx;
334
335 if ((tctx = lookup_task_ctx(p)))
336 core_sched_head_seqs[idx] = tctx->core_sched_seq;
337 }
338
/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
 * non-rq-lock holding BPF programs. As demonstration, this function is called
 * from qmap_dispatch() and monitor_timerfn().
 *
 * Returns true iff a task was moved onto this CPU's local DSQ.
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		/*
		 * NOTE(review): assumes scx_bpf_pick_any_cpu() succeeds for a
		 * task's own cpumask; a negative return would make the
		 * dsq_move below fail and be counted as nr_expedited_lost.
		 */
		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
				     SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}
403
/*
 * ops.dispatch(): consume work in priority order — expedited highpri tasks,
 * then SHARED_DSQ, then the five FIFOs round-robin (2^idx tasks from queue
 * idx), then any attached sub-schedulers. When nothing is runnable, keep
 * @prev's core_sched_seq coherent as if it were re-enqueued and dispatched.
 */
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/* at most one full rotation through the five FIFOs per invocation */
	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			/* the task may have exited since it was queued */
			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);

			/*
			 * scx_qmap uses a global BPF queue that any CPU's
			 * dispatch can pop from. If this CPU popped a task that
			 * can't run here, it gets stranded on SHARED_DSQ after
			 * consume_dispatch_q() skips it. Kick the task's home
			 * CPU so it drains SHARED_DSQ.
			 *
			 * There's a race between the pop and the flush of the
			 * buffered dsq_insert:
			 *
			 *   CPU 0 (dispatching)     CPU 1 (home, idle)
			 *   ~~~~~~~~~~~~~~~~~~~     ~~~~~~~~~~~~~~~~~~~
			 *   pop from BPF queue
			 *   dsq_insert(buffered)
			 *                           balance:
			 *                             SHARED_DSQ empty
			 *                             BPF queue empty
			 *                             -> goes idle
			 *   flush -> on SHARED
			 *   kick CPU 1
			 *                           wakes, drains task
			 *
			 * The kick prevents indefinite stalls but a per-CPU
			 * kthread like ksoftirqd can be briefly stranded when
			 * its home CPU enters idle with softirq pending,
			 * triggering:
			 *
			 *   "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
			 *
			 * from report_idle_softirq(). The kick lands shortly
			 * after and the home CPU drains the task. This could be
			 * avoided by e.g. dispatching pinned tasks to local or
			 * global DSQs, but the current code is left as-is to
			 * document this class of issue -- other schedulers
			 * seeing similar warnings can use this as a reference.
			 */
			if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
				scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);

			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/* nothing in our queues; give attached sub-schedulers a chance */
	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (sub_sched_cgroup_ids[i] &&
		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
			return;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}
552
/*
 * ops.tick(): steer the CPU's performance level from a running average of the
 * weights of the tasks seen on it.
 */
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero);

	if (!cpuc) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	cpuc->cpuperf_target =
		qidx_to_cpuperf_target[weight_to_idx(cpuc->avg_weight)];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}
575
/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As queue index increments, the priority doubles. The queue w/ index 3
	 * is dispatched twice more frequently than 2. Reflect the difference by
	 * scaling qdists accordingly. Note that the shift amount needs to be
	 * flipped depending on the sign to avoid flipping priority direction.
	 *
	 * (Left-shifting a negative value is technically UB in standard C;
	 * this relies on the two's-complement behavior of the BPF target.)
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}
605
606 /*
607 * This is called to determine the task ordering when core-sched is picking
608 * tasks to execute on SMT siblings and should encode about the same ordering as
609 * the regular scheduling path. Use the priority-scaled distances from the head
610 * of the queues to compare the two tasks which should be consistent with the
611 * dispatch path behavior.
612 */
BPF_STRUCT_OPS(qmap_core_sched_before,struct task_struct * a,struct task_struct * b)613 bool BPF_STRUCT_OPS(qmap_core_sched_before,
614 struct task_struct *a, struct task_struct *b)
615 {
616 return task_qdist(a) > task_qdist(b);
617 }
618
619 /*
620 * sched_switch tracepoint and cpu_release handlers are no longer needed.
621 * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
622 * tasks when a higher-priority scheduling class takes the CPU.
623 */
624
BPF_STRUCT_OPS(qmap_init_task,struct task_struct * p,struct scx_init_task_args * args)625 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
626 struct scx_init_task_args *args)
627 {
628 if (p->tgid == disallow_tgid)
629 p->scx.disallow = true;
630
631 /*
632 * @p is new. Let's ensure that its task_ctx is available. We can sleep
633 * in this function and the following will automatically use GFP_KERNEL.
634 */
635 if (bpf_task_storage_get(&task_ctx_stor, p, 0,
636 BPF_LOCAL_STORAGE_GET_F_CREATE))
637 return 0;
638 else
639 return -ENOMEM;
640 }
641
/* ops.dump(): print the PIDs queued on each of the five FIFOs. */
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		/* push everything back onto the FIFO */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}
679
/* ops.dump_cpu(): dump the per-CPU dispatch cursor and cpuperf state. */
void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;

	if (suppress_dump || idle)
		return;

	cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu);
	if (!cpuc)
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}
694
/* ops.dump_task(): dump @p's task_ctx, if present. */
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *tctx;

	if (suppress_dump)
		return;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx)
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     tctx->force_local, tctx->core_sched_seq);
}
707
BPF_STRUCT_OPS(qmap_cgroup_init,struct cgroup * cgrp,struct scx_cgroup_init_args * args)708 s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
709 {
710 if (print_msgs)
711 bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
712 cgrp->kn->id, args->weight, args->bw_period_us,
713 args->bw_quota_us, args->bw_burst_us);
714 return 0;
715 }
716
/* ops.cgroup_set_weight(): log weight changes when verbose. */
void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (!print_msgs)
		return;

	bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}
722
/* ops.cgroup_set_bandwidth(): log bandwidth changes when verbose. */
void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (!print_msgs)
		return;

	bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
		   cgrp->kn->id, period_us, quota_us, burst_us);
}
730
/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 *
 * Each CPU is rendered as 'O' (online), 'X' (possible but offline) or ' ',
 * with a '|' separator after every 8 CPUs.
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		/* MEMBER_VPTR bounds-checks the write for the BPF verifier */
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	/* buf may have been filled to capacity without a terminator */
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}
769
/* ops.cpu_online(): log the hotplug event and the resulting CPU map. */
void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (!print_msgs)
		return;

	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
}
778
/* ops.cpu_offline(): log the hotplug event and the resulting CPU map. */
void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (!print_msgs)
		return;

	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
}
787
/* One-shot timer re-armed every second by monitor_timerfn(); see qmap_init(). */
struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");
798
/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 *
 * Results are published through the cpuperf_* globals. Assumes at least one
 * CPU is online so cap_sum and nr_online_cpus are non-zero.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}
859
860 /*
861 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
862 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
863 * see meaningful dumps in the trace pipe.
864 */
dump_shared_dsq(void)865 static void dump_shared_dsq(void)
866 {
867 struct task_struct *p;
868 s32 nr;
869
870 if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
871 return;
872
873 bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
874
875 bpf_rcu_read_lock();
876 bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
877 bpf_printk("%s[%d]", p->comm, p->pid);
878 bpf_rcu_read_unlock();
879 }
880
/*
 * 1Hz monitor: expedite highpri tasks from timer context, refresh the cpuperf
 * statistics, and optionally dump SHARED_DSQ and the scx event counters.
 * Always re-arms itself.
 */
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	/* demonstrate dsq_move from a non-rq-lock holder; see dispatch_highpri() */
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	/* re-arm for the next second */
	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}
917
/* Timer re-armed every LOWPRI_INTV_NS by lowpri_timerfn(); see qmap_init(). */
struct lowpri_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct lowpri_timer);
} lowpri_timer SEC(".maps");
928
929 /*
930 * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and
931 * the tasks are transferred to SHARED_DSQ.
932 */
lowpri_timerfn(void * map,int * key,struct bpf_timer * timer)933 static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
934 {
935 scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
936 bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
937 return 0;
938 }
939
BPF_STRUCT_OPS_SLEEPABLE(qmap_init)940 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
941 {
942 u32 key = 0;
943 struct bpf_timer *timer;
944 s32 ret;
945
946 if (print_msgs && !sub_cgroup_id)
947 print_cpus();
948
949 ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
950 if (ret) {
951 scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
952 return ret;
953 }
954
955 ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
956 if (ret) {
957 scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
958 return ret;
959 }
960
961 ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
962 if (ret)
963 return ret;
964
965 timer = bpf_map_lookup_elem(&monitor_timer, &key);
966 if (!timer)
967 return -ESRCH;
968 bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
969 bpf_timer_set_callback(timer, monitor_timerfn);
970 ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
971 if (ret)
972 return ret;
973
974 if (__COMPAT_has_generic_reenq()) {
975 /* see lowpri_timerfn() */
976 timer = bpf_map_lookup_elem(&lowpri_timer, &key);
977 if (!timer)
978 return -ESRCH;
979 bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
980 bpf_timer_set_callback(timer, lowpri_timerfn);
981 ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
982 if (ret)
983 return ret;
984 }
985
986 return 0;
987 }
988
/* ops.exit(): record exit info for the userspace side; see UEI_DEFINE(uei). */
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
993
BPF_STRUCT_OPS(qmap_sub_attach,struct scx_sub_attach_args * args)994 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
995 {
996 s32 i;
997
998 for (i = 0; i < MAX_SUB_SCHEDS; i++) {
999 if (!sub_sched_cgroup_ids[i]) {
1000 sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
1001 bpf_printk("attaching sub-sched[%d] on %s",
1002 i, args->cgroup_path);
1003 return 0;
1004 }
1005 }
1006
1007 return -ENOSPC;
1008 }
1009
/* ops.sub_detach(): clear the departing sub-scheduler's table slot, if any. */
void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
{
	u64 id = args->ops->sub_cgroup_id;
	s32 slot;

	for (slot = 0; slot < MAX_SUB_SCHEDS; slot++) {
		if (sub_sched_cgroup_ids[slot] != id)
			continue;

		sub_sched_cgroup_ids[slot] = 0;
		bpf_printk("detaching sub-sched[%d] on %s",
			   slot, args->cgroup_path);
		break;
	}
}
1023
/*
 * struct_ops table registering all handlers above with sched_ext.
 * timeout_ms is lowered to 5s, presumably so the induced stalls (see
 * stall_*_nth) trip the runnable-task watchdog quickly — confirm against
 * sched_ext ops documentation.
 */
SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cgroup_init		= (void *)qmap_cgroup_init,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .sub_attach		= (void *)qmap_sub_attach,
	       .sub_detach		= (void *)qmap_sub_detach,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");
1046