/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
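 * Concretely, a FIFO's per-pass dispatch budget is 1 << index, so a single
 * pass can dispatch up to 16 tasks from queue4.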
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

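/*
 * SHARED_DSQ and HIGHPRI_DSQ are IDs for the two custom DSQs created in
 * qmap_init(); the remaining entries are plain constants.
 */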
enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
	HIGHPRI_DSQ	= 1,
	HIGHPRI_WEIGHT	= 8668, /* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

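/*
 * Tunables below are expected to be filled in by the userspace loader before
 * the skeleton is loaded (e.g. skel->rodata->slice_ns = ...); const volatile
 * lets the verifier treat them as load-time constants.
 */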
const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_shared_dsq;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

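/*
 * Array-of-maps indirection so that a FIFO can be looked up by its queue index
 * at runtime (see the weight_to_idx() users below).
 */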
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set from the queue index using the
 * following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool force_local;	/* Dispatch directly to local_dsq */
	bool highpri;
	u64 core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

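/* Per-CPU scheduling context, kept in a single-slot per-CPU array */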
struct cpu_ctx {
	u64 dsp_idx;	/* dispatch index */
	u64 dsp_cnt;	/* remaining count */
	u32 avg_weight;
	u32 cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

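/*
 * Prefer @prev_cpu if the task can only run there or @prev_cpu is idle;
 * otherwise try to claim any idle CPU. Returns -1 if nothing suitable is found.
 */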
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

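/*
 * Queue @p on the FIFO selected by its weight. Tasks flagged by
 * qmap_select_cpu() or picked for direct dispatch here skip the FIFOs.
 */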
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() told us to dispatch directly, enqueue on the
	 * local DSQ.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
 * from non-rq-lock-holding BPF programs. As a demonstration, this function is
 * called from both qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			__COMPAT_scx_bpf_dsq_move_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			__COMPAT_scx_bpf_dsq_move_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			__COMPAT_scx_bpf_dsq_move_vtime(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

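	/*
	 * Round-robin over the five FIFOs. Each visit to FIFO idx dispatches
	 * up to 1 << idx tasks before moving on to the next one.
	 */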
	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

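/*
 * Invoked periodically for the running task; used here to steer the CPU's
 * cpuperf target from a running average of the observed weights.
 */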
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles: the queue with
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

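/*
 * The dump callbacks below contribute scheduler-specific state to the debug
 * dump that sched_ext generates, e.g. when the scheduler is aborted.
 */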
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			scx_bpf_dump(" %d", pid);
		}
		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

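/* The cgroup callbacks below only log their arguments for demonstration. */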
s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
		   cgrp->kn->id, args->weight, args->bw_period_us,
		   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
		   period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	bpf_printk("CPU %d coming online", cpu);
	/* @cpu is already online at this point */
	print_cpus();
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	bpf_printk("CPU %d going offline", cpu);
	/* @cpu is still online at this point */
	print_cpus();
}

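/* Single-slot array holding the bpf_timer that drives monitor_timerfn() */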
struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	struct scx_event_stats events;

	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_shared_dsq)
		dump_shared_dsq();

	__COMPAT_scx_bpf_events(&events, sizeof(events));

	bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
		   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
		   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
		   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
	bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
		   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
	bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
		   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
		   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
		   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
		   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

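/* Create the shared and high-priority DSQs and start the periodic monitor timer */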
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

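/* Wire the callbacks into the qmap_ops struct_ops that userspace loads and attaches */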
SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu		= (void *)qmap_select_cpu,
	       .enqueue			= (void *)qmap_enqueue,
	       .dequeue			= (void *)qmap_dequeue,
	       .dispatch		= (void *)qmap_dispatch,
	       .tick			= (void *)qmap_tick,
	       .core_sched_before	= (void *)qmap_core_sched_before,
	       .cpu_release		= (void *)qmap_cpu_release,
	       .init_task		= (void *)qmap_init_task,
	       .dump			= (void *)qmap_dump,
	       .dump_cpu		= (void *)qmap_dump_cpu,
	       .dump_task		= (void *)qmap_dump_task,
	       .cgroup_init		= (void *)qmap_cgroup_init,
	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online		= (void *)qmap_cpu_online,
	       .cpu_offline		= (void *)qmap_cpu_offline,
	       .init			= (void *)qmap_init,
	       .exit			= (void *)qmap_exit,
	       .timeout_ms		= 5000U,
	       .name			= "qmap");