xref: /linux/tools/sched_ext/scx_qmap.bpf.c (revision 5bdb4078e1efba9650c03753616866192d680718)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * A simple five-level FIFO queue scheduler.
4  *
5  * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
6  * assigned to one depending on its compound weight. Each CPU round robins
7  * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
8  * queue0, 2 from queue1, 4 from queue2 and so on.
9  *
10  * This scheduler demonstrates:
11  *
12  * - BPF-side queueing using PIDs.
13  * - Sleepable per-task storage allocation using ops.prep_enable().
14  * - Core-sched support.
15  *
16  * This scheduler is primarily for demonstration and testing of sched_ext
17  * features and unlikely to be useful for actual workloads.
18  *
19  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
20  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
21  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
22  */
23 #include <scx/common.bpf.h>
24 
25 enum consts {
26 	ONE_SEC_IN_NS		= 1000000000,
27 	ONE_MSEC_IN_NS		= 1000000,
28 	LOWPRI_INTV_NS		= 10 * ONE_MSEC_IN_NS,
29 	SHARED_DSQ		= 0,
30 	HIGHPRI_DSQ		= 1,
31 	LOWPRI_DSQ		= 2,
32 	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
33 };
34 
35 char _license[] SEC("license") = "GPL";
36 
37 const volatile u64 slice_ns;
38 const volatile u32 stall_user_nth;
39 const volatile u32 stall_kernel_nth;
40 const volatile u32 dsp_inf_loop_after;
41 const volatile u32 dsp_batch;
42 const volatile bool highpri_boosting;
43 const volatile bool print_dsqs_and_events;
44 const volatile bool print_msgs;
45 const volatile u64 sub_cgroup_id;
46 const volatile s32 disallow_tgid;
47 const volatile bool suppress_dump;
48 const volatile bool always_enq_immed;
49 const volatile u32 immed_stress_nth;
50 
51 u64 nr_highpri_queued;
52 u32 test_error_cnt;
53 
54 #define MAX_SUB_SCHEDS		8
55 u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
56 
57 UEI_DEFINE(uei);
58 
59 struct qmap {
60 	__uint(type, BPF_MAP_TYPE_QUEUE);
61 	__uint(max_entries, 4096);
62 	__type(value, u32);
63 } queue0 SEC(".maps"),
64   queue1 SEC(".maps"),
65   queue2 SEC(".maps"),
66   queue3 SEC(".maps"),
67   queue4 SEC(".maps"),
68   dump_store SEC(".maps");
69 
70 struct {
71 	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
72 	__uint(max_entries, 5);
73 	__type(key, int);
74 	__array(values, struct qmap);
75 } queue_arr SEC(".maps") = {
76 	.values = {
77 		[0] = &queue0,
78 		[1] = &queue1,
79 		[2] = &queue2,
80 		[3] = &queue3,
81 		[4] = &queue4,
82 	},
83 };
84 
85 /*
86  * If enabled, CPU performance target is set according to the queue index
87  * according to the following table.
88  */
89 static const u32 qidx_to_cpuperf_target[] = {
90 	[0] = SCX_CPUPERF_ONE * 0 / 4,
91 	[1] = SCX_CPUPERF_ONE * 1 / 4,
92 	[2] = SCX_CPUPERF_ONE * 2 / 4,
93 	[3] = SCX_CPUPERF_ONE * 3 / 4,
94 	[4] = SCX_CPUPERF_ONE * 4 / 4,
95 };
96 
97 /*
98  * Per-queue sequence numbers to implement core-sched ordering.
99  *
100  * Tail seq is assigned to each queued task and incremented. Head seq tracks the
101  * sequence number of the latest dispatched task. The distance between the a
102  * task's seq and the associated queue's head seq is called the queue distance
103  * and used when comparing two tasks for ordering. See qmap_core_sched_before().
104  */
105 static u64 core_sched_head_seqs[5];
106 static u64 core_sched_tail_seqs[5];
107 
108 /* Per-task scheduling context */
109 struct task_ctx {
110 	bool	force_local;	/* Dispatch directly to local_dsq */
111 	bool	highpri;
112 	u64	core_sched_seq;
113 };
114 
115 struct {
116 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
117 	__uint(map_flags, BPF_F_NO_PREALLOC);
118 	__type(key, int);
119 	__type(value, struct task_ctx);
120 } task_ctx_stor SEC(".maps");
121 
122 struct cpu_ctx {
123 	u64	dsp_idx;	/* dispatch index */
124 	u64	dsp_cnt;	/* remaining count */
125 	u32	avg_weight;
126 	u32	cpuperf_target;
127 };
128 
129 struct {
130 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
131 	__uint(max_entries, 1);
132 	__type(key, u32);
133 	__type(value, struct cpu_ctx);
134 } cpu_ctx_stor SEC(".maps");
135 
136 /* Statistics */
137 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
138 u64 nr_core_sched_execed;
139 u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
140 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
141 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
142 
pick_direct_dispatch_cpu(struct task_struct * p,s32 prev_cpu)143 static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
144 {
145 	s32 cpu;
146 
147 	if (!always_enq_immed && p->nr_cpus_allowed == 1)
148 		return prev_cpu;
149 
150 	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
151 		return prev_cpu;
152 
153 	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
154 	if (cpu >= 0)
155 		return cpu;
156 
157 	return -1;
158 }
159 
lookup_task_ctx(struct task_struct * p)160 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
161 {
162 	struct task_ctx *tctx;
163 
164 	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
165 		scx_bpf_error("task_ctx lookup failed");
166 		return NULL;
167 	}
168 	return tctx;
169 }
170 
BPF_STRUCT_OPS(qmap_select_cpu,struct task_struct * p,s32 prev_cpu,u64 wake_flags)171 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
172 		   s32 prev_cpu, u64 wake_flags)
173 {
174 	struct task_ctx *tctx;
175 	s32 cpu;
176 
177 	if (!(tctx = lookup_task_ctx(p)))
178 		return -ESRCH;
179 
180 	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
181 		return prev_cpu;
182 
183 	cpu = pick_direct_dispatch_cpu(p, prev_cpu);
184 
185 	if (cpu >= 0) {
186 		tctx->force_local = true;
187 		return cpu;
188 	} else {
189 		return prev_cpu;
190 	}
191 }
192 
weight_to_idx(u32 weight)193 static int weight_to_idx(u32 weight)
194 {
195 	/* Coarsely map the compound weight to a FIFO. */
196 	if (weight <= 25)
197 		return 0;
198 	else if (weight <= 50)
199 		return 1;
200 	else if (weight < 200)
201 		return 2;
202 	else if (weight < 400)
203 		return 3;
204 	else
205 		return 4;
206 }
207 
BPF_STRUCT_OPS(qmap_enqueue,struct task_struct * p,u64 enq_flags)208 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
209 {
210 	static u32 user_cnt, kernel_cnt;
211 	struct task_ctx *tctx;
212 	u32 pid = p->pid;
213 	int idx = weight_to_idx(p->scx.weight);
214 	void *ring;
215 	s32 cpu;
216 
217 	if (enq_flags & SCX_ENQ_REENQ) {
218 		__sync_fetch_and_add(&nr_reenqueued, 1);
219 		if (scx_bpf_task_cpu(p) == 0)
220 			__sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
221 	}
222 
223 	if (p->flags & PF_KTHREAD) {
224 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
225 			return;
226 	} else {
227 		if (stall_user_nth && !(++user_cnt % stall_user_nth))
228 			return;
229 	}
230 
231 	if (test_error_cnt && !--test_error_cnt)
232 		scx_bpf_error("test triggering error");
233 
234 	if (!(tctx = lookup_task_ctx(p)))
235 		return;
236 
237 	/*
238 	 * All enqueued tasks must have their core_sched_seq updated for correct
239 	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
240 	 */
241 	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
242 
243 	/*
244 	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
245 	 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
246 	 * and exercise the kernel IMMED reenqueue trigger paths.
247 	 */
248 	if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
249 		static u32 immed_stress_cnt;
250 
251 		if (!(++immed_stress_cnt % immed_stress_nth)) {
252 			tctx->force_local = false;
253 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
254 					   slice_ns, enq_flags);
255 			return;
256 		}
257 	}
258 
259 	/*
260 	 * If qmap_select_cpu() is telling us to or this is the last runnable
261 	 * task on the CPU, enqueue locally.
262 	 */
263 	if (tctx->force_local) {
264 		tctx->force_local = false;
265 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
266 		return;
267 	}
268 
269 	/* see lowpri_timerfn() */
270 	if (__COMPAT_has_generic_reenq() &&
271 	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
272 		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
273 		return;
274 	}
275 
276 	/* if select_cpu() wasn't called, try direct dispatch */
277 	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
278 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
279 		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
280 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
281 		return;
282 	}
283 
284 	/*
285 	 * If the task was re-enqueued due to the CPU being preempted by a
286 	 * higher priority scheduling class, just re-enqueue the task directly
287 	 * on the global DSQ. As we want another CPU to pick it up, find and
288 	 * kick an idle CPU.
289 	 */
290 	if (enq_flags & SCX_ENQ_REENQ) {
291 		s32 cpu;
292 
293 		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
294 		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
295 		if (cpu >= 0)
296 			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
297 		return;
298 	}
299 
300 	ring = bpf_map_lookup_elem(&queue_arr, &idx);
301 	if (!ring) {
302 		scx_bpf_error("failed to find ring %d", idx);
303 		return;
304 	}
305 
306 	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
307 	if (bpf_map_push_elem(ring, &pid, 0)) {
308 		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
309 		return;
310 	}
311 
312 	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
313 		tctx->highpri = true;
314 		__sync_fetch_and_add(&nr_highpri_queued, 1);
315 	}
316 	__sync_fetch_and_add(&nr_enqueued, 1);
317 }
318 
319 /*
320  * The BPF queue map doesn't support removal and sched_ext can handle spurious
321  * dispatches. qmap_dequeue() is only used to collect statistics.
322  */
BPF_STRUCT_OPS(qmap_dequeue,struct task_struct * p,u64 deq_flags)323 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
324 {
325 	__sync_fetch_and_add(&nr_dequeued, 1);
326 	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
327 		__sync_fetch_and_add(&nr_core_sched_execed, 1);
328 }
329 
update_core_sched_head_seq(struct task_struct * p)330 static void update_core_sched_head_seq(struct task_struct *p)
331 {
332 	int idx = weight_to_idx(p->scx.weight);
333 	struct task_ctx *tctx;
334 
335 	if ((tctx = lookup_task_ctx(p)))
336 		core_sched_head_seqs[idx] = tctx->core_sched_seq;
337 }
338 
339 /*
340  * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective
341  * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
342  * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor
343  * difference only when dsp_batch is larger than 1.
344  *
345  * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
346  * non-rq-lock holding BPF programs. As demonstration, this function is called
347  * from qmap_dispatch() and monitor_timerfn().
348  */
dispatch_highpri(bool from_timer)349 static bool dispatch_highpri(bool from_timer)
350 {
351 	struct task_struct *p;
352 	s32 this_cpu = bpf_get_smp_processor_id();
353 
354 	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
355 	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
356 		static u64 highpri_seq;
357 		struct task_ctx *tctx;
358 
359 		if (!(tctx = lookup_task_ctx(p)))
360 			return false;
361 
362 		if (tctx->highpri) {
363 			/* exercise the set_*() and vtime interface too */
364 			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
365 			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
366 			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
367 		}
368 	}
369 
370 	/*
371 	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
372 	 * is found.
373 	 */
374 	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
375 		bool dispatched = false;
376 		s32 cpu;
377 
378 		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
379 			cpu = this_cpu;
380 		else
381 			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
382 
383 		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
384 				     SCX_ENQ_PREEMPT)) {
385 			if (cpu == this_cpu) {
386 				dispatched = true;
387 				__sync_fetch_and_add(&nr_expedited_local, 1);
388 			} else {
389 				__sync_fetch_and_add(&nr_expedited_remote, 1);
390 			}
391 			if (from_timer)
392 				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
393 		} else {
394 			__sync_fetch_and_add(&nr_expedited_lost, 1);
395 		}
396 
397 		if (dispatched)
398 			return true;
399 	}
400 
401 	return false;
402 }
403 
BPF_STRUCT_OPS(qmap_dispatch,s32 cpu,struct task_struct * prev)404 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
405 {
406 	struct task_struct *p;
407 	struct cpu_ctx *cpuc;
408 	struct task_ctx *tctx;
409 	u32 zero = 0, batch = dsp_batch ?: 1;
410 	void *fifo;
411 	s32 i, pid;
412 
413 	if (dispatch_highpri(false))
414 		return;
415 
416 	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
417 		return;
418 
419 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
420 		/*
421 		 * PID 2 should be kthreadd which should mostly be idle and off
422 		 * the scheduler. Let's keep dispatching it to force the kernel
423 		 * to call this function over and over again.
424 		 */
425 		p = bpf_task_from_pid(2);
426 		if (p) {
427 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
428 			bpf_task_release(p);
429 			return;
430 		}
431 	}
432 
433 	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
434 		scx_bpf_error("failed to look up cpu_ctx");
435 		return;
436 	}
437 
438 	for (i = 0; i < 5; i++) {
439 		/* Advance the dispatch cursor and pick the fifo. */
440 		if (!cpuc->dsp_cnt) {
441 			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
442 			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
443 		}
444 
445 		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
446 		if (!fifo) {
447 			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
448 			return;
449 		}
450 
451 		/* Dispatch or advance. */
452 		bpf_repeat(BPF_MAX_LOOPS) {
453 			struct task_ctx *tctx;
454 
455 			if (bpf_map_pop_elem(fifo, &pid))
456 				break;
457 
458 			p = bpf_task_from_pid(pid);
459 			if (!p)
460 				continue;
461 
462 			if (!(tctx = lookup_task_ctx(p))) {
463 				bpf_task_release(p);
464 				return;
465 			}
466 
467 			if (tctx->highpri)
468 				__sync_fetch_and_sub(&nr_highpri_queued, 1);
469 
470 			update_core_sched_head_seq(p);
471 			__sync_fetch_and_add(&nr_dispatched, 1);
472 
473 			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
474 
475 			/*
476 			 * scx_qmap uses a global BPF queue that any CPU's
477 			 * dispatch can pop from. If this CPU popped a task that
478 			 * can't run here, it gets stranded on SHARED_DSQ after
479 			 * consume_dispatch_q() skips it. Kick the task's home
480 			 * CPU so it drains SHARED_DSQ.
481 			 *
482 			 * There's a race between the pop and the flush of the
483 			 * buffered dsq_insert:
484 			 *
485 			 *  CPU 0 (dispatching)      CPU 1 (home, idle)
486 			 *  ~~~~~~~~~~~~~~~~~~~      ~~~~~~~~~~~~~~~~~~~
487 			 *  pop from BPF queue
488 			 *  dsq_insert(buffered)
489 			 *                           balance:
490 			 *                             SHARED_DSQ empty
491 			 *                             BPF queue empty
492 			 *                             -> goes idle
493 			 *  flush -> on SHARED
494 			 *  kick CPU 1
495 			 *                           wakes, drains task
496 			 *
497 			 * The kick prevents indefinite stalls but a per-CPU
498 			 * kthread like ksoftirqd can be briefly stranded when
499 			 * its home CPU enters idle with softirq pending,
500 			 * triggering:
501 			 *
502 			 *  "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
503 			 *
504 			 * from report_idle_softirq(). The kick lands shortly
505 			 * after and the home CPU drains the task. This could be
506 			 * avoided by e.g. dispatching pinned tasks to local or
507 			 * global DSQs, but the current code is left as-is to
508 			 * document this class of issue -- other schedulers
509 			 * seeing similar warnings can use this as a reference.
510 			 */
511 			if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
512 				scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
513 
514 			bpf_task_release(p);
515 
516 			batch--;
517 			cpuc->dsp_cnt--;
518 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
519 				if (dispatch_highpri(false))
520 					return;
521 				scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
522 				return;
523 			}
524 			if (!cpuc->dsp_cnt)
525 				break;
526 		}
527 
528 		cpuc->dsp_cnt = 0;
529 	}
530 
531 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
532 		if (sub_sched_cgroup_ids[i] &&
533 		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
534 			return;
535 	}
536 
537 	/*
538 	 * No other tasks. @prev will keep running. Update its core_sched_seq as
539 	 * if the task were enqueued and dispatched immediately.
540 	 */
541 	if (prev) {
542 		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
543 		if (!tctx) {
544 			scx_bpf_error("task_ctx lookup failed");
545 			return;
546 		}
547 
548 		tctx->core_sched_seq =
549 			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
550 	}
551 }
552 
BPF_STRUCT_OPS(qmap_tick,struct task_struct * p)553 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
554 {
555 	struct cpu_ctx *cpuc;
556 	u32 zero = 0;
557 	int idx;
558 
559 	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
560 		scx_bpf_error("failed to look up cpu_ctx");
561 		return;
562 	}
563 
564 	/*
565 	 * Use the running avg of weights to select the target cpuperf level.
566 	 * This is a demonstration of the cpuperf feature rather than a
567 	 * practical strategy to regulate CPU frequency.
568 	 */
569 	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
570 	idx = weight_to_idx(cpuc->avg_weight);
571 	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
572 
573 	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
574 }
575 
576 /*
577  * The distance from the head of the queue scaled by the weight of the queue.
578  * The lower the number, the older the task and the higher the priority.
579  */
task_qdist(struct task_struct * p)580 static s64 task_qdist(struct task_struct *p)
581 {
582 	int idx = weight_to_idx(p->scx.weight);
583 	struct task_ctx *tctx;
584 	s64 qdist;
585 
586 	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
587 	if (!tctx) {
588 		scx_bpf_error("task_ctx lookup failed");
589 		return 0;
590 	}
591 
592 	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
593 
594 	/*
595 	 * As queue index increments, the priority doubles. The queue w/ index 3
596 	 * is dispatched twice more frequently than 2. Reflect the difference by
597 	 * scaling qdists accordingly. Note that the shift amount needs to be
598 	 * flipped depending on the sign to avoid flipping priority direction.
599 	 */
600 	if (qdist >= 0)
601 		return qdist << (4 - idx);
602 	else
603 		return qdist << idx;
604 }
605 
606 /*
607  * This is called to determine the task ordering when core-sched is picking
608  * tasks to execute on SMT siblings and should encode about the same ordering as
609  * the regular scheduling path. Use the priority-scaled distances from the head
610  * of the queues to compare the two tasks which should be consistent with the
611  * dispatch path behavior.
612  */
BPF_STRUCT_OPS(qmap_core_sched_before,struct task_struct * a,struct task_struct * b)613 bool BPF_STRUCT_OPS(qmap_core_sched_before,
614 		    struct task_struct *a, struct task_struct *b)
615 {
616 	return task_qdist(a) > task_qdist(b);
617 }
618 
619 /*
620  * sched_switch tracepoint and cpu_release handlers are no longer needed.
621  * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
622  * tasks when a higher-priority scheduling class takes the CPU.
623  */
624 
BPF_STRUCT_OPS(qmap_init_task,struct task_struct * p,struct scx_init_task_args * args)625 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
626 		   struct scx_init_task_args *args)
627 {
628 	if (p->tgid == disallow_tgid)
629 		p->scx.disallow = true;
630 
631 	/*
632 	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
633 	 * in this function and the following will automatically use GFP_KERNEL.
634 	 */
635 	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
636 				 BPF_LOCAL_STORAGE_GET_F_CREATE))
637 		return 0;
638 	else
639 		return -ENOMEM;
640 }
641 
BPF_STRUCT_OPS(qmap_dump,struct scx_dump_ctx * dctx)642 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
643 {
644 	s32 i, pid;
645 
646 	if (suppress_dump)
647 		return;
648 
649 	bpf_for(i, 0, 5) {
650 		void *fifo;
651 
652 		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
653 			return;
654 
655 		scx_bpf_dump("QMAP FIFO[%d]:", i);
656 
657 		/*
658 		 * Dump can be invoked anytime and there is no way to iterate in
659 		 * a non-destructive way. Pop and store in dump_store and then
660 		 * restore afterwards. If racing against new enqueues, ordering
661 		 * can get mixed up.
662 		 */
663 		bpf_repeat(4096) {
664 			if (bpf_map_pop_elem(fifo, &pid))
665 				break;
666 			bpf_map_push_elem(&dump_store, &pid, 0);
667 			scx_bpf_dump(" %d", pid);
668 		}
669 
670 		bpf_repeat(4096) {
671 			if (bpf_map_pop_elem(&dump_store, &pid))
672 				break;
673 			bpf_map_push_elem(fifo, &pid, 0);
674 		}
675 
676 		scx_bpf_dump("\n");
677 	}
678 }
679 
BPF_STRUCT_OPS(qmap_dump_cpu,struct scx_dump_ctx * dctx,s32 cpu,bool idle)680 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
681 {
682 	u32 zero = 0;
683 	struct cpu_ctx *cpuc;
684 
685 	if (suppress_dump || idle)
686 		return;
687 	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
688 		return;
689 
690 	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
691 		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
692 		     cpuc->cpuperf_target);
693 }
694 
BPF_STRUCT_OPS(qmap_dump_task,struct scx_dump_ctx * dctx,struct task_struct * p)695 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
696 {
697 	struct task_ctx *taskc;
698 
699 	if (suppress_dump)
700 		return;
701 	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
702 		return;
703 
704 	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
705 		     taskc->force_local, taskc->core_sched_seq);
706 }
707 
BPF_STRUCT_OPS(qmap_cgroup_init,struct cgroup * cgrp,struct scx_cgroup_init_args * args)708 s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
709 {
710 	if (print_msgs)
711 		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
712 			   cgrp->kn->id, args->weight, args->bw_period_us,
713 			   args->bw_quota_us, args->bw_burst_us);
714 	return 0;
715 }
716 
BPF_STRUCT_OPS(qmap_cgroup_set_weight,struct cgroup * cgrp,u32 weight)717 void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
718 {
719 	if (print_msgs)
720 		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
721 }
722 
BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth,struct cgroup * cgrp,u64 period_us,u64 quota_us,u64 burst_us)723 void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
724 		    u64 period_us, u64 quota_us, u64 burst_us)
725 {
726 	if (print_msgs)
727 		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
728 			   cgrp->kn->id, period_us, quota_us, burst_us);
729 }
730 
731 /*
732  * Print out the online and possible CPU map using bpf_printk() as a
733  * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
734  */
print_cpus(void)735 static void print_cpus(void)
736 {
737 	const struct cpumask *possible, *online;
738 	s32 cpu;
739 	char buf[128] = "", *p;
740 	int idx;
741 
742 	possible = scx_bpf_get_possible_cpumask();
743 	online = scx_bpf_get_online_cpumask();
744 
745 	idx = 0;
746 	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
747 		if (!(p = MEMBER_VPTR(buf, [idx++])))
748 			break;
749 		if (bpf_cpumask_test_cpu(cpu, online))
750 			*p++ = 'O';
751 		else if (bpf_cpumask_test_cpu(cpu, possible))
752 			*p++ = 'X';
753 		else
754 			*p++ = ' ';
755 
756 		if ((cpu & 7) == 7) {
757 			if (!(p = MEMBER_VPTR(buf, [idx++])))
758 				break;
759 			*p++ = '|';
760 		}
761 	}
762 	buf[sizeof(buf) - 1] = '\0';
763 
764 	scx_bpf_put_cpumask(online);
765 	scx_bpf_put_cpumask(possible);
766 
767 	bpf_printk("CPUS: |%s", buf);
768 }
769 
BPF_STRUCT_OPS(qmap_cpu_online,s32 cpu)770 void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
771 {
772 	if (print_msgs) {
773 		bpf_printk("CPU %d coming online", cpu);
774 		/* @cpu is already online at this point */
775 		print_cpus();
776 	}
777 }
778 
BPF_STRUCT_OPS(qmap_cpu_offline,s32 cpu)779 void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
780 {
781 	if (print_msgs) {
782 		bpf_printk("CPU %d going offline", cpu);
783 		/* @cpu is still online at this point */
784 		print_cpus();
785 	}
786 }
787 
788 struct monitor_timer {
789 	struct bpf_timer timer;
790 };
791 
792 struct {
793 	__uint(type, BPF_MAP_TYPE_ARRAY);
794 	__uint(max_entries, 1);
795 	__type(key, u32);
796 	__type(value, struct monitor_timer);
797 } monitor_timer SEC(".maps");
798 
799 /*
800  * Print out the min, avg and max performance levels of CPUs every second to
801  * demonstrate the cpuperf interface.
802  */
monitor_cpuperf(void)803 static void monitor_cpuperf(void)
804 {
805 	u32 zero = 0, nr_cpu_ids;
806 	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
807 	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
808 	const struct cpumask *online;
809 	int i, nr_online_cpus = 0;
810 
811 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
812 	online = scx_bpf_get_online_cpumask();
813 
814 	bpf_for(i, 0, nr_cpu_ids) {
815 		struct cpu_ctx *cpuc;
816 		u32 cap, cur;
817 
818 		if (!bpf_cpumask_test_cpu(i, online))
819 			continue;
820 		nr_online_cpus++;
821 
822 		/* collect the capacity and current cpuperf */
823 		cap = scx_bpf_cpuperf_cap(i);
824 		cur = scx_bpf_cpuperf_cur(i);
825 
826 		cur_min = cur < cur_min ? cur : cur_min;
827 		cur_max = cur > cur_max ? cur : cur_max;
828 
829 		/*
830 		 * $cur is relative to $cap. Scale it down accordingly so that
831 		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
832 		 * makes sense.
833 		 */
834 		cur_sum += cur * cap / SCX_CPUPERF_ONE;
835 		cap_sum += cap;
836 
837 		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
838 			scx_bpf_error("failed to look up cpu_ctx");
839 			goto out;
840 		}
841 
842 		/* collect target */
843 		cur = cpuc->cpuperf_target;
844 		target_sum += cur;
845 		target_min = cur < target_min ? cur : target_min;
846 		target_max = cur > target_max ? cur : target_max;
847 	}
848 
849 	cpuperf_min = cur_min;
850 	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
851 	cpuperf_max = cur_max;
852 
853 	cpuperf_target_min = target_min;
854 	cpuperf_target_avg = target_sum / nr_online_cpus;
855 	cpuperf_target_max = target_max;
856 out:
857 	scx_bpf_put_cpumask(online);
858 }
859 
860 /*
861  * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
862  * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
863  * see meaningful dumps in the trace pipe.
864  */
dump_shared_dsq(void)865 static void dump_shared_dsq(void)
866 {
867 	struct task_struct *p;
868 	s32 nr;
869 
870 	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
871 		return;
872 
873 	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
874 
875 	bpf_rcu_read_lock();
876 	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
877 		bpf_printk("%s[%d]", p->comm, p->pid);
878 	bpf_rcu_read_unlock();
879 }
880 
monitor_timerfn(void * map,int * key,struct bpf_timer * timer)881 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
882 {
883 	bpf_rcu_read_lock();
884 	dispatch_highpri(true);
885 	bpf_rcu_read_unlock();
886 
887 	monitor_cpuperf();
888 
889 	if (print_dsqs_and_events) {
890 		struct scx_event_stats events;
891 
892 		dump_shared_dsq();
893 
894 		__COMPAT_scx_bpf_events(&events, sizeof(events));
895 
896 		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
897 			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
898 		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
899 			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
900 		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
901 			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
902 		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
903 			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
904 		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
905 			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
906 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
907 			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
908 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
909 			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
910 		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
911 			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
912 	}
913 
914 	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
915 	return 0;
916 }
917 
918 struct lowpri_timer {
919 	struct bpf_timer timer;
920 };
921 
922 struct {
923 	__uint(type, BPF_MAP_TYPE_ARRAY);
924 	__uint(max_entries, 1);
925 	__type(key, u32);
926 	__type(value, struct lowpri_timer);
927 } lowpri_timer SEC(".maps");
928 
929 /*
930  * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and
931  * the tasks are transferred to SHARED_DSQ.
932  */
lowpri_timerfn(void * map,int * key,struct bpf_timer * timer)933 static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
934 {
935 	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
936 	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
937 	return 0;
938 }
939 
BPF_STRUCT_OPS_SLEEPABLE(qmap_init)940 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
941 {
942 	u32 key = 0;
943 	struct bpf_timer *timer;
944 	s32 ret;
945 
946 	if (print_msgs && !sub_cgroup_id)
947 		print_cpus();
948 
949 	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
950 	if (ret) {
951 		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
952 		return ret;
953 	}
954 
955 	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
956 	if (ret) {
957 		scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
958 		return ret;
959 	}
960 
961 	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
962 	if (ret)
963 		return ret;
964 
965 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
966 	if (!timer)
967 		return -ESRCH;
968 	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
969 	bpf_timer_set_callback(timer, monitor_timerfn);
970 	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
971 	if (ret)
972 		return ret;
973 
974 	if (__COMPAT_has_generic_reenq()) {
975 		/* see lowpri_timerfn() */
976 		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
977 		if (!timer)
978 			return -ESRCH;
979 		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
980 		bpf_timer_set_callback(timer, lowpri_timerfn);
981 		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
982 		if (ret)
983 			return ret;
984 	}
985 
986 	return 0;
987 }
988 
BPF_STRUCT_OPS(qmap_exit,struct scx_exit_info * ei)989 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
990 {
991 	UEI_RECORD(uei, ei);
992 }
993 
BPF_STRUCT_OPS(qmap_sub_attach,struct scx_sub_attach_args * args)994 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
995 {
996 	s32 i;
997 
998 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
999 		if (!sub_sched_cgroup_ids[i]) {
1000 			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
1001 			bpf_printk("attaching sub-sched[%d] on %s",
1002 				   i, args->cgroup_path);
1003 			return 0;
1004 		}
1005 	}
1006 
1007 	return -ENOSPC;
1008 }
1009 
BPF_STRUCT_OPS(qmap_sub_detach,struct scx_sub_detach_args * args)1010 void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
1011 {
1012 	s32 i;
1013 
1014 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
1015 		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
1016 			sub_sched_cgroup_ids[i] = 0;
1017 			bpf_printk("detaching sub-sched[%d] on %s",
1018 				   i, args->cgroup_path);
1019 			break;
1020 		}
1021 	}
1022 }
1023 
1024 SCX_OPS_DEFINE(qmap_ops,
1025 	       .select_cpu		= (void *)qmap_select_cpu,
1026 	       .enqueue			= (void *)qmap_enqueue,
1027 	       .dequeue			= (void *)qmap_dequeue,
1028 	       .dispatch		= (void *)qmap_dispatch,
1029 	       .tick			= (void *)qmap_tick,
1030 	       .core_sched_before	= (void *)qmap_core_sched_before,
1031 	       .init_task		= (void *)qmap_init_task,
1032 	       .dump			= (void *)qmap_dump,
1033 	       .dump_cpu		= (void *)qmap_dump_cpu,
1034 	       .dump_task		= (void *)qmap_dump_task,
1035 	       .cgroup_init		= (void *)qmap_cgroup_init,
1036 	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
1037 	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
1038 	       .sub_attach		= (void *)qmap_sub_attach,
1039 	       .sub_detach		= (void *)qmap_sub_detach,
1040 	       .cpu_online		= (void *)qmap_cpu_online,
1041 	       .cpu_offline		= (void *)qmap_cpu_offline,
1042 	       .init			= (void *)qmap_init,
1043 	       .exit			= (void *)qmap_exit,
1044 	       .timeout_ms		= 5000U,
1045 	       .name			= "qmap");
1046