1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * A scheduler that validates ops.dequeue() is called correctly:
4 * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
5 * scheduler entirely: no ops.dequeue() should be called
6 * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
7 * ops.dequeue() must be called when they leave custody
8 * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
9 * exactly one ops.dequeue() (validate 1:1 pairing and state machine)
10 *
11 * Copyright (c) 2026 NVIDIA Corporation.
12 */
13
14 #include <scx/common.bpf.h>
15
16 #define SHARED_DSQ 0
17
18 /*
19 * BPF internal queue.
20 *
21 * Tasks are stored here and consumed from ops.dispatch(), validating that
22 * tasks on BPF internal structures still get ops.dequeue() when they
23 * leave.
24 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);	/* FIFO: popped in push order */
	__uint(max_entries, 32768);		/* overflow falls back to global DSQ in ops.enqueue() */
	__type(value, s32);			/* task PID */
} global_queue SEC(".maps");
30
31 char _license[] SEC("license") = "GPL";
32
33 UEI_DEFINE(uei);
34
35 /*
36 * Counters to track the lifecycle of tasks:
37 * - enqueue_cnt: Number of times ops.enqueue() was called
38 * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
39 * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
40 * - change_dequeue_cnt: Number of property change dequeues
41 * - bpf_queue_full: Number of times the BPF internal queue was full
42 */
43 u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;
44
45 /*
46 * Test scenarios:
47 * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
48 * scheduler, no dequeue callbacks)
49 * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
50 * scheduler, no dequeue callbacks)
51 * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
52 * dequeue callbacks expected)
53 * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
54 * scheduler, no dequeue callbacks)
55 * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
56 * scheduler, no dequeue callbacks)
57 * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
58 * dequeue callbacks expected)
59 * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
60 * consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
61 * for tasks stored in internal BPF data structures)
62 */
63 u32 test_scenario;
64
65 /*
66 * Per-task state to track lifecycle and validate workflow semantics.
67 * State transitions:
68 * NONE -> ENQUEUED (on enqueue)
69 * NONE -> DISPATCHED (on direct dispatch to terminal DSQ)
70 * ENQUEUED -> DISPATCHED (on dispatch dequeue)
71 * DISPATCHED -> NONE (on property change dequeue or re-enqueue)
72 * ENQUEUED -> NONE (on property change dequeue before dispatch)
73 */
enum task_state {
	TASK_NONE = 0,		/* outside BPF scheduler custody */
	TASK_ENQUEUED,		/* tracked via ops.enqueue()/select_cpu() dispatch to user DSQ or BPF queue */
	TASK_DISPATCHED,	/* moved to a terminal DSQ (local/global) */
};
79
/* Per-task lifecycle tracking, kept in task local storage. */
struct task_ctx {
	enum task_state state;	/* Current state in the workflow */
	u64 enqueue_seq;	/* Sequence number for debugging */
};
84
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* allocate on first use per task */
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
91
try_lookup_task_ctx(struct task_struct * p)92 static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
93 {
94 return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
95 }
96
/*
 * ops.select_cpu(): drives scenarios 0-2 by direct-dispatching from the
 * wakeup path. Always returns @prev_cpu; only the dispatch destination
 * varies with test_scenario. Scenarios 3-6 fall through untouched and
 * are handled later in ops.enqueue().
 */
s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;

	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return prev_cpu;

	switch (test_scenario) {
	case 0:
		/*
		 * Direct dispatch to the local DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		tctx->state = TASK_DISPATCHED;
		break;
	case 1:
		/*
		 * Direct dispatch to the global DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		tctx->state = TASK_DISPATCHED;
		break;
	case 2:
		/*
		 * Dispatch to a shared user DSQ.
		 *
		 * Task enters BPF scheduler management: track
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		/* Double enqueue without an intervening dequeue is a bug */
		if (tctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, tctx->enqueue_seq);

		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);

		__sync_fetch_and_add(&enqueue_cnt, 1);

		tctx->state = TASK_ENQUEUED;
		tctx->enqueue_seq++;
		break;
	}

	return prev_cpu;
}
150
/*
 * ops.enqueue(): drives scenarios 3-6. Terminal-DSQ scenarios (3, 4 and
 * the default) mark the task DISPATCHED and expect no ops.dequeue();
 * scenarios 5 and 6 put the task under BPF custody (user DSQ or internal
 * BPF queue) and expect a matching ops.dequeue() later.
 */
void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_ctx *tctx;
	s32 pid = p->pid;

	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return;

	switch (test_scenario) {
	case 3:
		/*
		 * Direct dispatch to the local DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		tctx->state = TASK_DISPATCHED;
		break;
	case 4:
		/*
		 * Direct dispatch to the global DSQ.
		 *
		 * Task bypasses BPF scheduler entirely: no enqueue
		 * tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		tctx->state = TASK_DISPATCHED;
		break;
	case 5:
		/*
		 * Dispatch to shared user DSQ.
		 *
		 * Task enters BPF scheduler management: track
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		/* Double enqueue without an intervening dequeue is a bug */
		if (tctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, tctx->enqueue_seq);

		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

		__sync_fetch_and_add(&enqueue_cnt, 1);

		tctx->state = TASK_ENQUEUED;
		tctx->enqueue_seq++;
		break;
	case 6:
		/*
		 * Store task in BPF internal queue.
		 *
		 * Task enters BPF scheduler management: track
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		if (tctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, tctx->enqueue_seq);

		if (bpf_map_push_elem(&global_queue, &pid, 0)) {
			/*
			 * Queue full: fall back to the global DSQ so the
			 * task still runs; it then follows the terminal
			 * DSQ path (no dequeue expected).
			 */
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
			__sync_fetch_and_add(&bpf_queue_full, 1);

			tctx->state = TASK_DISPATCHED;
		} else {
			__sync_fetch_and_add(&enqueue_cnt, 1);

			tctx->state = TASK_ENQUEUED;
			tctx->enqueue_seq++;
		}
		break;
	default:
		/* For all other scenarios, dispatch to the global DSQ */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		tctx->state = TASK_DISPATCHED;
		break;
	}

	/* Wake the task's CPU if idle so queued work is consumed promptly */
	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
}
233
/*
 * ops.dequeue(): the callback under test. Validates that it only fires
 * for tasks under BPF custody (scenarios 2, 5, 6), classifies the
 * dequeue (property change vs. regular dispatch) via @deq_flags, and
 * checks the per-task state machine transitions documented above
 * enum task_state.
 */
void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
{
	struct task_ctx *tctx;

	/* Counted before any validation so userspace sees every callback */
	__sync_fetch_and_add(&dequeue_cnt, 1);

	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return;

	/*
	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
	 * ops.dequeue() should never be called because tasks bypass the
	 * BPF scheduler entirely. If we get here, it's a kernel bug.
	 */
	if (test_scenario == 0 || test_scenario == 3) {
		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
			      p->pid, p->comm);
		return;
	}

	if (test_scenario == 1 || test_scenario == 4) {
		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
			      p->pid, p->comm);
		return;
	}

	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
		/*
		 * Property change interrupting the workflow. Valid from
		 * both ENQUEUED and DISPATCHED states. Transitions task
		 * back to NONE state.
		 */
		__sync_fetch_and_add(&change_dequeue_cnt, 1);

		/* Validate state transition */
		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);

		/*
		 * Transition back to NONE: task outside scheduler control.
		 *
		 * Scenario 6: dispatch() checks tctx->state after popping a
		 * PID, if the task is in state NONE, it was dequeued by
		 * property change and must not be dispatched (this
		 * prevents "target CPU not allowed").
		 */
		tctx->state = TASK_NONE;
	} else {
		/*
		 * Regular dispatch dequeue: kernel is moving the task from
		 * BPF custody to a terminal DSQ. Normally we come from
		 * ENQUEUED state. We can also see TASK_NONE if the task
		 * was dequeued by property change (SCX_DEQ_SCHED_CHANGE)
		 * while it was already on a DSQ (dispatched but not yet
		 * consumed); in that case we just leave state as NONE.
		 */
		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);

		/*
		 * Must be ENQUEUED (normal path) or NONE (already dequeued
		 * by property change while on a DSQ).
		 */
		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE)
			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);

		if (tctx->state == TASK_ENQUEUED)
			tctx->state = TASK_DISPATCHED;

		/* NONE: leave as-is, task was already property-change dequeued */
	}
}
308
/*
 * ops.dispatch(): refill @cpu's local DSQ. Scenario 6 pops one PID from
 * the BPF internal queue and dispatches that task; every other scenario
 * simply consumes from the shared user DSQ.
 */
void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct task_ctx *tctx;
	s32 pid;

	if (test_scenario != 6) {
		scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
		return;
	}

	if (bpf_map_pop_elem(&global_queue, &pid))
		return;

	p = bpf_task_from_pid(pid);
	if (!p)
		return;

	/*
	 * A property change may have already pulled the task out of BPF
	 * custody (ops.dequeue() reset its state to TASK_NONE); such a
	 * task must not be dispatched again.
	 */
	tctx = try_lookup_task_ctx(p);
	if (tctx && tctx->state != TASK_NONE) {
		/*
		 * Prefer this CPU's local DSQ when the task's affinity
		 * allows it, otherwise fall back to the global DSQ.
		 */
		if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
		else
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
	}

	bpf_task_release(p);
}
348
BPF_STRUCT_OPS(dequeue_init_task,struct task_struct * p,struct scx_init_task_args * args)349 s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
350 struct scx_init_task_args *args)
351 {
352 struct task_ctx *tctx;
353
354 tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
355 BPF_LOCAL_STORAGE_GET_F_CREATE);
356 if (!tctx)
357 return -ENOMEM;
358
359 return 0;
360 }
361
BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)362 s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
363 {
364 s32 ret;
365
366 ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
367 if (ret)
368 return ret;
369
370 return 0;
371 }
372
/* ops.exit(): record exit info so userspace can report how we exited. */
void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
377
SEC(".struct_ops.link")
struct sched_ext_ops dequeue_ops = {
	.select_cpu		= (void *)dequeue_select_cpu,
	.enqueue		= (void *)dequeue_enqueue,
	.dequeue		= (void *)dequeue_dequeue,
	.dispatch		= (void *)dequeue_dispatch,
	.init_task		= (void *)dequeue_init_task,
	.init			= (void *)dequeue_init,
	.exit			= (void *)dequeue_exit,
	/* Route last-runnable tasks through ops.enqueue() as well */
	.flags			= SCX_OPS_ENQ_LAST,
	.name			= "dequeue_test",
};
390