xref: /linux/tools/testing/selftests/sched_ext/dequeue.c (revision 5bdb4078e1efba9650c03753616866192d680718)
1*658ad225SAndrea Righi // SPDX-License-Identifier: GPL-2.0
2*658ad225SAndrea Righi /*
3*658ad225SAndrea Righi  * Copyright (c) 2025 NVIDIA Corporation.
4*658ad225SAndrea Righi  */
5*658ad225SAndrea Righi #define _GNU_SOURCE
6*658ad225SAndrea Righi #include <stdio.h>
7*658ad225SAndrea Righi #include <unistd.h>
8*658ad225SAndrea Righi #include <signal.h>
9*658ad225SAndrea Righi #include <time.h>
10*658ad225SAndrea Righi #include <bpf/bpf.h>
11*658ad225SAndrea Righi #include <scx/common.h>
12*658ad225SAndrea Righi #include <sys/wait.h>
13*658ad225SAndrea Righi #include <sched.h>
14*658ad225SAndrea Righi #include <pthread.h>
15*658ad225SAndrea Righi #include "scx_test.h"
16*658ad225SAndrea Righi #include "dequeue.bpf.skel.h"
17*658ad225SAndrea Righi 
18*658ad225SAndrea Righi #define NUM_WORKERS 8
19*658ad225SAndrea Righi #define AFFINITY_HAMMER_MS 500
20*658ad225SAndrea Righi 
/*
 * Worker body for a forked child: alternates bursts of CPU work with
 * short sleeps so the task is repeatedly enqueued and dequeued by the
 * scheduler, then exits the child process with status 0.
 */
static void worker_fn(int id)
{
	volatile int acc = 0;
	int iter;

	for (iter = 0; iter < 1000; iter++) {
		volatile int spin;

		/* Burn some CPU to trigger scheduling events */
		for (spin = 0; spin < 10000; spin++)
			acc += spin;

		/* Block briefly to trigger a dequeue; stagger workers by id */
		usleep(1000 + (id * 100));
	}

	exit(0);
}
43*658ad225SAndrea Righi 
44*658ad225SAndrea Righi /*
45*658ad225SAndrea Righi  * This thread changes workers' affinity from outside so that some changes
46*658ad225SAndrea Righi  * hit tasks while they are still in the scheduler's queue and trigger
47*658ad225SAndrea Righi  * property-change dequeues.
48*658ad225SAndrea Righi  */
affinity_hammer_fn(void * arg)49*658ad225SAndrea Righi static void *affinity_hammer_fn(void *arg)
50*658ad225SAndrea Righi {
51*658ad225SAndrea Righi 	pid_t *pids = arg;
52*658ad225SAndrea Righi 	cpu_set_t cpuset;
53*658ad225SAndrea Righi 	int i = 0, n = NUM_WORKERS;
54*658ad225SAndrea Righi 	struct timespec start, now;
55*658ad225SAndrea Righi 
56*658ad225SAndrea Righi 	clock_gettime(CLOCK_MONOTONIC, &start);
57*658ad225SAndrea Righi 	while (1) {
58*658ad225SAndrea Righi 		int w = i % n;
59*658ad225SAndrea Righi 		int cpu = (i / n) % 4;
60*658ad225SAndrea Righi 
61*658ad225SAndrea Righi 		CPU_ZERO(&cpuset);
62*658ad225SAndrea Righi 		CPU_SET(cpu, &cpuset);
63*658ad225SAndrea Righi 		sched_setaffinity(pids[w], sizeof(cpuset), &cpuset);
64*658ad225SAndrea Righi 		i++;
65*658ad225SAndrea Righi 
66*658ad225SAndrea Righi 		/* Check elapsed time every 256 iterations to limit gettime cost */
67*658ad225SAndrea Righi 		if ((i & 255) == 0) {
68*658ad225SAndrea Righi 			long long elapsed_ms;
69*658ad225SAndrea Righi 
70*658ad225SAndrea Righi 			clock_gettime(CLOCK_MONOTONIC, &now);
71*658ad225SAndrea Righi 			elapsed_ms = (now.tv_sec - start.tv_sec) * 1000LL +
72*658ad225SAndrea Righi 				     (now.tv_nsec - start.tv_nsec) / 1000000;
73*658ad225SAndrea Righi 			if (elapsed_ms >= AFFINITY_HAMMER_MS)
74*658ad225SAndrea Righi 				break;
75*658ad225SAndrea Righi 		}
76*658ad225SAndrea Righi 	}
77*658ad225SAndrea Righi 	return NULL;
78*658ad225SAndrea Righi }
79*658ad225SAndrea Righi 
/*
 * Attach the scheduler, generate scheduling activity with forked workers
 * plus the affinity hammer, then validate the enqueue/dequeue counters
 * exported by the BPF side for the given scenario.
 *
 * @skel: loaded dequeue BPF skeleton
 * @scenario: scenario ID written to the BPF program's test_scenario knob
 * @scenario_name: human-readable label for failure and summary output
 *
 * Returns SCX_TEST_PASS on success; the SCX_* assertion macros make this
 * function return a failure status on any check that fails.
 */
static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
					 const char *scenario_name)
{
	struct bpf_link *link;
	pid_t pids[NUM_WORKERS];
	pthread_t hammer;

	int i, status;
	/*
	 * The BPF counters are cumulative across scenarios, so sample them
	 * before the run and validate only the per-scenario deltas.
	 */
	u64 enq_start, deq_start,
	    dispatch_deq_start, change_deq_start, bpf_queue_full_start;
	u64 enq_delta, deq_delta,
	    dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta;

	/* Set the test scenario */
	skel->bss->test_scenario = scenario;

	/* Record starting counts */
	enq_start = skel->bss->enqueue_cnt;
	deq_start = skel->bss->dequeue_cnt;
	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
	change_deq_start = skel->bss->change_dequeue_cnt;
	bpf_queue_full_start = skel->bss->bpf_queue_full;

	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);

	/* Fork worker processes to generate enqueue/dequeue events */
	for (i = 0; i < NUM_WORKERS; i++) {
		pids[i] = fork();
		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);

		if (pids[i] == 0) {
			/* Child: run the workload; worker_fn() exits with 0. */
			worker_fn(i);
			/* Should not reach here */
			exit(1);
		}
	}

	/*
	 * Run an "affinity hammer" so that some property changes hit tasks
	 * while they are still in BPF custody (e.g., in user DSQ or BPF
	 * queue), triggering SCX_DEQ_SCHED_CHANGE dequeues.
	 */
	SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0,
		    "Failed to create affinity hammer thread");
	pthread_join(hammer, NULL);

	/* Wait for all workers to complete */
	for (i = 0; i < NUM_WORKERS; i++) {
		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
			    "Failed to wait for worker %d", i);
		/*
		 * Raw wait status: a normal exit(0) yields 0, so any
		 * abnormal exit or signal death shows up as non-zero.
		 */
		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
	}

	bpf_link__destroy(link);

	/* The scheduler must have exited via normal unregistration, not an error. */
	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));

	/* Calculate deltas */
	enq_delta = skel->bss->enqueue_cnt - enq_start;
	deq_delta = skel->bss->dequeue_cnt - deq_start;
	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
	bpf_queue_full_delta = skel->bss->bpf_queue_full - bpf_queue_full_start;

	printf("%s:\n", scenario_name);
	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
	       (unsigned long)deq_delta,
	       (unsigned long)dispatch_deq_delta,
	       (unsigned long)change_deq_delta);
	printf("  BPF queue full: %lu\n", (unsigned long)bpf_queue_full_delta);

	/*
	 * Validate enqueue/dequeue lifecycle tracking.
	 *
	 * For scenarios 0, 1, 3, 4 (local and global DSQs from
	 * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues
	 * should be 0 because tasks bypass the BPF scheduler entirely:
	 * tasks never enter BPF scheduler's custody.
	 *
	 * For scenarios 2, 5, 6 (user DSQ or BPF internal queue) we expect
	 * both enqueues and dequeues.
	 *
	 * The BPF code does strict state machine validation with
	 * scx_bpf_error() to ensure the workflow semantics are correct.
	 *
	 * If we reach this point without errors, the semantics are
	 * validated correctly.
	 */
	if (scenario == 0 || scenario == 1 ||
	    scenario == 3 || scenario == 4) {
		/* Tasks bypass BPF scheduler completely */
		SCX_EQ(enq_delta, 0);
		SCX_EQ(deq_delta, 0);
		SCX_EQ(dispatch_deq_delta, 0);
		SCX_EQ(change_deq_delta, 0);
	} else {
		/*
		 * User DSQ from ops.enqueue() or ops.select_cpu(): tasks
		 * enter BPF scheduler's custody.
		 *
		 * Also validate 1:1 enqueue/dequeue pairing.
		 */
		SCX_GT(enq_delta, 0);
		SCX_GT(deq_delta, 0);
		SCX_EQ(enq_delta, deq_delta);
	}

	return SCX_TEST_PASS;
}
191*658ad225SAndrea Righi 
setup(void ** ctx)192*658ad225SAndrea Righi static enum scx_test_status setup(void **ctx)
193*658ad225SAndrea Righi {
194*658ad225SAndrea Righi 	struct dequeue *skel;
195*658ad225SAndrea Righi 
196*658ad225SAndrea Righi 	skel = dequeue__open();
197*658ad225SAndrea Righi 	SCX_FAIL_IF(!skel, "Failed to open skel");
198*658ad225SAndrea Righi 	SCX_ENUM_INIT(skel);
199*658ad225SAndrea Righi 	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
200*658ad225SAndrea Righi 
201*658ad225SAndrea Righi 	*ctx = skel;
202*658ad225SAndrea Righi 
203*658ad225SAndrea Righi 	return SCX_TEST_PASS;
204*658ad225SAndrea Righi }
205*658ad225SAndrea Righi 
run(void * ctx)206*658ad225SAndrea Righi static enum scx_test_status run(void *ctx)
207*658ad225SAndrea Righi {
208*658ad225SAndrea Righi 	struct dequeue *skel = ctx;
209*658ad225SAndrea Righi 	enum scx_test_status status;
210*658ad225SAndrea Righi 
211*658ad225SAndrea Righi 	status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()");
212*658ad225SAndrea Righi 	if (status != SCX_TEST_PASS)
213*658ad225SAndrea Righi 		return status;
214*658ad225SAndrea Righi 
215*658ad225SAndrea Righi 	status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()");
216*658ad225SAndrea Righi 	if (status != SCX_TEST_PASS)
217*658ad225SAndrea Righi 		return status;
218*658ad225SAndrea Righi 
219*658ad225SAndrea Righi 	status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()");
220*658ad225SAndrea Righi 	if (status != SCX_TEST_PASS)
221*658ad225SAndrea Righi 		return status;
222*658ad225SAndrea Righi 
223*658ad225SAndrea Righi 	status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()");
224*658ad225SAndrea Righi 	if (status != SCX_TEST_PASS)
225*658ad225SAndrea Righi 		return status;
226*658ad225SAndrea Righi 
227*658ad225SAndrea Righi 	status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()");
228*658ad225SAndrea Righi 	if (status != SCX_TEST_PASS)
229*658ad225SAndrea Righi 		return status;
230*658ad225SAndrea Righi 
231*658ad225SAndrea Righi 	status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()");
232*658ad225SAndrea Righi 	if (status != SCX_TEST_PASS)
233*658ad225SAndrea Righi 		return status;
234*658ad225SAndrea Righi 
235*658ad225SAndrea Righi 	status = run_scenario(skel, 6, "Scenario 6: BPF queue from ops.enqueue()");
236*658ad225SAndrea Righi 	if (status != SCX_TEST_PASS)
237*658ad225SAndrea Righi 		return status;
238*658ad225SAndrea Righi 
239*658ad225SAndrea Righi 	printf("\n=== Summary ===\n");
240*658ad225SAndrea Righi 	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
241*658ad225SAndrea Righi 	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
242*658ad225SAndrea Righi 	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
243*658ad225SAndrea Righi 	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
244*658ad225SAndrea Righi 	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
245*658ad225SAndrea Righi 	       (unsigned long)skel->bss->change_dequeue_cnt);
246*658ad225SAndrea Righi 	printf("  BPF queue full: %lu\n",
247*658ad225SAndrea Righi 	       (unsigned long)skel->bss->bpf_queue_full);
248*658ad225SAndrea Righi 	printf("\nAll scenarios passed - no state machine violations detected\n");
249*658ad225SAndrea Righi 	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n");
250*658ad225SAndrea Righi 	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n");
251*658ad225SAndrea Righi 	printf("-> Validated: User DSQ dispatch triggers ops.dequeue() callbacks\n");
252*658ad225SAndrea Righi 	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
253*658ad225SAndrea Righi 	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
254*658ad225SAndrea Righi 	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
255*658ad225SAndrea Righi 
256*658ad225SAndrea Righi 	return SCX_TEST_PASS;
257*658ad225SAndrea Righi }
258*658ad225SAndrea Righi 
/* Free the BPF skeleton allocated by setup(). */
static void cleanup(void *ctx)
{
	dequeue__destroy((struct dequeue *)ctx);
}
265*658ad225SAndrea Righi 
/* Test descriptor wired into the scx selftest runner via REGISTER_SCX_TEST. */
struct scx_test dequeue_test = {
	.name = "dequeue",
	.description = "Verify ops.dequeue() semantics",
	.setup = setup,
	.run = run,
	.cleanup = cleanup,
};

REGISTER_SCX_TEST(&dequeue_test)
275