xref: /linux/tools/sched_ext/include/scx/common.bpf.h (revision 5bdb4078e1efba9650c03753616866192d680718)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
4  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
5  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
6  */
7 #ifndef __SCX_COMMON_BPF_H
8 #define __SCX_COMMON_BPF_H
9 
10 /*
11  * The generated kfunc prototypes in vmlinux.h are missing address space
12  * attributes which cause build failures. For now, suppress the generated
13  * prototypes. See https://github.com/sched-ext/scx/issues/1111.
14  */
15 #define BPF_NO_KFUNC_PROTOTYPES
16 
17 #ifdef LSP
18 #define __bpf__
19 #include "../vmlinux.h"
20 #else
21 #include "vmlinux.h"
22 #endif
23 
24 #include <bpf/bpf_helpers.h>
25 #include <bpf/bpf_tracing.h>
26 #include <asm-generic/errno.h>
27 #include "user_exit_info.bpf.h"
28 #include "enum_defs.autogen.h"
29 
30 #define PF_IDLE				0x00000002	/* I am an IDLE thread */
31 #define PF_IO_WORKER			0x00000010	/* Task is an IO worker */
32 #define PF_WQ_WORKER			0x00000020	/* I'm a workqueue worker */
33 #define PF_KCOMPACTD			0x00010000      /* I am kcompactd */
34 #define PF_KSWAPD			0x00020000      /* I am kswapd */
35 #define PF_KTHREAD			0x00200000	/* I am a kernel thread */
36 #define PF_EXITING			0x00000004
37 #define CLOCK_MONOTONIC			1
38 
39 #ifndef NR_CPUS
40 #define NR_CPUS 1024
41 #endif
42 
43 #ifndef NUMA_NO_NODE
44 #define	NUMA_NO_NODE	(-1)
45 #endif
46 
47 extern int LINUX_KERNEL_VERSION __kconfig;
48 extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak;
49 extern const char CONFIG_LOCALVERSION[64] __kconfig __weak;
50 
/*
 * Earlier versions of clang/pahole lost the upper 32 bits of 64bit enum
 * values, which can lead to really confusing misbehaviors. SCX_DSQ_FLAG_BUILTIN
 * lives in those high bits, so it evaluates to zero when vmlinux.h was
 * generated by a broken toolchain; assert on it to turn the problem into a
 * build failure. This function is never called at runtime.
 */
static inline void ___vmlinux_h_sanity_check___(void)
{
	_Static_assert(SCX_DSQ_FLAG_BUILTIN,
		       "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
}
60 
61 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
62 s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
63 s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
64 			     struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
65 bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
66 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
67 void scx_bpf_dispatch_cancel(void) __ksym;
68 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
69 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
70 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
71 struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
72 int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
73 struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
74 void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
75 void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
76 void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
77 void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
78 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
79 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
80 void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
81 u32 scx_bpf_nr_node_ids(void) __ksym __weak;
82 u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
83 int scx_bpf_cpu_node(s32 cpu) __ksym __weak;
84 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
85 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
86 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
87 const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak;
88 const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
89 const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak;
90 const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
91 void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
92 bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
93 s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
94 s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
95 s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
96 s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
97 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
98 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
99 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
100 struct rq *scx_bpf_locked_rq(void) __ksym;
101 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
102 u64 scx_bpf_now(void) __ksym __weak;
103 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
104 
105 /*
106  * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
107  * within bpf_for_each() loops.
108  */
109 #define BPF_FOR_EACH_ITER	(&___it)
110 
111 #define scx_read_event(e, name)							\
112 	(bpf_core_field_exists((e)->name) ? (e)->name : 0)
113 
/*
 * Never invoked for its effect — the empty body compiles away. It exists so
 * that the scx_bpf_exit/error/dump macros below can route their format string
 * and arguments through a printf-attributed prototype, making the compiler
 * type-check them.
 */
static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
116 
117 #define SCX_STRINGIFY(x) #x
118 #define SCX_TOSTRING(x) SCX_STRINGIFY(x)
119 
120 /*
121  * Helper macro for initializing the fmt and variadic argument inputs to both
122  * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
123  * refer to the initialized list of inputs to the bstr kfunc.
124  */
125 #define scx_bpf_bstr_preamble(fmt, args...)					\
126 	static char ___fmt[] = fmt;						\
127 	/*									\
128 	 * Note that __param[] must have at least one				\
129 	 * element to keep the verifier happy.					\
130 	 */									\
131 	unsigned long long ___param[___bpf_narg(args) ?: 1] = {};		\
132 										\
133 	_Pragma("GCC diagnostic push")						\
134 	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")			\
135 	___bpf_fill(___param, args);						\
136 	_Pragma("GCC diagnostic pop")
137 
138 /*
139  * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
140  * instead of an array of u64. Using this macro will cause the scheduler to
141  * exit cleanly with the specified exit code being passed to user space.
142  */
143 #define scx_bpf_exit(code, fmt, args...)					\
144 ({										\
145 	scx_bpf_bstr_preamble(fmt, args)					\
146 	scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param));		\
147 	___scx_bpf_bstr_format_checker(fmt, ##args);				\
148 })
149 
150 /*
151  * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
152  * instead of an array of u64. Invoking this macro will cause the scheduler to
153  * exit in an erroneous state, with diagnostic information being passed to the
154  * user. It appends the file and line number to aid debugging.
155  */
156 #define scx_bpf_error(fmt, args...)						\
157 ({										\
158 	scx_bpf_bstr_preamble(							\
159 		__FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args)		\
160 	scx_bpf_error_bstr(___fmt, ___param, sizeof(___param));			\
161 	___scx_bpf_bstr_format_checker(						\
162 		__FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args);		\
163 })
164 
165 /*
166  * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments
167  * instead of an array of u64. To be used from ops.dump() and friends.
168  */
169 #define scx_bpf_dump(fmt, args...)						\
170 ({										\
171 	scx_bpf_bstr_preamble(fmt, args)					\
172 	scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param));			\
173 	___scx_bpf_bstr_format_checker(fmt, ##args);				\
174 })
175 
176 /*
177  * scx_bpf_dump_header() is a wrapper around scx_bpf_dump that adds a header
178  * of system information for debugging.
179  */
180 #define scx_bpf_dump_header()							\
181 ({										\
182 	scx_bpf_dump("kernel: %d.%d.%d %s\ncc: %s\n",				\
183 		     LINUX_KERNEL_VERSION >> 16,				\
184 		     LINUX_KERNEL_VERSION >> 8 & 0xFF,				\
185 		     LINUX_KERNEL_VERSION & 0xFF,				\
186 		     CONFIG_LOCALVERSION,					\
187 		     CONFIG_CC_VERSION_TEXT);					\
188 })
189 
190 #define BPF_STRUCT_OPS(name, args...)						\
191 SEC("struct_ops/"#name)								\
192 BPF_PROG(name, ##args)
193 
194 #define BPF_STRUCT_OPS_SLEEPABLE(name, args...)					\
195 SEC("struct_ops.s/"#name)							\
196 BPF_PROG(name, ##args)
197 
198 /**
199  * RESIZABLE_ARRAY - Generates annotations for an array that may be resized
200  * @elfsec: the data section of the BPF program in which to place the array
201  * @arr: the name of the array
202  *
203  * libbpf has an API for setting map value sizes. Since data sections (i.e.
204  * bss, data, rodata) themselves are maps, a data section can be resized. If
205  * a data section has an array as its last element, the BTF info for that
206  * array will be adjusted so that length of the array is extended to meet the
207  * new length of the data section. This macro annotates an array to have an
208  * element count of one with the assumption that this array can be resized
209  * within the userspace program. It also annotates the section specifier so
210  * this array exists in a custom sub data section which can be resized
211  * independently.
212  *
213  * See RESIZE_ARRAY() for the userspace convenience macro for resizing an
214  * array declared with RESIZABLE_ARRAY().
215  */
216 #define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr)
217 
218 /**
219  * MEMBER_VPTR - Obtain the verified pointer to a struct or array member
220  * @base: struct or array to index
221  * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...)
222  *
223  * The verifier often gets confused by the instruction sequence the compiler
224  * generates for indexing struct fields or arrays. This macro forces the
225  * compiler to generate a code sequence which first calculates the byte offset,
226  * checks it against the struct or array size and add that byte offset to
227  * generate the pointer to the member to help the verifier.
228  *
229  * Ideally, we want to abort if the calculated offset is out-of-bounds. However,
230  * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller
231  * must check for %NULL and take appropriate action to appease the verifier. To
232  * avoid confusing the verifier, it's best to check for %NULL and dereference
233  * immediately.
234  *
235  *	vptr = MEMBER_VPTR(my_array, [i][j]);
236  *	if (!vptr)
237  *		return error;
238  *	*vptr = new_value;
239  *
240  * sizeof(@base) should encompass the memory area to be accessed and thus can't
241  * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
242  * `MEMBER_VPTR(ptr, ->member)`.
243  */
244 #ifndef MEMBER_VPTR
245 #define MEMBER_VPTR(base, member) (typeof((base) member) *)			\
246 ({										\
247 	u64 __base = (u64)&(base);						\
248 	u64 __addr = (u64)&((base) member) - __base;				\
249 	_Static_assert(sizeof(base) >= sizeof((base) member),			\
250 		       "@base is smaller than @member, is @base a pointer?");	\
251 	asm volatile (								\
252 		"if %0 <= %[max] goto +2\n"					\
253 		"%0 = 0\n"							\
254 		"goto +1\n"							\
255 		"%0 += %1\n"							\
256 		: "+r"(__addr)							\
257 		: "r"(__base),							\
258 		  [max]"i"(sizeof(base) - sizeof((base) member)));		\
259 	__addr;									\
260 })
261 #endif /* MEMBER_VPTR */
262 
263 /**
264  * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
265  * @arr: array to index into
266  * @i: array index
267  * @n: number of elements in array
268  *
269  * Similar to MEMBER_VPTR() but is intended for use with arrays where the
270  * element count needs to be explicit.
271  * It can be used in cases where a global array is defined with an initial
272  * size but is intended to be be resized before loading the BPF program.
273  * Without this version of the macro, MEMBER_VPTR() will use the compile time
274  * size of the array to compute the max, which will result in rejection by
275  * the verifier.
276  */
277 #ifndef ARRAY_ELEM_PTR
278 #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)				\
279 ({										\
280 	u64 __base = (u64)arr;							\
281 	u64 __addr = (u64)&(arr[i]) - __base;					\
282 	asm volatile (								\
283 		"if %0 <= %[max] goto +2\n"					\
284 		"%0 = 0\n"							\
285 		"goto +1\n"							\
286 		"%0 += %1\n"							\
287 		: "+r"(__addr)							\
288 		: "r"(__base),							\
289 		  [max]"r"(sizeof(arr[0]) * ((n) - 1)));			\
290 	__addr;									\
291 })
292 #endif /* ARRAY_ELEM_PTR */
293 
294 /**
295  * __sink - Hide @expr's value from the compiler and BPF verifier
296  * @expr: The expression whose value should be opacified
297  *
298  * No-op at runtime. The empty inline assembly with a read-write constraint
299  * ("+g") has two effects at compile/verify time:
300  *
301  * 1. Compiler: treats @expr as both read and written, preventing dead-code
302  *    elimination and keeping @expr (and any side effects that produced it)
303  *    alive.
304  *
305  * 2. BPF verifier: forgets the precise value/range of @expr ("makes it
306  *    imprecise"). The verifier normally tracks exact ranges for every register
307  *    and stack slot. While useful, precision means each distinct value creates a
308  *    separate verifier state. Inside loops this leads to state explosion - each
309  *    iteration carries different precise values so states never merge and the
310  *    verifier explores every iteration individually.
311  *
312  * Example - preventing loop state explosion::
313  *
314  *     u32 nr_intersects = 0, nr_covered = 0;
315  *     __sink(nr_intersects);
316  *     __sink(nr_covered);
317  *     bpf_for(i, 0, nr_nodes) {
318  *         if (intersects(cpumask, node_mask[i]))
319  *             nr_intersects++;
320  *         if (covers(cpumask, node_mask[i]))
321  *             nr_covered++;
322  *     }
323  *
324  * Without __sink(), the verifier tracks every possible (nr_intersects,
325  * nr_covered) pair across iterations, causing "BPF program is too large". With
326  * __sink(), the values become unknown scalars so all iterations collapse into
327  * one reusable state.
328  *
329  * Example - keeping a reference alive::
330  *
331  *     struct task_struct *t = bpf_task_acquire(task);
332  *     __sink(t);
333  *
334  * Follows the convention from BPF selftests (bpf_misc.h).
335  */
336 #define __sink(expr) asm volatile ("" : "+g"(expr))
337 
338 /*
339  * BPF declarations and helpers
340  */
341 
342 /* list and rbtree */
343 #define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
344 #define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
345 
346 void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
347 void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
348 
349 #define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
350 #define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)
351 
352 int bpf_list_push_front_impl(struct bpf_list_head *head,
353 				    struct bpf_list_node *node,
354 				    void *meta, __u64 off) __ksym;
355 #define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0)
356 
357 int bpf_list_push_back_impl(struct bpf_list_head *head,
358 				   struct bpf_list_node *node,
359 				   void *meta, __u64 off) __ksym;
360 #define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0)
361 
362 struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
363 struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
364 struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
365 				      struct bpf_rb_node *node) __ksym;
366 int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
367 			bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
368 			void *meta, __u64 off) __ksym;
369 #define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
370 
371 struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
372 
373 void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
374 #define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
375 
376 /* task */
377 struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
378 struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
379 void bpf_task_release(struct task_struct *p) __ksym;
380 
381 /* cgroup */
382 struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
383 struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) __ksym;
384 void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
385 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
386 
387 /* css iteration */
388 struct bpf_iter_css;
389 struct cgroup_subsys_state;
390 extern int bpf_iter_css_new(struct bpf_iter_css *it,
391 			    struct cgroup_subsys_state *start,
392 			    unsigned int flags) __weak __ksym;
393 extern struct cgroup_subsys_state *
394 bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
395 extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
396 
397 /* cpumask */
398 struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
399 struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
400 void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
401 u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
402 u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
403 void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
404 void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
405 bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
406 bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
407 bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
408 void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym;
409 void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym;
410 bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
411 		     const struct cpumask *src2) __ksym;
412 void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1,
413 		    const struct cpumask *src2) __ksym;
414 void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1,
415 		     const struct cpumask *src2) __ksym;
416 bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym;
417 bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym;
418 bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym;
419 bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym;
420 bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym;
421 void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym;
422 u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
423 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
424 				   const struct cpumask *src2) __ksym;
425 u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
426 
427 int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) __ksym;
428 int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym;
429 void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym;
430 
431 #define def_iter_struct(name)							\
432 struct bpf_iter_##name {							\
433     struct bpf_iter_bits it;							\
434     const struct cpumask *bitmap;						\
435 };
436 
437 #define def_iter_new(name)							\
438 static inline int bpf_iter_##name##_new(					\
439 	struct bpf_iter_##name *it, const u64 *unsafe_ptr__ign, u32 nr_words)	\
440 {										\
441 	it->bitmap = scx_bpf_get_##name##_cpumask();				\
442 	return bpf_iter_bits_new(&it->it, (const u64 *)it->bitmap,		\
443 				 sizeof(struct cpumask) / 8);			\
444 }
445 
446 #define def_iter_next(name)							\
447 static inline int *bpf_iter_##name##_next(struct bpf_iter_##name *it) {		\
448 	return bpf_iter_bits_next(&it->it);					\
449 }
450 
451 #define def_iter_destroy(name)							\
452 static inline void bpf_iter_##name##_destroy(struct bpf_iter_##name *it) {	\
453 	scx_bpf_put_cpumask(it->bitmap);					\
454 	bpf_iter_bits_destroy(&it->it);						\
455 }
456 #define def_for_each_cpu(cpu, name) for_each_##name##_cpu(cpu)
457 
458 /// Provides iterator for possible and online cpus.
459 ///
460 /// # Example
461 ///
462 /// ```
463 /// static inline void example_use() {
464 ///     int *cpu;
465 ///
466 ///     for_each_possible_cpu(cpu){
467 ///         bpf_printk("CPU %d is possible", *cpu);
468 ///     }
469 ///
470 ///     for_each_online_cpu(cpu){
471 ///         bpf_printk("CPU %d is online", *cpu);
472 ///     }
473 /// }
474 /// ```
475 def_iter_struct(possible);
476 def_iter_new(possible);
477 def_iter_next(possible);
478 def_iter_destroy(possible);
479 #define for_each_possible_cpu(cpu) bpf_for_each(possible, cpu, NULL, 0)
480 
481 def_iter_struct(online);
482 def_iter_new(online);
483 def_iter_next(online);
484 def_iter_destroy(online);
485 #define for_each_online_cpu(cpu) bpf_for_each(online, cpu, NULL, 0)
486 
/*
 * Access a cpumask in read-only mode (typically to check bits).
 *
 * Converts a struct bpf_cpumask pointer into the const struct cpumask form
 * accepted by the read-only cpumask kfuncs (bpf_cpumask_test_cpu() etc.).
 * Purely a pointer cast: no reference is acquired or released.
 */
static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
{
	return (const struct cpumask *)mask;
}
494 
495 /*
496  * Return true if task @p cannot migrate to a different CPU, false
497  * otherwise.
498  */
is_migration_disabled(const struct task_struct * p)499 static inline bool is_migration_disabled(const struct task_struct *p)
500 {
501 	/*
502 	 * Testing p->migration_disabled in a BPF code is tricky because the
503 	 * migration is _always_ disabled while running the BPF code.
504 	 * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF
505 	 * code execution disable and re-enable the migration of the current
506 	 * task, respectively. So, the _current_ task of the sched_ext ops is
507 	 * always migration-disabled. Moreover, p->migration_disabled could be
508 	 * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is
509 	 * executed in the middle of the other BPF code execution.
510 	 *
511 	 * Therefore, we should decide that the _current_ task is
512 	 * migration-disabled only when its migration_disabled count is greater
513 	 * than one. In other words, when  p->migration_disabled == 1, there is
514 	 * an ambiguity, so we should check if @p is the current task or not.
515 	 */
516 	if (bpf_core_field_exists(p->migration_disabled)) {
517 		if (p->migration_disabled == 1)
518 			return bpf_get_current_task_btf() != p;
519 		else
520 			return p->migration_disabled;
521 	}
522 	return false;
523 }
524 
525 /* rcu */
526 void bpf_rcu_read_lock(void) __ksym;
527 void bpf_rcu_read_unlock(void) __ksym;
528 
529 /*
530  * Time helpers, most of which are from jiffies.h.
531  */
532 
533 /**
534  * time_delta - Calculate the delta between new and old time stamp
535  * @after: first comparable as u64
536  * @before: second comparable as u64
537  *
538  * Return: the time difference, which is >= 0
539  */
time_delta(u64 after,u64 before)540 static inline s64 time_delta(u64 after, u64 before)
541 {
542 	return (s64)(after - before) > 0 ? (s64)(after - before) : 0;
543 }
544 
545 /**
546  * time_after - returns true if the time a is after time b.
547  * @a: first comparable as u64
548  * @b: second comparable as u64
549  *
550  * Do this with "<0" and ">=0" to only test the sign of the result. A
551  * good compiler would generate better code (and a really good compiler
552  * wouldn't care). Gcc is currently neither.
553  *
554  * Return: %true is time a is after time b, otherwise %false.
555  */
time_after(u64 a,u64 b)556 static inline bool time_after(u64 a, u64 b)
557 {
558 	return (s64)(b - a) < 0;
559 }
560 
561 /**
562  * time_before - returns true if the time a is before time b.
563  * @a: first comparable as u64
564  * @b: second comparable as u64
565  *
566  * Return: %true is time a is before time b, otherwise %false.
567  */
time_before(u64 a,u64 b)568 static inline bool time_before(u64 a, u64 b)
569 {
570 	return time_after(b, a);
571 }
572 
573 /**
574  * time_after_eq - returns true if the time a is after or the same as time b.
575  * @a: first comparable as u64
576  * @b: second comparable as u64
577  *
578  * Return: %true is time a is after or the same as time b, otherwise %false.
579  */
time_after_eq(u64 a,u64 b)580 static inline bool time_after_eq(u64 a, u64 b)
581 {
582 	return (s64)(a - b) >= 0;
583 }
584 
585 /**
586  * time_before_eq - returns true if the time a is before or the same as time b.
587  * @a: first comparable as u64
588  * @b: second comparable as u64
589  *
590  * Return: %true is time a is before or the same as time b, otherwise %false.
591  */
time_before_eq(u64 a,u64 b)592 static inline bool time_before_eq(u64 a, u64 b)
593 {
594 	return time_after_eq(b, a);
595 }
596 
597 /**
598  * time_in_range - Calculate whether a is in the range of [b, c].
599  * @a: time to test
600  * @b: beginning of the range
601  * @c: end of the range
602  *
603  * Return: %true is time a is in the range [b, c], otherwise %false.
604  */
time_in_range(u64 a,u64 b,u64 c)605 static inline bool time_in_range(u64 a, u64 b, u64 c)
606 {
607 	return time_after_eq(a, b) && time_before_eq(a, c);
608 }
609 
610 /**
611  * time_in_range_open - Calculate whether a is in the range of [b, c).
612  * @a: time to test
613  * @b: beginning of the range
614  * @c: end of the range
615  *
616  * Return: %true is time a is in the range [b, c), otherwise %false.
617  */
time_in_range_open(u64 a,u64 b,u64 c)618 static inline bool time_in_range_open(u64 a, u64 b, u64 c)
619 {
620 	return time_after_eq(a, b) && time_before(a, c);
621 }
622 
623 
624 /*
625  * Other helpers
626  */
627 
628 /* useful compiler attributes */
629 #ifndef likely
630 #define likely(x) __builtin_expect(!!(x), 1)
631 #endif
632 #ifndef unlikely
633 #define unlikely(x) __builtin_expect(!!(x), 0)
634 #endif
635 #ifndef __maybe_unused
636 #define __maybe_unused __attribute__((__unused__))
637 #endif
638 
639 /*
640  * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
641  * prevent compiler from caching, redoing or reordering reads or writes.
642  */
643 typedef __u8  __attribute__((__may_alias__))  __u8_alias_t;
644 typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
645 typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
646 typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
647 
/*
 * Copy @size bytes from @p into @res. For machine word sizes (1/2/4/8) this
 * is a single volatile load through a may_alias-typed pointer, so the
 * compiler can neither cache, split, nor reorder the access; any other size
 * falls back to a memcpy bracketed by compiler barriers.
 */
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(__u8_alias_t  *) res = *(volatile __u8_alias_t  *) p; break;
	case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
	case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
	case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
	default:
		/* Odd sizes: barriers pin the copy in place instead. */
		barrier();
		__builtin_memcpy((void *)res, (const void *)p, size);
		barrier();
	}
}
661 
/*
 * Copy @size bytes from @res into @p. For machine word sizes (1/2/4/8) this
 * is a single volatile store through a may_alias-typed pointer, so the
 * compiler can neither elide, split, nor reorder the access; any other size
 * falls back to a memcpy bracketed by compiler barriers.
 */
static __always_inline void __write_once_size(volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(volatile  __u8_alias_t *) p = *(__u8_alias_t  *) res; break;
	case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
	case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
	case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
	default:
		/* Odd sizes: barriers pin the copy in place instead. */
		barrier();
		__builtin_memcpy((void *)p, (const void *)res, size);
		barrier();
	}
}
675 
676 /*
677  * __unqual_typeof(x) - Declare an unqualified scalar type, leaving
678  *			non-scalar types unchanged,
679  *
680  * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char'
681  * is not type-compatible with 'signed char', and we define a separate case.
682  *
683  * This is copied verbatim from kernel's include/linux/compiler_types.h, but
684  * with default expression (for pointers) changed from (x) to (typeof(x)0).
685  *
686  * This is because LLVM has a bug where for lvalue (x), it does not get rid of
687  * an extra address_space qualifier, but does in case of rvalue (typeof(x)0).
688  * Hence, for pointers, we need to create an rvalue expression to get the
689  * desired type. See https://github.com/llvm/llvm-project/issues/53400.
690  */
691 #define __scalar_type_to_expr_cases(type) \
692 	unsigned type : (unsigned type)0, signed type : (signed type)0
693 
694 #define __unqual_typeof(x)                              \
695 	typeof(_Generic((x),                            \
696 		char: (char)0,                          \
697 		__scalar_type_to_expr_cases(char),      \
698 		__scalar_type_to_expr_cases(short),     \
699 		__scalar_type_to_expr_cases(int),       \
700 		__scalar_type_to_expr_cases(long),      \
701 		__scalar_type_to_expr_cases(long long), \
702 		default: (typeof(x))0))
703 
704 #define READ_ONCE(x)								\
705 ({										\
706 	union { __unqual_typeof(x) __val; char __c[1]; } __u =			\
707 		{ .__c = { 0 } };						\
708 	__read_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x));	\
709 	__u.__val;								\
710 })
711 
712 #define WRITE_ONCE(x, val)							\
713 ({										\
714 	union { __unqual_typeof(x) __val; char __c[1]; } __u =			\
715 		{ .__val = (val) }; 						\
716 	__write_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x));	\
717 	__u.__val;								\
718 })
719 
720 /*
721  * __calc_avg - Calculate exponential weighted moving average (EWMA) with
722  * @old and @new values. @decay represents how large the @old value remains.
723  * With a larger @decay value, the moving average changes slowly, exhibiting
724  * fewer fluctuations.
725  */
726 #define __calc_avg(old, new, decay) ({						\
727 	typeof(decay) thr = 1 << (decay);					\
728 	typeof(old) ret;							\
729 	if (((old) < thr) || ((new) < thr)) {					\
730 		if (((old) == 1) && ((new) == 0))				\
731 			ret = 0;						\
732 		else								\
733 			ret = ((old) - ((old) >> 1)) + ((new) >> 1);		\
734 	} else {								\
735 		ret = ((old) - ((old) >> (decay))) + ((new) >> (decay));	\
736 	}									\
737 	ret;									\
738 })
739 
740 /*
741  * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
742  * @v: The value for which we're computing the base 2 logarithm.
743  */
log2_u32(u32 v)744 static inline u32 log2_u32(u32 v)
745 {
746         u32 r;
747         u32 shift;
748 
749         r = (v > 0xFFFF) << 4; v >>= r;
750         shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
751         shift = (v > 0xF) << 2; v >>= shift; r |= shift;
752         shift = (v > 0x3) << 1; v >>= shift; r |= shift;
753         r |= (v >> 1);
754         return r;
755 }
756 
757 /*
758  * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value.
759  * @v: The value for which we're computing the base 2 logarithm.
760  */
log2_u64(u64 v)761 static inline u32 log2_u64(u64 v)
762 {
763         u32 hi = v >> 32;
764         if (hi)
765                 return log2_u32(hi) + 32 + 1;
766         else
767                 return log2_u32(v) + 1;
768 }
769 
770 /*
771  * sqrt_u64 - Calculate the square root of value @x using Newton's method.
772  */
static inline u64 __sqrt_u64(u64 x)
{
	u64 guess, quot;
	int iter;

	/* 0 and 1 are their own square roots. */
	if (x < 2)
		return x;

	/* Seed with min(x, 2^32): the root of any u64 fits in 32 bits. */
	guess = (x < (1ULL << 32)) ? x : (1ULL << 32);

	/*
	 * Babylonian iteration: average the guess with x / guess until it
	 * stops shrinking, capped at 8 rounds.
	 */
	for (iter = 0; iter < 8; iter++) {
		quot = x / guess;
		if (guess <= quot)
			break;
		guess = (guess + quot) >> 1;
	}

	return guess;
}
788 
789 /*
790  * ctzll -- Counts trailing zeros in an unsigned long long. If the input value
791  * is zero, the return value is undefined.
792  */
ctzll(u64 v)793 static inline int ctzll(u64 v)
794 {
795 #if (!defined(__BPF__) && defined(__SCX_TARGET_ARCH_x86)) || \
796 	(defined(__BPF__) && defined(__clang_major__) && __clang_major__ >= 19)
797 	/*
798 	 * Use the ctz builtin when: (1) building for native x86, or
799 	 * (2) building for BPF with clang >= 19 (BPF backend supports
800 	 * the intrinsic from clang 19 onward; earlier versions hit
801 	 * "unimplemented opcode" in the backend).
802 	 */
803 	return __builtin_ctzll(v);
804 #else
805 	/*
806 	 * If neither the target architecture nor the toolchains support ctzll,
807 	 * use software-based emulation. Let's use the De Bruijn sequence-based
808 	 * approach to find LSB fastly. See the details of De Bruijn sequence:
809 	 *
810 	 * https://en.wikipedia.org/wiki/De_Bruijn_sequence
811 	 * https://www.chessprogramming.org/BitScan#De_Bruijn_Multiplication
812 	 */
813 	const int lookup_table[64] = {
814 		 0,  1, 48,  2, 57, 49, 28,  3, 61, 58, 50, 42, 38, 29, 17,  4,
815 		62, 55, 59, 36, 53, 51, 43, 22, 45, 39, 33, 30, 24, 18, 12,  5,
816 		63, 47, 56, 27, 60, 41, 37, 16, 54, 35, 52, 21, 44, 32, 23, 11,
817 		46, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19,  9, 13,  8,  7,  6,
818 	};
819 	const u64 DEBRUIJN_CONSTANT = 0x03f79d71b4cb0a89ULL;
820 	unsigned int index;
821 	u64 lowest_bit;
822 	const int *lt;
823 
824 	if (v == 0)
825 		return -1;
826 
827 	/*
828 	 * Isolate the least significant bit (LSB).
829 	 * For example, if v = 0b...10100, then v & -v = 0b...00100
830 	 */
831 	lowest_bit = v & -v;
832 
833 	/*
834 	 * Each isolated bit produces a unique 6-bit value, guaranteed by the
835 	 * De Bruijn property. Calculate a unique index into the lookup table
836 	 * using the magic constant and a right shift.
837 	 *
838 	 * Multiplying by the 64-bit constant "spreads out" that 1-bit into a
839 	 * unique pattern in the top 6 bits. This uniqueness property is
840 	 * exactly what a De Bruijn sequence guarantees: Every possible 6-bit
841 	 * pattern (in top bits) occurs exactly once for each LSB position. So,
842 	 * the constant 0x03f79d71b4cb0a89ULL is carefully chosen to be a
843 	 * De Bruijn sequence, ensuring no collisions in the table index.
844 	 */
845 	index = (lowest_bit * DEBRUIJN_CONSTANT) >> 58;
846 
847 	/*
848 	 * Lookup in a precomputed table. No collision is guaranteed by the
849 	 * De Bruijn property.
850 	 */
851 	lt = MEMBER_VPTR(lookup_table, [index]);
852 	return (lt)? *lt : -1;
853 #endif
854 }
855 
856 /*
857  * Return a value proportionally scaled to the task's weight.
858  */
scale_by_task_weight(const struct task_struct * p,u64 value)859 static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value)
860 {
861 	return (value * p->scx.weight) / 100;
862 }
863 
864 /*
865  * Return a value inversely proportional to the task's weight.
866  */
scale_by_task_weight_inverse(const struct task_struct * p,u64 value)867 static inline u64 scale_by_task_weight_inverse(const struct task_struct *p, u64 value)
868 {
869 	return value * 100 / p->scx.weight;
870 }
871 
872 
873 /*
874  * Get a random u64 from the kernel's pseudo-random generator.
875  */
get_prandom_u64()876 static inline u64 get_prandom_u64()
877 {
878 	return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32();
879 }
880 
/*
 * Define the shadow structure to avoid a compilation error when
 * vmlinux.h does not enable necessary kernel configs. The ___local
 * suffix is a CO-RE convention that tells the loader to match this
 * against the base struct rq in the kernel. The attribute
 * preserve_access_index tells the compiler to generate a CO-RE
 * relocation for these fields.
 *
 * Only the fields read by the scx_clock_*() helpers below are declared
 * here; the kernel's struct rq has many more. Because matching is by
 * field name via CO-RE, omitting the rest is safe.
 */
struct rq___local {
	/*
	 * A monotonically increasing clock per CPU. It is rq->clock minus
	 * cumulative IRQ time and hypervisor steal time. Unlike rq->clock,
	 * it does not advance during IRQ processing or hypervisor preemption.
	 * It does advance during idle (the idle task counts as a running task
	 * for this purpose).
	 */
	u64		clock_task;
	/*
	 * Invariant version of clock_task scaled by CPU capacity and
	 * frequency. For example, clock_pelt advances 2x slower on a CPU
	 * with half the capacity.
	 *
	 * At idle exit, rq->clock_pelt jumps forward to resync with
	 * clock_task. The kernel's rq_clock_pelt() corrects for this jump
	 * by subtracting lost_idle_time, yielding a clock that appears
	 * continuous across idle transitions. scx_clock_pelt() mirrors
	 * rq_clock_pelt() by performing the same subtraction.
	 */
	u64		clock_pelt;
	/*
	 * Accumulates the magnitude of each clock_pelt jump at idle exit.
	 * Subtracting this from clock_pelt gives rq_clock_pelt(): a
	 * continuous, capacity-invariant clock suitable for both task
	 * execution time stamping and cross-idle measurements.
	 *
	 * NOTE(review): declared unsigned long — presumably to mirror the
	 * kernel field's type exactly, meaning it is 32 bits on 32-bit
	 * kernels; confirm against the kernel's struct rq.
	 */
	unsigned long	lost_idle_time;
	/*
	 * Shadow of paravirt_steal_clock() (the hypervisor's cumulative
	 * stolen time counter). Stays frozen while the hypervisor preempts
	 * the vCPU; catches up the next time update_rq_clock_task() is
	 * called. The delta is the stolen time not yet subtracted from
	 * clock_task.
	 *
	 * Unlike irqtime->total (a plain kernel-side field), the live stolen
	 * time counter lives in hypervisor-specific shared memory and has no
	 * kernel-side equivalent readable from BPF in a hypervisor-agnostic
	 * way. This field is therefore the only portable BPF-accessible
	 * approximation of cumulative steal time.
	 *
	 * Available only when CONFIG_PARAVIRT_TIME_ACCOUNTING is on.
	 */
	u64		prev_steal_time_rq;
} __attribute__((preserve_access_index));
934 
935 extern struct rq runqueues __ksym;
936 
/*
 * Define the shadow structure to avoid a compilation error when
 * vmlinux.h does not enable necessary kernel configs.
 *
 * CO-RE shadow of the kernel's per-CPU struct irqtime; only the single
 * field read by scx_clock_irq() below is declared.
 */
struct irqtime___local {
	/*
	 * Cumulative IRQ time counter for this CPU, in nanoseconds. Advances
	 * immediately at the exit of every hardirq and non-ksoftirqd softirq
	 * via irqtime_account_irq(). ksoftirqd time is counted as normal
	 * task time and is NOT included. NMI time is also NOT included.
	 *
	 * The companion field irqtime->sync (struct u64_stats_sync) protects
	 * against 64-bit tearing on 32-bit architectures. On 64-bit kernels,
	 * u64_stats_sync is an empty struct and all seqcount operations are
	 * no-ops, so a plain BPF_CORE_READ of this field is safe.
	 *
	 * Available only when CONFIG_IRQ_TIME_ACCOUNTING is on.
	 */
	u64		total;
} __attribute__((preserve_access_index));
957 
958 /*
959  * cpu_irqtime is a per-CPU variable defined only when
960  * CONFIG_IRQ_TIME_ACCOUNTING is on. Declare it as __weak so the BPF
961  * loader sets its address to 0 (rather than failing) when the symbol
962  * is absent from the running kernel.
963  */
964 extern struct irqtime___local cpu_irqtime __ksym __weak;
965 
static inline struct rq___local *get_current_rq(u32 cpu)
{
	/*
	 * Workaround for obtaining an rq pointer now that scx_bpf_cpu_rq()
	 * has been deprecated.
	 *
	 * WARNING: Callers must hold @cpu's rq lock, which is the case when
	 * invoked from scheduling callbacks (ops.running, ops.stopping,
	 * ops.enqueue, ops.dequeue, ops.dispatch, etc.). BPF provides no
	 * runtime check for kernel spinlock state, so this contract is
	 * enforced purely by calling context.
	 */
	struct rq___local *rq = (void *)bpf_per_cpu_ptr(&runqueues, cpu);

	return rq;
}
980 
scx_clock_task(u32 cpu)981 static inline u64 scx_clock_task(u32 cpu)
982 {
983 	struct rq___local *rq = get_current_rq(cpu);
984 
985 	/* Equivalent to the kernel's rq_clock_task(). */
986 	return rq ? rq->clock_task : 0;
987 }
988 
scx_clock_pelt(u32 cpu)989 static inline u64 scx_clock_pelt(u32 cpu)
990 {
991 	struct rq___local *rq = get_current_rq(cpu);
992 
993 	/*
994 	 * Equivalent to the kernel's rq_clock_pelt(): subtracts
995 	 * lost_idle_time from clock_pelt to absorb the jump that occurs
996 	 * when clock_pelt resyncs with clock_task at idle exit. The result
997 	 * is a continuous, capacity-invariant clock safe for both task
998 	 * execution time stamping and cross-idle measurements.
999 	 */
1000 	return rq ? (rq->clock_pelt - rq->lost_idle_time) : 0;
1001 }
1002 
scx_clock_virt(u32 cpu)1003 static inline u64 scx_clock_virt(u32 cpu)
1004 {
1005 	struct rq___local *rq;
1006 
1007 	/*
1008 	 * Check field existence before calling get_current_rq() so we avoid
1009 	 * the per_cpu lookup entirely on kernels built without
1010 	 * CONFIG_PARAVIRT_TIME_ACCOUNTING.
1011 	 */
1012 	if (!bpf_core_field_exists(((struct rq___local *)0)->prev_steal_time_rq))
1013 		return 0;
1014 
1015 	/* Lagging shadow of the kernel's paravirt_steal_clock(). */
1016 	rq = get_current_rq(cpu);
1017 	return rq ? BPF_CORE_READ(rq, prev_steal_time_rq) : 0;
1018 }
1019 
scx_clock_irq(u32 cpu)1020 static inline u64 scx_clock_irq(u32 cpu)
1021 {
1022 	struct irqtime___local *irqt;
1023 
1024 	/*
1025 	 * bpf_core_type_exists() resolves at load time: if struct irqtime is
1026 	 * absent from kernel BTF (CONFIG_IRQ_TIME_ACCOUNTING off), the loader
1027 	 * patches this into an unconditional return 0, making the
1028 	 * bpf_per_cpu_ptr() call below dead code that the verifier never sees.
1029 	 */
1030 	if (!bpf_core_type_exists(struct irqtime___local))
1031 		return 0;
1032 
1033 	/* Equivalent to the kernel's irq_time_read(). */
1034 	irqt = bpf_per_cpu_ptr(&cpu_irqtime, cpu);
1035 	return irqt ? BPF_CORE_READ(irqt, total) : 0;
1036 }
1037 
1038 #include "compat.bpf.h"
1039 #include "enums.bpf.h"
1040 
1041 #endif	/* __SCX_COMMON_BPF_H */
1042