1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
4 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
5 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
6 */
7 #ifndef __SCX_COMMON_BPF_H
8 #define __SCX_COMMON_BPF_H
9
10 /*
11 * The generated kfunc prototypes in vmlinux.h are missing address space
12 * attributes which cause build failures. For now, suppress the generated
13 * prototypes. See https://github.com/sched-ext/scx/issues/1111.
14 */
15 #define BPF_NO_KFUNC_PROTOTYPES
16
17 #ifdef LSP
18 #define __bpf__
19 #include "../vmlinux.h"
20 #else
21 #include "vmlinux.h"
22 #endif
23
24 #include <bpf/bpf_helpers.h>
25 #include <bpf/bpf_tracing.h>
26 #include <asm-generic/errno.h>
27 #include "user_exit_info.bpf.h"
28 #include "enum_defs.autogen.h"
29
30 #define PF_IDLE 0x00000002 /* I am an IDLE thread */
31 #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
32 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
33 #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */
34 #define PF_KSWAPD 0x00020000 /* I am kswapd */
35 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
36 #define PF_EXITING 0x00000004
37 #define CLOCK_MONOTONIC 1
38
39 #ifndef NR_CPUS
40 #define NR_CPUS 1024
41 #endif
42
43 #ifndef NUMA_NO_NODE
44 #define NUMA_NO_NODE (-1)
45 #endif
46
47 extern int LINUX_KERNEL_VERSION __kconfig;
48 extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak;
49 extern const char CONFIG_LOCALVERSION[64] __kconfig __weak;
50
51 /*
52 * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can
53 * lead to really confusing misbehaviors. Let's trigger a build failure.
54 */
___vmlinux_h_sanity_check___(void)55 static inline void ___vmlinux_h_sanity_check___(void)
56 {
57 _Static_assert(SCX_DSQ_FLAG_BUILTIN,
58 "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
59 }
60
61 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
62 s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
63 s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
64 struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
65 bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
66 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
67 void scx_bpf_dispatch_cancel(void) __ksym;
68 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
69 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
70 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
71 struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
72 int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
73 struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
74 void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
75 void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
76 void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
77 void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
78 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
79 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
80 void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
81 u32 scx_bpf_nr_node_ids(void) __ksym __weak;
82 u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
83 int scx_bpf_cpu_node(s32 cpu) __ksym __weak;
84 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
85 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
86 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
87 const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak;
88 const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
89 const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak;
90 const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
91 void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
92 bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
93 s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
94 s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
95 s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
96 s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
97 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
98 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
99 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
100 struct rq *scx_bpf_locked_rq(void) __ksym;
101 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
102 u64 scx_bpf_now(void) __ksym __weak;
103 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
104
105 /*
106 * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
107 * within bpf_for_each() loops.
108 */
109 #define BPF_FOR_EACH_ITER (&___it)
110
111 #define scx_read_event(e, name) \
112 (bpf_core_field_exists((e)->name) ? (e)->name : 0)
113
/*
 * Never called at runtime; its only purpose is to make the compiler apply
 * printf-style format checking to the arguments of the bstr macros below.
 */
static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_bstr_format_checker(const char *fmt, ...)
{
}
116
117 #define SCX_STRINGIFY(x) #x
118 #define SCX_TOSTRING(x) SCX_STRINGIFY(x)
119
120 /*
121 * Helper macro for initializing the fmt and variadic argument inputs to both
122 * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
123 * refer to the initialized list of inputs to the bstr kfunc.
124 */
125 #define scx_bpf_bstr_preamble(fmt, args...) \
126 static char ___fmt[] = fmt; \
127 /* \
128 * Note that __param[] must have at least one \
129 * element to keep the verifier happy. \
130 */ \
131 unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \
132 \
133 _Pragma("GCC diagnostic push") \
134 _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
135 ___bpf_fill(___param, args); \
136 _Pragma("GCC diagnostic pop")
137
138 /*
139 * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
140 * instead of an array of u64. Using this macro will cause the scheduler to
141 * exit cleanly with the specified exit code being passed to user space.
142 */
143 #define scx_bpf_exit(code, fmt, args...) \
144 ({ \
145 scx_bpf_bstr_preamble(fmt, args) \
146 scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \
147 ___scx_bpf_bstr_format_checker(fmt, ##args); \
148 })
149
150 /*
151 * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
152 * instead of an array of u64. Invoking this macro will cause the scheduler to
153 * exit in an erroneous state, with diagnostic information being passed to the
154 * user. It appends the file and line number to aid debugging.
155 */
156 #define scx_bpf_error(fmt, args...) \
157 ({ \
158 scx_bpf_bstr_preamble( \
159 __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \
160 scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \
161 ___scx_bpf_bstr_format_checker( \
162 __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \
163 })
164
165 /*
166 * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments
167 * instead of an array of u64. To be used from ops.dump() and friends.
168 */
169 #define scx_bpf_dump(fmt, args...) \
170 ({ \
171 scx_bpf_bstr_preamble(fmt, args) \
172 scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \
173 ___scx_bpf_bstr_format_checker(fmt, ##args); \
174 })
175
176 /*
177 * scx_bpf_dump_header() is a wrapper around scx_bpf_dump that adds a header
178 * of system information for debugging.
179 */
180 #define scx_bpf_dump_header() \
181 ({ \
182 scx_bpf_dump("kernel: %d.%d.%d %s\ncc: %s\n", \
183 LINUX_KERNEL_VERSION >> 16, \
184 LINUX_KERNEL_VERSION >> 8 & 0xFF, \
185 LINUX_KERNEL_VERSION & 0xFF, \
186 CONFIG_LOCALVERSION, \
187 CONFIG_CC_VERSION_TEXT); \
188 })
189
190 #define BPF_STRUCT_OPS(name, args...) \
191 SEC("struct_ops/"#name) \
192 BPF_PROG(name, ##args)
193
194 #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \
195 SEC("struct_ops.s/"#name) \
196 BPF_PROG(name, ##args)
197
198 /**
199 * RESIZABLE_ARRAY - Generates annotations for an array that may be resized
200 * @elfsec: the data section of the BPF program in which to place the array
201 * @arr: the name of the array
202 *
203 * libbpf has an API for setting map value sizes. Since data sections (i.e.
204 * bss, data, rodata) themselves are maps, a data section can be resized. If
205 * a data section has an array as its last element, the BTF info for that
206 * array will be adjusted so that length of the array is extended to meet the
207 * new length of the data section. This macro annotates an array to have an
208 * element count of one with the assumption that this array can be resized
209 * within the userspace program. It also annotates the section specifier so
210 * this array exists in a custom sub data section which can be resized
211 * independently.
212 *
213 * See RESIZE_ARRAY() for the userspace convenience macro for resizing an
214 * array declared with RESIZABLE_ARRAY().
215 */
216 #define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr)
217
218 /**
219 * MEMBER_VPTR - Obtain the verified pointer to a struct or array member
220 * @base: struct or array to index
221 * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...)
222 *
223 * The verifier often gets confused by the instruction sequence the compiler
224 * generates for indexing struct fields or arrays. This macro forces the
225 * compiler to generate a code sequence which first calculates the byte offset,
226 * checks it against the struct or array size and add that byte offset to
227 * generate the pointer to the member to help the verifier.
228 *
229 * Ideally, we want to abort if the calculated offset is out-of-bounds. However,
230 * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller
231 * must check for %NULL and take appropriate action to appease the verifier. To
232 * avoid confusing the verifier, it's best to check for %NULL and dereference
233 * immediately.
234 *
235 * vptr = MEMBER_VPTR(my_array, [i][j]);
236 * if (!vptr)
237 * return error;
238 * *vptr = new_value;
239 *
240 * sizeof(@base) should encompass the memory area to be accessed and thus can't
241 * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
242 * `MEMBER_VPTR(ptr, ->member)`.
243 */
244 #ifndef MEMBER_VPTR
245 #define MEMBER_VPTR(base, member) (typeof((base) member) *) \
246 ({ \
247 u64 __base = (u64)&(base); \
248 u64 __addr = (u64)&((base) member) - __base; \
249 _Static_assert(sizeof(base) >= sizeof((base) member), \
250 "@base is smaller than @member, is @base a pointer?"); \
251 asm volatile ( \
252 "if %0 <= %[max] goto +2\n" \
253 "%0 = 0\n" \
254 "goto +1\n" \
255 "%0 += %1\n" \
256 : "+r"(__addr) \
257 : "r"(__base), \
258 [max]"i"(sizeof(base) - sizeof((base) member))); \
259 __addr; \
260 })
261 #endif /* MEMBER_VPTR */
262
263 /**
264 * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
265 * @arr: array to index into
266 * @i: array index
267 * @n: number of elements in array
268 *
269 * Similar to MEMBER_VPTR() but is intended for use with arrays where the
270 * element count needs to be explicit.
271 * It can be used in cases where a global array is defined with an initial
 * size but is intended to be resized before loading the BPF program.
273 * Without this version of the macro, MEMBER_VPTR() will use the compile time
274 * size of the array to compute the max, which will result in rejection by
275 * the verifier.
276 */
277 #ifndef ARRAY_ELEM_PTR
278 #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \
279 ({ \
280 u64 __base = (u64)arr; \
281 u64 __addr = (u64)&(arr[i]) - __base; \
282 asm volatile ( \
283 "if %0 <= %[max] goto +2\n" \
284 "%0 = 0\n" \
285 "goto +1\n" \
286 "%0 += %1\n" \
287 : "+r"(__addr) \
288 : "r"(__base), \
289 [max]"r"(sizeof(arr[0]) * ((n) - 1))); \
290 __addr; \
291 })
292 #endif /* ARRAY_ELEM_PTR */
293
294 /**
295 * __sink - Hide @expr's value from the compiler and BPF verifier
296 * @expr: The expression whose value should be opacified
297 *
298 * No-op at runtime. The empty inline assembly with a read-write constraint
299 * ("+g") has two effects at compile/verify time:
300 *
301 * 1. Compiler: treats @expr as both read and written, preventing dead-code
302 * elimination and keeping @expr (and any side effects that produced it)
303 * alive.
304 *
305 * 2. BPF verifier: forgets the precise value/range of @expr ("makes it
306 * imprecise"). The verifier normally tracks exact ranges for every register
307 * and stack slot. While useful, precision means each distinct value creates a
308 * separate verifier state. Inside loops this leads to state explosion - each
309 * iteration carries different precise values so states never merge and the
310 * verifier explores every iteration individually.
311 *
312 * Example - preventing loop state explosion::
313 *
314 * u32 nr_intersects = 0, nr_covered = 0;
315 * __sink(nr_intersects);
316 * __sink(nr_covered);
317 * bpf_for(i, 0, nr_nodes) {
318 * if (intersects(cpumask, node_mask[i]))
319 * nr_intersects++;
320 * if (covers(cpumask, node_mask[i]))
321 * nr_covered++;
322 * }
323 *
324 * Without __sink(), the verifier tracks every possible (nr_intersects,
325 * nr_covered) pair across iterations, causing "BPF program is too large". With
326 * __sink(), the values become unknown scalars so all iterations collapse into
327 * one reusable state.
328 *
329 * Example - keeping a reference alive::
330 *
331 * struct task_struct *t = bpf_task_acquire(task);
332 * __sink(t);
333 *
334 * Follows the convention from BPF selftests (bpf_misc.h).
335 */
336 #define __sink(expr) asm volatile ("" : "+g"(expr))
337
338 /*
339 * BPF declarations and helpers
340 */
341
342 /* list and rbtree */
343 #define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
344 #define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
345
346 void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
347 void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
348
349 #define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
350 #define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)
351
352 int bpf_list_push_front_impl(struct bpf_list_head *head,
353 struct bpf_list_node *node,
354 void *meta, __u64 off) __ksym;
355 #define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0)
356
357 int bpf_list_push_back_impl(struct bpf_list_head *head,
358 struct bpf_list_node *node,
359 void *meta, __u64 off) __ksym;
360 #define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0)
361
362 struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
363 struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
364 struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
365 struct bpf_rb_node *node) __ksym;
366 int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
367 bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
368 void *meta, __u64 off) __ksym;
369 #define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
370
371 struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
372
373 void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
374 #define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
375
376 /* task */
377 struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
378 struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
379 void bpf_task_release(struct task_struct *p) __ksym;
380
381 /* cgroup */
382 struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
383 struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) __ksym;
384 void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
385 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
386
387 /* css iteration */
388 struct bpf_iter_css;
389 struct cgroup_subsys_state;
390 extern int bpf_iter_css_new(struct bpf_iter_css *it,
391 struct cgroup_subsys_state *start,
392 unsigned int flags) __weak __ksym;
393 extern struct cgroup_subsys_state *
394 bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
395 extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
396
397 /* cpumask */
398 struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
399 struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
400 void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
401 u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
402 u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
403 void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
404 void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
405 bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
406 bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
407 bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
408 void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym;
409 void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym;
410 bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
411 const struct cpumask *src2) __ksym;
412 void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1,
413 const struct cpumask *src2) __ksym;
414 void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1,
415 const struct cpumask *src2) __ksym;
416 bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym;
417 bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym;
418 bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym;
419 bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym;
420 bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym;
421 void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym;
422 u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
423 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
424 const struct cpumask *src2) __ksym;
425 u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
426
427 int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) __ksym;
428 int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym;
429 void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym;
430
431 #define def_iter_struct(name) \
432 struct bpf_iter_##name { \
433 struct bpf_iter_bits it; \
434 const struct cpumask *bitmap; \
435 };
436
437 #define def_iter_new(name) \
438 static inline int bpf_iter_##name##_new( \
439 struct bpf_iter_##name *it, const u64 *unsafe_ptr__ign, u32 nr_words) \
440 { \
441 it->bitmap = scx_bpf_get_##name##_cpumask(); \
442 return bpf_iter_bits_new(&it->it, (const u64 *)it->bitmap, \
443 sizeof(struct cpumask) / 8); \
444 }
445
446 #define def_iter_next(name) \
447 static inline int *bpf_iter_##name##_next(struct bpf_iter_##name *it) { \
448 return bpf_iter_bits_next(&it->it); \
449 }
450
451 #define def_iter_destroy(name) \
452 static inline void bpf_iter_##name##_destroy(struct bpf_iter_##name *it) { \
453 scx_bpf_put_cpumask(it->bitmap); \
454 bpf_iter_bits_destroy(&it->it); \
455 }
456 #define def_for_each_cpu(cpu, name) for_each_##name##_cpu(cpu)
457
458 /// Provides iterator for possible and online cpus.
459 ///
460 /// # Example
461 ///
462 /// ```
463 /// static inline void example_use() {
464 /// int *cpu;
465 ///
466 /// for_each_possible_cpu(cpu){
467 /// bpf_printk("CPU %d is possible", *cpu);
468 /// }
469 ///
470 /// for_each_online_cpu(cpu){
471 /// bpf_printk("CPU %d is online", *cpu);
472 /// }
473 /// }
474 /// ```
475 def_iter_struct(possible);
476 def_iter_new(possible);
477 def_iter_next(possible);
478 def_iter_destroy(possible);
479 #define for_each_possible_cpu(cpu) bpf_for_each(possible, cpu, NULL, 0)
480
481 def_iter_struct(online);
482 def_iter_new(online);
483 def_iter_next(online);
484 def_iter_destroy(online);
485 #define for_each_online_cpu(cpu) bpf_for_each(online, cpu, NULL, 0)
486
/*
 * cast_mask - Reinterpret a bpf_cpumask as a plain cpumask for read-only
 * access (typically to check bits via the bpf_cpumask_* test kfuncs).
 */
static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
{
	const struct cpumask *ro_mask = (const struct cpumask *)mask;

	return ro_mask;
}
494
495 /*
496 * Return true if task @p cannot migrate to a different CPU, false
497 * otherwise.
498 */
is_migration_disabled(const struct task_struct * p)499 static inline bool is_migration_disabled(const struct task_struct *p)
500 {
501 /*
502 * Testing p->migration_disabled in a BPF code is tricky because the
503 * migration is _always_ disabled while running the BPF code.
504 * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF
505 * code execution disable and re-enable the migration of the current
506 * task, respectively. So, the _current_ task of the sched_ext ops is
507 * always migration-disabled. Moreover, p->migration_disabled could be
508 * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is
509 * executed in the middle of the other BPF code execution.
510 *
511 * Therefore, we should decide that the _current_ task is
512 * migration-disabled only when its migration_disabled count is greater
513 * than one. In other words, when p->migration_disabled == 1, there is
514 * an ambiguity, so we should check if @p is the current task or not.
515 */
516 if (bpf_core_field_exists(p->migration_disabled)) {
517 if (p->migration_disabled == 1)
518 return bpf_get_current_task_btf() != p;
519 else
520 return p->migration_disabled;
521 }
522 return false;
523 }
524
525 /* rcu */
526 void bpf_rcu_read_lock(void) __ksym;
527 void bpf_rcu_read_unlock(void) __ksym;
528
529 /*
530 * Time helpers, most of which are from jiffies.h.
531 */
532
/**
 * time_delta - Clamped difference between two timestamps
 * @after: later timestamp as u64
 * @before: earlier timestamp as u64
 *
 * Return: @after - @before when positive, otherwise 0 (never negative).
 */
static inline s64 time_delta(u64 after, u64 before)
{
	s64 delta = (s64)(after - before);

	return delta > 0 ? delta : 0;
}
544
/**
 * time_after - returns true if time @a is after time @b.
 * @a: first comparable as u64
 * @b: second comparable as u64
 *
 * Testing only the sign of the wrapped difference keeps the comparison
 * correct across u64 wraparound (same trick as kernel jiffies.h).
 *
 * Return: %true if time @a is after time @b, otherwise %false.
 */
static inline bool time_after(u64 a, u64 b)
{
	s64 diff = (s64)(b - a);

	return diff < 0;
}
560
/**
 * time_before - returns true if time @a is before time @b.
 * @a: first comparable as u64
 * @b: second comparable as u64
 *
 * Open-coded sign test of the wrapped difference; equivalent to
 * time_after(@b, @a) and wraparound-safe.
 *
 * Return: %true if time @a is before time @b, otherwise %false.
 */
static inline bool time_before(u64 a, u64 b)
{
	s64 diff = (s64)(a - b);

	return diff < 0;
}
572
/**
 * time_after_eq - returns true if the time @a is after or the same as time @b.
 * @a: first comparable as u64
 * @b: second comparable as u64
 *
 * Return: %true if time @a is after or the same as time @b, otherwise %false.
 */
static inline bool time_after_eq(u64 a, u64 b)
{
	s64 diff = (s64)(a - b);

	return diff >= 0;
}
584
/**
 * time_before_eq - returns true if the time @a is before or the same as time @b.
 * @a: first comparable as u64
 * @b: second comparable as u64
 *
 * Open-coded equivalent of time_after_eq(@b, @a); wraparound-safe.
 *
 * Return: %true if time @a is before or the same as time @b, otherwise %false.
 */
static inline bool time_before_eq(u64 a, u64 b)
{
	s64 diff = (s64)(b - a);

	return diff >= 0;
}
596
/**
 * time_in_range - Calculate whether @a is in the range of [@b, @c] (inclusive).
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Both bounds are checked via the sign of the wrapped difference, matching
 * time_after_eq(@a, @b) && time_before_eq(@a, @c).
 *
 * Return: %true if time @a is in the range [@b, @c], otherwise %false.
 */
static inline bool time_in_range(u64 a, u64 b, u64 c)
{
	return (s64)(a - b) >= 0 && (s64)(c - a) >= 0;
}
609
/**
 * time_in_range_open - Calculate whether @a is in the range of [@b, @c)
 * (end-exclusive).
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Open-coded equivalent of time_after_eq(@a, @b) && time_before(@a, @c).
 *
 * Return: %true if time @a is in the range [@b, @c), otherwise %false.
 */
static inline bool time_in_range_open(u64 a, u64 b, u64 c)
{
	return (s64)(a - b) >= 0 && (s64)(a - c) < 0;
}
622
623
624 /*
625 * Other helpers
626 */
627
628 /* useful compiler attributes */
629 #ifndef likely
630 #define likely(x) __builtin_expect(!!(x), 1)
631 #endif
632 #ifndef unlikely
633 #define unlikely(x) __builtin_expect(!!(x), 0)
634 #endif
635 #ifndef __maybe_unused
636 #define __maybe_unused __attribute__((__unused__))
637 #endif
638
639 /*
640 * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
641 * prevent compiler from caching, redoing or reordering reads or writes.
642 */
643 typedef __u8 __attribute__((__may_alias__)) __u8_alias_t;
644 typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
645 typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
646 typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
647
/*
 * Size-dispatched volatile load into *res. For the power-of-two sizes the
 * access goes through a may_alias type so the compiler emits exactly one
 * load; odd sizes fall back to memcpy bracketed by compiler barriers.
 */
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
	if (size == 1)
		*(__u8_alias_t *)res = *(volatile __u8_alias_t *)p;
	else if (size == 2)
		*(__u16_alias_t *)res = *(volatile __u16_alias_t *)p;
	else if (size == 4)
		*(__u32_alias_t *)res = *(volatile __u32_alias_t *)p;
	else if (size == 8)
		*(__u64_alias_t *)res = *(volatile __u64_alias_t *)p;
	else {
		barrier();
		__builtin_memcpy((void *)res, (const void *)p, size);
		barrier();
	}
}
661
/*
 * Size-dispatched volatile store of *res into *p; mirror image of
 * __read_once_size(). Power-of-two sizes become a single store through a
 * may_alias type, anything else a barrier-bracketed memcpy.
 */
static __always_inline void __write_once_size(volatile void *p, void *res, int size)
{
	if (size == 1)
		*(volatile __u8_alias_t *)p = *(__u8_alias_t *)res;
	else if (size == 2)
		*(volatile __u16_alias_t *)p = *(__u16_alias_t *)res;
	else if (size == 4)
		*(volatile __u32_alias_t *)p = *(__u32_alias_t *)res;
	else if (size == 8)
		*(volatile __u64_alias_t *)p = *(__u64_alias_t *)res;
	else {
		barrier();
		__builtin_memcpy((void *)p, (const void *)res, size);
		barrier();
	}
}
675
676 /*
677 * __unqual_typeof(x) - Declare an unqualified scalar type, leaving
678 * non-scalar types unchanged,
679 *
680 * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char'
681 * is not type-compatible with 'signed char', and we define a separate case.
682 *
683 * This is copied verbatim from kernel's include/linux/compiler_types.h, but
684 * with default expression (for pointers) changed from (x) to (typeof(x)0).
685 *
686 * This is because LLVM has a bug where for lvalue (x), it does not get rid of
687 * an extra address_space qualifier, but does in case of rvalue (typeof(x)0).
688 * Hence, for pointers, we need to create an rvalue expression to get the
689 * desired type. See https://github.com/llvm/llvm-project/issues/53400.
690 */
691 #define __scalar_type_to_expr_cases(type) \
692 unsigned type : (unsigned type)0, signed type : (signed type)0
693
694 #define __unqual_typeof(x) \
695 typeof(_Generic((x), \
696 char: (char)0, \
697 __scalar_type_to_expr_cases(char), \
698 __scalar_type_to_expr_cases(short), \
699 __scalar_type_to_expr_cases(int), \
700 __scalar_type_to_expr_cases(long), \
701 __scalar_type_to_expr_cases(long long), \
702 default: (typeof(x))0))
703
704 #define READ_ONCE(x) \
705 ({ \
706 union { __unqual_typeof(x) __val; char __c[1]; } __u = \
707 { .__c = { 0 } }; \
708 __read_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x)); \
709 __u.__val; \
710 })
711
712 #define WRITE_ONCE(x, val) \
713 ({ \
714 union { __unqual_typeof(x) __val; char __c[1]; } __u = \
715 { .__val = (val) }; \
716 __write_once_size((__unqual_typeof(x) *)&(x), __u.__c, sizeof(x)); \
717 __u.__val; \
718 })
719
720 /*
721 * __calc_avg - Calculate exponential weighted moving average (EWMA) with
722 * @old and @new values. @decay represents how large the @old value remains.
723 * With a larger @decay value, the moving average changes slowly, exhibiting
724 * fewer fluctuations.
725 */
726 #define __calc_avg(old, new, decay) ({ \
727 typeof(decay) thr = 1 << (decay); \
728 typeof(old) ret; \
729 if (((old) < thr) || ((new) < thr)) { \
730 if (((old) == 1) && ((new) == 0)) \
731 ret = 0; \
732 else \
733 ret = ((old) - ((old) >> 1)) + ((new) >> 1); \
734 } else { \
735 ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \
736 } \
737 ret; \
738 })
739
/*
 * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
 * @v: The value for which we're computing the base 2 logarithm.
 *
 * Branchless binary-search: each step tests whether the value still exceeds
 * a power-of-two threshold, shifts it down accordingly and accumulates the
 * shift amount into the result.
 */
static inline u32 log2_u32(u32 v)
{
	u32 r, s;

	r = (v > 0xFFFF) << 4;
	v >>= r;
	s = (v > 0xFF) << 3;
	v >>= s;
	r |= s;
	s = (v > 0xF) << 2;
	v >>= s;
	r |= s;
	s = (v > 0x3) << 1;
	v >>= s;
	r |= s;
	return r | (v >> 1);
}
756
/*
 * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value.
 * @v: The value for which we're computing the base 2 logarithm.
 *
 * NOTE: the result is offset by one relative to log2_u32(), i.e. it yields
 * floor(log2(v)) + 1 (preserved from the original implementation).
 *
 * The 32-bit log is open-coded here on whichever half of @v is significant,
 * then biased by 33 (high half set) or 1 (low half only).
 */
static inline u32 log2_u64(u64 v)
{
	u32 w = (u32)(v >> 32);
	u32 bias = 33;
	u32 r, s;

	if (!w) {
		w = (u32)v;
		bias = 1;
	}

	r = (w > 0xFFFF) << 4;
	w >>= r;
	s = (w > 0xFF) << 3;
	w >>= s;
	r |= s;
	s = (w > 0xF) << 2;
	w >>= s;
	r |= s;
	s = (w > 0x3) << 1;
	w >>= s;
	r |= s;
	r |= w >> 1;

	return r + bias;
}
769
/*
 * __sqrt_u64 - Integer square root of @x using at most eight Newton
 * iterations, starting from @x clamped to 2^32 (an upper bound on the root
 * of any u64).
 *
 * NOTE(review): eight iterations may stop short of full convergence for very
 * large @x; the cap is kept as-is to preserve existing behavior.
 */
static inline u64 __sqrt_u64(u64 x)
{
	u64 guess;
	int iter;

	if (x < 2)
		return x;

	guess = (x < (1ULL << 32)) ? x : (1ULL << 32);

	for (iter = 0; iter < 8; iter++) {
		u64 quot = x / guess;

		if (guess <= quot)
			break;
		guess = (guess + quot) >> 1;
	}
	return guess;
}
788
/*
 * ctzll - Count trailing zeros in an unsigned long long. If the input value
 * is zero, the return value is undefined (the software fallback yields -1).
 */
static inline int ctzll(u64 v)
{
#if (!defined(__BPF__) && defined(__SCX_TARGET_ARCH_x86)) || \
    (defined(__BPF__) && defined(__clang_major__) && __clang_major__ >= 19)
	/*
	 * The ctz builtin is usable when (1) building natively for x86, or
	 * (2) building for BPF with clang >= 19. Earlier clang versions hit
	 * "unimplemented opcode" in the BPF backend for this intrinsic.
	 */
	return __builtin_ctzll(v);
#else
	/*
	 * Software fallback using De Bruijn multiplication: isolate the
	 * least significant set bit, multiply by a 64-bit De Bruijn constant
	 * so the top six bits form a signature unique to that bit position,
	 * and map the signature to the position via a precomputed table.
	 * The De Bruijn property guarantees the mapping is collision-free.
	 *
	 * https://en.wikipedia.org/wiki/De_Bruijn_sequence
	 * https://www.chessprogramming.org/BitScan#De_Bruijn_Multiplication
	 */
	const int debruijn_pos[64] = {
		0, 1, 48, 2, 57, 49, 28, 3, 61, 58, 50, 42, 38, 29, 17, 4,
		62, 55, 59, 36, 53, 51, 43, 22, 45, 39, 33, 30, 24, 18, 12, 5,
		63, 47, 56, 27, 60, 41, 37, 16, 54, 35, 52, 21, 44, 32, 23, 11,
		46, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
	};
	const u64 debruijn = 0x03f79d71b4cb0a89ULL;
	const int *pos;
	unsigned int idx;
	u64 lsb;

	if (v == 0)
		return -1;

	/* v & -v isolates the least significant set bit */
	lsb = v & -v;

	/* unique top-6-bit signature of the isolated bit */
	idx = (lsb * debruijn) >> 58;

	/* MEMBER_VPTR() bounds-checks the table index for the BPF verifier */
	pos = MEMBER_VPTR(debruijn_pos, [idx]);
	return pos ? *pos : -1;
#endif
}
855
856 /*
857 * Return a value proportionally scaled to the task's weight.
858 */
scale_by_task_weight(const struct task_struct * p,u64 value)859 static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value)
860 {
861 return (value * p->scx.weight) / 100;
862 }
863
864 /*
865 * Return a value inversely proportional to the task's weight.
866 */
scale_by_task_weight_inverse(const struct task_struct * p,u64 value)867 static inline u64 scale_by_task_weight_inverse(const struct task_struct *p, u64 value)
868 {
869 return value * 100 / p->scx.weight;
870 }
871
872
873 /*
874 * Get a random u64 from the kernel's pseudo-random generator.
875 */
get_prandom_u64()876 static inline u64 get_prandom_u64()
877 {
878 return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32();
879 }
880
/*
 * Define the shadow structure to avoid a compilation error when
 * vmlinux.h does not enable necessary kernel configs. The ___local
 * suffix is a CO-RE convention that tells the loader to match this
 * against the base struct rq in the kernel. The attribute
 * preserve_access_index tells the compiler to generate a CO-RE
 * relocation for these fields, so their offsets are resolved against
 * the running kernel's BTF at load time. Only the fields read by the
 * scx_clock_*() helpers below are declared here.
 */
struct rq___local {
	/*
	 * A monotonically increasing clock per CPU. It is rq->clock minus
	 * cumulative IRQ time and hypervisor steal time. Unlike rq->clock,
	 * it does not advance during IRQ processing or hypervisor preemption.
	 * It does advance during idle (the idle task counts as a running task
	 * for this purpose).
	 */
	u64 clock_task;
	/*
	 * Invariant version of clock_task scaled by CPU capacity and
	 * frequency. For example, clock_pelt advances 2x slower on a CPU
	 * with half the capacity.
	 *
	 * At idle exit, rq->clock_pelt jumps forward to resync with
	 * clock_task. The kernel's rq_clock_pelt() corrects for this jump
	 * by subtracting lost_idle_time, yielding a clock that appears
	 * continuous across idle transitions. scx_clock_pelt() mirrors
	 * rq_clock_pelt() by performing the same subtraction.
	 */
	u64 clock_pelt;
	/*
	 * Accumulates the magnitude of each clock_pelt jump at idle exit.
	 * Subtracting this from clock_pelt gives rq_clock_pelt(): a
	 * continuous, capacity-invariant clock suitable for both task
	 * execution time stamping and cross-idle measurements.
	 */
	unsigned long lost_idle_time;
	/*
	 * Shadow of paravirt_steal_clock() (the hypervisor's cumulative
	 * stolen time counter). Stays frozen while the hypervisor preempts
	 * the vCPU; catches up the next time update_rq_clock_task() is
	 * called. The delta is the stolen time not yet subtracted from
	 * clock_task.
	 *
	 * Unlike irqtime->total (a plain kernel-side field), the live stolen
	 * time counter lives in hypervisor-specific shared memory and has no
	 * kernel-side equivalent readable from BPF in a hypervisor-agnostic
	 * way. This field is therefore the only portable BPF-accessible
	 * approximation of cumulative steal time.
	 *
	 * Available only when CONFIG_PARAVIRT_TIME_ACCOUNTING is on.
	 */
	u64 prev_steal_time_rq;
} __attribute__((preserve_access_index));
934
935 extern struct rq runqueues __ksym;
936
/*
 * Define the shadow structure to avoid a compilation error when
 * vmlinux.h does not enable necessary kernel configs. As with
 * struct rq___local, the ___local suffix makes CO-RE match this
 * declaration against the kernel's own struct irqtime, and
 * preserve_access_index emits CO-RE relocations for field accesses.
 */
struct irqtime___local {
	/*
	 * Cumulative IRQ time counter for this CPU, in nanoseconds. Advances
	 * immediately at the exit of every hardirq and non-ksoftirqd softirq
	 * via irqtime_account_irq(). ksoftirqd time is counted as normal
	 * task time and is NOT included. NMI time is also NOT included.
	 *
	 * The companion field irqtime->sync (struct u64_stats_sync) protects
	 * against 64-bit tearing on 32-bit architectures. On 64-bit kernels,
	 * u64_stats_sync is an empty struct and all seqcount operations are
	 * no-ops, so a plain BPF_CORE_READ of this field is safe.
	 *
	 * Available only when CONFIG_IRQ_TIME_ACCOUNTING is on.
	 */
	u64 total;
} __attribute__((preserve_access_index));
957
958 /*
959 * cpu_irqtime is a per-CPU variable defined only when
960 * CONFIG_IRQ_TIME_ACCOUNTING is on. Declare it as __weak so the BPF
961 * loader sets its address to 0 (rather than failing) when the symbol
962 * is absent from the running kernel.
963 */
964 extern struct irqtime___local cpu_irqtime __ksym __weak;
965
static inline struct rq___local *get_current_rq(u32 cpu)
{
	/*
	 * Workaround for obtaining an rq pointer now that
	 * scx_bpf_cpu_rq() has been deprecated.
	 *
	 * WARNING: the caller must already hold the rq lock for @cpu.
	 * That holds inside scheduling callbacks (ops.running,
	 * ops.stopping, ops.enqueue, ops.dequeue, ops.dispatch, etc.).
	 * BPF offers no runtime check of kernel spinlock state, so
	 * correctness rests entirely on the calling context.
	 */
	struct rq___local *rq = (void *)bpf_per_cpu_ptr(&runqueues, cpu);

	return rq;
}
980
scx_clock_task(u32 cpu)981 static inline u64 scx_clock_task(u32 cpu)
982 {
983 struct rq___local *rq = get_current_rq(cpu);
984
985 /* Equivalent to the kernel's rq_clock_task(). */
986 return rq ? rq->clock_task : 0;
987 }
988
scx_clock_pelt(u32 cpu)989 static inline u64 scx_clock_pelt(u32 cpu)
990 {
991 struct rq___local *rq = get_current_rq(cpu);
992
993 /*
994 * Equivalent to the kernel's rq_clock_pelt(): subtracts
995 * lost_idle_time from clock_pelt to absorb the jump that occurs
996 * when clock_pelt resyncs with clock_task at idle exit. The result
997 * is a continuous, capacity-invariant clock safe for both task
998 * execution time stamping and cross-idle measurements.
999 */
1000 return rq ? (rq->clock_pelt - rq->lost_idle_time) : 0;
1001 }
1002
scx_clock_virt(u32 cpu)1003 static inline u64 scx_clock_virt(u32 cpu)
1004 {
1005 struct rq___local *rq;
1006
1007 /*
1008 * Check field existence before calling get_current_rq() so we avoid
1009 * the per_cpu lookup entirely on kernels built without
1010 * CONFIG_PARAVIRT_TIME_ACCOUNTING.
1011 */
1012 if (!bpf_core_field_exists(((struct rq___local *)0)->prev_steal_time_rq))
1013 return 0;
1014
1015 /* Lagging shadow of the kernel's paravirt_steal_clock(). */
1016 rq = get_current_rq(cpu);
1017 return rq ? BPF_CORE_READ(rq, prev_steal_time_rq) : 0;
1018 }
1019
scx_clock_irq(u32 cpu)1020 static inline u64 scx_clock_irq(u32 cpu)
1021 {
1022 struct irqtime___local *irqt;
1023
1024 /*
1025 * bpf_core_type_exists() resolves at load time: if struct irqtime is
1026 * absent from kernel BTF (CONFIG_IRQ_TIME_ACCOUNTING off), the loader
1027 * patches this into an unconditional return 0, making the
1028 * bpf_per_cpu_ptr() call below dead code that the verifier never sees.
1029 */
1030 if (!bpf_core_type_exists(struct irqtime___local))
1031 return 0;
1032
1033 /* Equivalent to the kernel's irq_time_read(). */
1034 irqt = bpf_per_cpu_ptr(&cpu_irqtime, cpu);
1035 return irqt ? BPF_CORE_READ(irqt, total) : 0;
1036 }
1037
1038 #include "compat.bpf.h"
1039 #include "enums.bpf.h"
1040
1041 #endif /* __SCX_COMMON_BPF_H */
1042