// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE   0x0001
#define TASK_UNINTERRUPTIBLE 0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

#define MAX_CPUS       4096
#define MAX_OFFCPU_LEN 37

// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1
struct __stack {
        u64 array[MAX_STACKS];
};

struct tstamp_data {
        __u32 stack_id;
        __u32 state;
        __u64 timestamp;
        struct __stack stack;
};

struct offcpu_key {
        __u32 pid;
        __u32 tgid;
        __u32 stack_id;
        __u32 state;
        __u64 cgroup_id;
};

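/* user stack traces saved at sched-out, keyed by the id returned from bpf_get_stackid() */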
struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, MAX_STACKS * sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct offcpu_data {
        u64 array[MAX_OFFCPU_LEN];
};

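/* perf event array used to emit direct off-cpu samples to the perf ring buffer */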
struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));
        __uint(max_entries, MAX_CPUS);
} offcpu_output SEC(".maps");

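/* per-CPU scratch buffer for assembling one off-cpu sample before it is output */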
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct offcpu_data));
        __uint(max_entries, 1);
} offcpu_payload SEC(".maps");

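/* per-task storage recording the sched-out timestamp, state and user stack */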
struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct tstamp_data);
} tstamp SEC(".maps");

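/* accumulated off-cpu time in nanoseconds, keyed by struct offcpu_key */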
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(struct offcpu_key));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

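/* optional filters populated by user space; a key being present means the cpu/task/cgroup is accepted */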
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
        long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
        long state;
} __attribute__((preserve_access_index));

int enabled = 0;

const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

__u64 offcpu_thresh_ns;

/*
 * Old kernels used to call it task_struct->state; now it's '__state'.
 * Use the BPF CO-RE "ignored suffix rule" to deal with it as described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
        /* recast pointer to capture new type for compiler */
        struct task_struct___new *t_new = (void *)t;

        if (bpf_core_field_exists(t_new->__state)) {
                return BPF_CORE_READ(t_new, __state);
        } else {
                /* recast pointer to capture old type for compiler */
                struct task_struct___old *t_old = (void *)t;

                return BPF_CORE_READ(t_old, state);
        }
}

static inline __u64 get_cgroup_id(struct task_struct *t)
{
        struct cgroup *cgrp;

        if (!uses_cgroup_v1)
                return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

        if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
                perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
                                                     perf_event_cgrp_id);
#else
                perf_subsys_id = perf_event_cgrp_id;
#endif
        }

        cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
        return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
        /* kernel threads don't have user stack */
        if (t->flags & PF_KTHREAD)
                return 0;

        if (state != TASK_INTERRUPTIBLE &&
            state != TASK_UNINTERRUPTIBLE)
                return 0;

        if (has_cpu) {
                __u32 cpu = bpf_get_smp_processor_id();
                __u8 *ok;

                ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
                if (!ok)
                        return 0;
        }

        if (has_task) {
                __u8 *ok;
                __u32 pid;

                if (uses_tgid)
                        pid = t->tgid;
                else
                        pid = t->pid;

                ok = bpf_map_lookup_elem(&task_filter, &pid);
                if (!ok)
                        return 0;
        }

        if (has_cgroup) {
                __u8 *ok;
                __u64 cgrp_id = get_cgroup_id(t);

                ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
                if (!ok)
                        return 0;
        }

        return 1;
}

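/* copy the saved user stack into the sample payload, right after the callchain header */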
static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
{
        int len = 0;

        for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
                to->array[n + 2 + i] = from->array[i];

        return len;
}

/**
 * off_cpu_dump - dump off-cpu samples to ring buffer
 * @data: payload for dumping off-cpu samples
 * @key: off-cpu key identifying the sample
 * @stack: stack trace of the task before being scheduled out
 *
 * If the threshold of off-cpu time is reached, acquire the tid, period, callchain, and
 * cgroup id information of the task, and dump it as a raw sample to the perf ring buffer.
 */
static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
                        struct __stack *stack, __u64 delta)
{
        int n = 0, len = 0;

        data->array[n++] = (u64)key->tgid << 32 | key->pid;
        data->array[n++] = delta;

        /* data->array[n] is callchain->nr (updated later) */
        data->array[n + 1] = PERF_CONTEXT_USER;
        data->array[n + 2] = 0;
        len = copy_stack(stack, data, n);

        /* update length of callchain */
        data->array[n] = len + 1;
        n += len + 2;

        data->array[n++] = key->cgroup_id;

        return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
}

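/*
 * Record the sched-out timestamp and stack of @prev, and account the off-cpu time
 * of @next when it is scheduled back in.
 */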
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
                        struct task_struct *next, int state)
{
        __u64 ts;
        __u32 stack_id;
        struct tstamp_data *pelem;

        ts = bpf_ktime_get_ns();

        if (!can_record(prev, state))
                goto next;

        stack_id = bpf_get_stackid(ctx, &stacks,
                                   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

        pelem = bpf_task_storage_get(&tstamp, prev, NULL,
                                     BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (!pelem)
                goto next;

        pelem->timestamp = ts;
        pelem->state = state;
        pelem->stack_id = stack_id;

        /*
         * If stacks are successfully collected by bpf_get_stackid(), collect them once more
         * in task_storage for direct off-cpu sample dumping.
         */
        if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
                /*
                 * This empty if block avoids a 'result unused' warning from bpf_get_stack().
                 * If the collection fails, continue with the logic for the next task.
                 */
        }
next:
        pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

        if (pelem && pelem->timestamp) {
                struct offcpu_key key = {
                        .pid = next->pid,
                        .tgid = next->tgid,
                        .stack_id = pelem->stack_id,
                        .state = pelem->state,
                        .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
                };
                __u64 delta = ts - pelem->timestamp;
                __u64 *total;

                if (delta >= offcpu_thresh_ns) {
                        int zero = 0;
                        struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);

                        if (data)
                                off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
                } else {
                        total = bpf_map_lookup_elem(&off_cpu, &key);
                        if (total)
                                *total += delta;
                        else
                                bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
                }

                /* prevent reusing the timestamp later */
                pelem->timestamp = 0;
        }

        return 0;
}

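/*
 * Keep task_filter in sync: when a task already in the filter creates a new process
 * (not a thread, which shares the tgid), add the child's tgid to the filter too.
 */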
SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
        struct task_struct *task;
        u64 clone_flags;
        u32 pid;
        u8 val = 1;

        if (!uses_tgid)
                return 0;

        task = (struct task_struct *)bpf_get_current_task();

        pid = BPF_CORE_READ(task, tgid);
        if (!bpf_map_lookup_elem(&task_filter, &pid))
                return 0;

        task = (struct task_struct *)ctx[0];
        clone_flags = ctx[1];

        pid = task->tgid;
        if (!(clone_flags & CLONE_THREAD))
                bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

        return 0;
}

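/* sched_switch handler: derive the previous task's state and hand off to off_cpu_stat() */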
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
        struct task_struct *prev, *next;
        int prev_state;

        if (!enabled)
                return 0;

        prev = (struct task_struct *)ctx[1];
        next = (struct task_struct *)ctx[2];

        if (has_prev_state)
                prev_state = (int)ctx[3];
        else
                prev_state = get_task_state(prev);

        return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";