// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* for collect_lock_syms(). 4096 was rejected by the verifier */
#define MAX_CPUS	1024

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

/* callstack storage */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* buffer for owner stacktrace */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, 1);
} stack_buf SEC(".maps");

/* maps an owner stacktrace to an owner stack id */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64)); // owner stacktrace
	__uint(value_size, sizeof(__s32)); // owner stack id
	__uint(max_entries, 1);
} owner_stacks SEC(".maps");

/* maps a lock address to its owner tracing data */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64)); // lock address
	__uint(value_size, sizeof(struct owner_tracing_data));
	__uint(max_entries, 1);
} owner_data SEC(".maps");

/* maps a contention_key (which stores the owner stack id) to contention data */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, 1);
} owner_stat SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* maintain per-CPU timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct tstamp_data));
	__uint(max_entries, 1);
} tstamp_cpu SEC(".maps");

/* actual lock contention statistics */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

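/* task information (pid -> comm), filled in the LOCK_AGGR_TASK mode */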
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

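/* well-known lock addresses -> lock class flag (e.g. LOCK_CLASS_RQLOCK), filled by collect_lock_syms() */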
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

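/*
 * filter maps: an entry is present iff the CPU, task, lock type, lock
 * address, cgroup or slab cache is allowed to be recorded (see can_record())
 */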
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(long));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} slab_filter SEC(".maps");

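/* slab cache address -> name and compact id, filled by slab_cache_iter() */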
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(long));
	__uint(value_size, sizeof(struct slab_cache_data));
	__uint(max_entries, 1);
} slab_caches SEC(".maps");

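/*
 * old and new layouts of rw_semaphore and mm_struct; the one matching the
 * running kernel is selected at load time via CO-RE
 */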
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mm_struct___old {
	struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;

/* control flags */
const volatile int has_cpu;
const volatile int has_task;
const volatile int has_type;
const volatile int has_addr;
const volatile int has_cgroup;
const volatile int has_slab;
const volatile int needs_callstack;
const volatile int stack_skip;
const volatile int lock_owner;
const volatile int use_cgroup_v2;
const volatile int max_stack;

/* determine the key of lock stat */
const volatile int aggr_mode;

int enabled;

int perf_subsys_id = -1;

__u64 end_ts;

__u32 slab_cache_id;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

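/* kfuncs used for owner stack collection; they may be missing on older kernels, hence __weak */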
struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
void bpf_task_release(struct task_struct *p) __ksym __weak;

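/* get the cgroup id of the current task: the cgroup v2 id directly, or the perf_event subsystem's cgroup on v1 */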
static inline __u64 get_current_cgroup_id(void)
{
	struct task_struct *task;
	struct cgroup *cgrp;

	if (use_cgroup_v2)
		return bpf_get_current_cgroup_id();

	task = bpf_get_current_task_btf();

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

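/*
 * Apply all active filters.  Returns 1 if this contention event should be
 * recorded.  ctx[0] is the lock address and ctx[1] holds the LCB_F_* flags.
 */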
static inline int can_record(u64 *ctx)
{
	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = bpf_get_current_pid_tgid();

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_type) {
		__u8 *ok;
		__u32 flags = (__u32)ctx[1];

		ok = bpf_map_lookup_elem(&type_filter, &flags);
		if (!ok)
			return 0;
	}

	if (has_addr) {
		__u8 *ok;
		__u64 addr = ctx[0];

		ok = bpf_map_lookup_elem(&addr_filter, &addr);
		if (!ok && !has_slab)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp = get_current_cgroup_id();

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
		if (!ok)
			return 0;
	}

	if (has_slab && bpf_get_kmem_cache) {
		__u8 *ok;
		__u64 addr = ctx[0];
		long kmem_cache_addr;

		kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
		ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
		if (!ok)
			return 0;
	}

	return 1;
}

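/* save the comm of the given task into the task_data map, keyed by its pid */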
static inline int update_task_data(struct task_struct *task)
{
	struct contention_task_data *p;
	int pid, err;

	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
	if (err)
		return -1;

	p = bpf_map_lookup_elem(&task_data, &pid);
	if (p == NULL && !task_map_full) {
		struct contention_task_data data = {};

		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
			task_map_full = 1;
	}

	return 0;
}

#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

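/* read the owner task of a mutex or rwsem; the low bits of the owner word are flag bits and are masked off */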
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
	struct task_struct *task;
	__u64 owner = 0;

	if (flags & LCB_F_MUTEX) {
		struct mutex *mutex = (void *)lock;
		owner = BPF_CORE_READ(mutex, owner.counter);
	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
	/*
	 * Support for the BPF_TYPE_MATCHES argument to the
	 * __builtin_preserve_type_info builtin was added at some point during
	 * development of clang 15 and it's what is needed for
	 * bpf_core_type_matches.
	 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
		if (bpf_core_type_matches(struct rw_semaphore___old)) {
			struct rw_semaphore___old *rwsem = (void *)lock;
			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
			struct rw_semaphore___new *rwsem = (void *)lock;
			owner = BPF_CORE_READ(rwsem, owner.counter);
		}
#else
		/* assume new struct */
		struct rw_semaphore *rwsem = (void *)lock;
		owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
	}

	if (!owner)
		return NULL;

	task = (void *)(owner & ~7UL);
	return task;
}

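/* classify well-known locks (mmap_lock, siglock) by comparing the lock address with the current task's structures */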
static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
	struct task_struct *curr;
	struct mm_struct___old *mm_old;
	struct mm_struct___new *mm_new;
	struct sighand_struct *sighand;

	switch (flags) {
	case LCB_F_READ:  /* rwsem */
	case LCB_F_WRITE:
		curr = bpf_get_current_task_btf();
		if (curr->mm == NULL)
			break;
		mm_new = (void *)curr->mm;
		if (bpf_core_field_exists(mm_new->mmap_lock)) {
			if (&mm_new->mmap_lock == (void *)lock)
				return LCD_F_MMAP_LOCK;
			break;
		}
		mm_old = (void *)curr->mm;
		if (bpf_core_field_exists(mm_old->mmap_sem)) {
			if (&mm_old->mmap_sem == (void *)lock)
				return LCD_F_MMAP_LOCK;
		}
		break;
	case LCB_F_SPIN:  /* spinlock */
		curr = bpf_get_current_task_btf();
		sighand = curr->sighand;

		if (sighand && &sighand->siglock == (void *)lock)
			return LCD_F_SIGHAND_LOCK;
		break;
	default:
		break;
	}
	return 0;
}

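/* get the timestamp entry for this contention: a per-cpu slot for spinning locks, a pid-keyed hash entry for sleeping locks */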
static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
	__u32 pid;
	struct tstamp_data *pelem;

	/* Use per-cpu array map for spinlock and rwlock */
	if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
		__u32 idx = 0;

		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
		/* Do not update the element for nested locks */
		if (pelem && pelem->lock)
			pelem = NULL;
		return pelem;
	}

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	/* Do not update the element for nested locks */
	if (pelem && pelem->lock)
		return NULL;

	if (pelem == NULL) {
		struct tstamp_data zero = {};

		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}

		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (pelem == NULL) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}
	}
	return pelem;
}

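/* map an owner stacktrace to a small integer id, allocating a new id on first use */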
static inline s32 get_owner_stack_id(u64 *stacktrace)
{
	s32 *id, new_id;
	static s64 id_gen = 1;

	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
	if (id)
		return *id;

	new_id = (s32)__sync_fetch_and_add(&id_gen, 1);

	bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);

	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
	if (id)
		return *id;

	return -1;
}

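/* accumulate duration and count into an existing stat entry, tracking min/max */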
static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
{
	__sync_fetch_and_add(&data->total_time, duration);
	__sync_fetch_and_add(&data->count, count);

	/* FIXME: need atomic operations */
	if (data->max_time < duration)
		data->max_time = duration;
	if (data->min_time > duration)
		data->min_time = duration;
}

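/* update (or create) the owner_stat entry keyed by the owner stack id */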
static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
{
	struct contention_key key = {
		.stack_id = id,
		.pid = 0,
		.lock_addr_or_cgroup = 0,
	};
	struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);

	if (!data) {
		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = flags,
		};
		bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
	} else {
		update_contention_data(data, duration, 1);
	}
}

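/* ctx[0]: lock address, ctx[1]: LCB_F_* flags */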
SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
	struct tstamp_data *pelem;

	if (!enabled || !can_record(ctx))
		return 0;

	pelem = get_tstamp_elem(ctx[1]);
	if (pelem == NULL)
		return 0;

	pelem->timestamp = bpf_ktime_get_ns();
	pelem->lock = (__u64)ctx[0];
	pelem->flags = (__u32)ctx[1];

	if (needs_callstack) {
		u32 i = 0;
		u32 id = 0;
		int owner_pid;
		u64 *buf;
		struct task_struct *task;
		struct owner_tracing_data *otdata;

		if (!lock_owner)
			goto skip_owner;

		task = get_lock_owner(pelem->lock, pelem->flags);
		if (!task)
			goto skip_owner;

		owner_pid = BPF_CORE_READ(task, pid);

		buf = bpf_map_lookup_elem(&stack_buf, &i);
		if (!buf)
			goto skip_owner;
		for (i = 0; i < max_stack; i++)
			buf[i] = 0x0;

		if (!bpf_task_from_pid)
			goto skip_owner;

		task = bpf_task_from_pid(owner_pid);
		if (!task)
			goto skip_owner;

		bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
		bpf_task_release(task);

		otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
		id = get_owner_stack_id(buf);

		/*
		 * Contention has just begun, or (corner case) the lock is owned by
		 * a process other than `owner_pid`.  For the corner case, treat it
		 * as an unexpected internal error and ignore the previous tracing
		 * record.
		 */
		if (!otdata || otdata->pid != owner_pid) {
			struct owner_tracing_data first = {
				.pid = owner_pid,
				.timestamp = pelem->timestamp,
				.count = 1,
				.stack_id = id,
			};
			bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
		}
		/* Contention is ongoing and a new waiter joins */
		else {
			__sync_fetch_and_add(&otdata->count, 1);

			/*
			 * The owner is the same, but the stacktrace might have changed.
			 * In this case we store/update `owner_stat` based on the current
			 * owner stack id.
			 */
			if (id != otdata->stack_id) {
				update_owner_stat(id, pelem->timestamp - otdata->timestamp,
						  pelem->flags);

				otdata->timestamp = pelem->timestamp;
				otdata->stack_id = id;
			}
		}
skip_owner:
		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
						  BPF_F_FAST_STACK_CMP | stack_skip);
		if (pelem->stack_id < 0)
			__sync_fetch_and_add(&stack_fail, 1);
	} else if (aggr_mode == LOCK_AGGR_TASK) {
		struct task_struct *task;

		if (lock_owner) {
			task = get_lock_owner(pelem->lock, pelem->flags);

			/* The flags field is not used anymore.  Pass the owner pid instead. */
			if (task)
				pelem->flags = BPF_CORE_READ(task, pid);
			else
				pelem->flags = -1U;

		} else {
			task = bpf_get_current_task_btf();
		}

		if (task) {
			if (update_task_data(task) < 0 && lock_owner)
				pelem->flags = -1U;
		}
	}

	return 0;
}

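/* ctx[0]: lock address, ctx[1]: return value of the lock function */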
SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
	__u32 pid = 0, idx = 0;
	struct tstamp_data *pelem;
	struct contention_key key = {};
	struct contention_data *data;
	__u64 timestamp;
	__u64 duration;
	bool need_delete = false;

	if (!enabled)
		return 0;

	/*
	 * For spinlock and rwlock, the timestamp is kept in the per-cpu map.
	 * However, contention_end does not receive the flags, so it cannot tell
	 * whether to read the per-cpu map or the hash map.
	 *
	 * Try the per-cpu map first and check if there is an active contention.
	 * If there is, do not read the hash map, because the task cannot start
	 * waiting on sleeping locks before releasing its spinning locks.
	 */
	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
	if (pelem && pelem->lock) {
		if (pelem->lock != ctx[0])
			return 0;
	} else {
		pid = bpf_get_current_pid_tgid();
		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (!pelem || pelem->lock != ctx[0])
			return 0;
		need_delete = true;
	}

	timestamp = bpf_ktime_get_ns();
	duration = timestamp - pelem->timestamp;
	if ((__s64)duration < 0) {
		__sync_fetch_and_add(&time_fail, 1);
		goto out;
	}

	if (needs_callstack && lock_owner) {
		struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);

		if (!otdata)
			goto skip_owner;

		/* Update `owner_stat` */
		update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);

		/* No contention is occurring, delete `lock` entry in `owner_data` */
		if (otdata->count <= 1)
			bpf_map_delete_elem(&owner_data, &pelem->lock);
		/*
		 * Contention is still ongoing, with a new owner (current task). `owner_data`
		 * should be updated accordingly.
		 */
		else {
			u32 i = 0;
			s32 ret = (s32)ctx[1];
			u64 *buf;

			otdata->timestamp = timestamp;
			__sync_fetch_and_add(&otdata->count, -1);

			buf = bpf_map_lookup_elem(&stack_buf, &i);
			if (!buf)
				goto skip_owner;
			for (i = 0; i < (u32)max_stack; i++)
				buf[i] = 0x0;

			/*
			 * `ret` has the return code of the lock function.
			 * If `ret` is negative, the current task stopped waiting for the
			 * lock without acquiring it.  The owner is unchanged, but we
			 * still need to update the owner stack.
			 */
			if (ret < 0) {
				s32 id = 0;
				struct task_struct *task;

				if (!bpf_task_from_pid)
					goto skip_owner;

				task = bpf_task_from_pid(otdata->pid);
				if (!task)
					goto skip_owner;

				bpf_get_task_stack(task, buf,
						   max_stack * sizeof(unsigned long), 0);
				bpf_task_release(task);

				id = get_owner_stack_id(buf);

				/*
				 * If the owner stack has changed, update the owner stack id
				 * for this lock.
				 */
				if (id != otdata->stack_id)
					otdata->stack_id = id;
			}
			/*
			 * Otherwise, update the tracing data with the current task, which
			 * is the new owner.
			 */
			else {
				otdata->pid = pid;
				/*
				 * We don't want to retrieve the callstack here, since it is
				 * where the current task acquires the lock and provides no
				 * additional information.  We simply assign -1 to invalidate
				 * it.
				 */
				otdata->stack_id = -1;
			}
		}
	}
skip_owner:
	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_TASK:
		if (lock_owner)
			key.pid = pelem->flags;
		else {
			if (!need_delete)
				pid = bpf_get_current_pid_tgid();
			key.pid = pid;
		}
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_ADDR:
		key.lock_addr_or_cgroup = pelem->lock;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_CGROUP:
		key.lock_addr_or_cgroup = get_current_cgroup_id();
		break;
	default:
		/* should not happen */
		return 0;
	}

	data = bpf_map_lookup_elem(&lock_stat, &key);
	if (!data) {
		if (data_map_full) {
			__sync_fetch_and_add(&data_fail, 1);
			goto out;
		}

		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = pelem->flags,
		};
		int err;

		if (aggr_mode == LOCK_AGGR_ADDR) {
			first.flags |= check_lock_type(pelem->lock,
						       pelem->flags & LCB_F_TYPE_MASK);

			/* Check if it's from a slab object */
			if (bpf_get_kmem_cache) {
				struct kmem_cache *s;
				struct slab_cache_data *d;

				s = bpf_get_kmem_cache(pelem->lock);
				if (s != NULL) {
					/*
					 * Save the ID of the slab cache in the flags
					 * (instead of full address) to reduce the
					 * space in the contention_data.
					 */
					d = bpf_map_lookup_elem(&slab_caches, &s);
					if (d != NULL)
						first.flags |= d->id;
				}
			}
		}

		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
		if (err < 0) {
			if (err == -EEXIST) {
				/* it lost the race, try to get it again */
				data = bpf_map_lookup_elem(&lock_stat, &key);
				if (data != NULL)
					goto found;
			}
			if (err == -E2BIG)
				data_map_full = 1;
			__sync_fetch_and_add(&data_fail, 1);
		}
		goto out;
	}

found:
	update_contention_data(data, duration, 1);

out:
	pelem->lock = 0;
	if (need_delete)
		bpf_map_delete_elem(&tstamp, &pid);
	return 0;
}

extern struct rq runqueues __ksym;

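/* the run-queue lock field was renamed from 'lock' to '__lock'; support both layouts via CO-RE */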
struct rq___old {
	raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
	raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

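/* record the address of each CPU's run-queue lock in the lock_syms map */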
SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
	__u64 lock_addr, lock_off;
	__u32 lock_flag;

	if (bpf_core_field_exists(struct rq___new, __lock))
		lock_off = offsetof(struct rq___new, __lock);
	else
		lock_off = offsetof(struct rq___old, lock);

	for (int i = 0; i < MAX_CPUS; i++) {
		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

		if (rq == NULL)
			break;

		lock_addr = (__u64)(void *)rq + lock_off;
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}
	return 0;
}

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(end_timestamp)
{
	end_ts = bpf_ktime_get_ns();
	return 0;
}

/*
 * bpf_iter__kmem_cache was added recently, so old kernels don't have it in
 * their vmlinux.h.  But we cannot define it here since that would cause a
 * compiler error due to redefinition of the struct on newer kernels.
 *
 * So it uses a CO-RE trick to access the member only if it has the type.
 * This supports both old and new kernels without compiler errors.
 */
struct bpf_iter__kmem_cache___new {
	struct kmem_cache *s;
} __attribute__((preserve_access_index));

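/* assign a compact id to each slab cache and record its name, for the flags encoding in contention_data */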
SEC("iter/kmem_cache")
int slab_cache_iter(void *ctx)
{
	struct kmem_cache *s = NULL;
	struct slab_cache_data d;
	const char *nameptr;

	if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
		struct bpf_iter__kmem_cache___new *iter = ctx;

		s = iter->s;
	}

	if (s == NULL)
		return 0;

	nameptr = s->name;
	bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);

	d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
	if (d.id >= LCB_F_SLAB_ID_END)
		return 0;

	bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";