// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

/* callstack storage */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* buffer for owner stacktrace */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, 1);
} stack_buf SEC(".maps");

/* a map from owner stacktrace to owner stack id */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64)); // owner stacktrace
	__uint(value_size, sizeof(__s32)); // owner stack id
	__uint(max_entries, 1);
} owner_stacks SEC(".maps");

/* a map from lock address to owner data */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64)); // lock address
	__uint(value_size, sizeof(struct owner_tracing_data));
	__uint(max_entries, 1);
} owner_data SEC(".maps");

/* a map from contention_key (which stores the owner stack id) to contention data */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, 1);
} owner_stat SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* maintain per-CPU timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct tstamp_data));
	__uint(max_entries, 1);
} tstamp_cpu SEC(".maps");

/* actual lock contention statistics */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

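/* map a task's pid to its contention_task_data (e.g. comm) */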
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

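/* map a known lock address to its lock class flag (see collect_lock_syms) */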
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

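/*
 * filter maps: user space fills in the CPUs, tasks, lock types, lock
 * addresses, cgroups and slab caches to be recorded; can_record() only
 * checks for the presence of a key.
 */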
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(long));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} slab_filter SEC(".maps");

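/* map a kmem_cache address to its name and id (see slab_cache_iter) */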
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(long));
	__uint(value_size, sizeof(struct slab_cache_data));
	__uint(max_entries, 1);
} slab_caches SEC(".maps");

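/*
 * CO-RE flavors of rw_semaphore: the owner field is a task_struct pointer
 * on older kernels and an atomic_long_t on newer ones.
 */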
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

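/* CO-RE flavors of mm_struct: mmap_sem was renamed to mmap_lock */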
struct mm_struct___old {
	struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

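/* optional kfunc: may be missing on older kernels, so check before calling */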
extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;

/* control flags */
const volatile int has_cpu;
const volatile int has_task;
const volatile int has_type;
const volatile int has_addr;
const volatile int has_cgroup;
const volatile int has_slab;
const volatile int needs_callstack;
const volatile int stack_skip;
const volatile int lock_owner;
const volatile int use_cgroup_v2;
const volatile int max_stack;

/* determine the key of lock stat */
const volatile int aggr_mode;

int enabled;

int perf_subsys_id = -1;

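/* timestamp when profiling stopped (set by end_timestamp) */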
__u64 end_ts;

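/* id generator for slab caches (incremented in slab_cache_iter) */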
__u32 slab_cache_id;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

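/* optional kfuncs for owner stack tracing; may be missing on older kernels */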
struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
void bpf_task_release(struct task_struct *p) __ksym __weak;

static inline __u64 get_current_cgroup_id(void)
{
	struct task_struct *task;
	struct cgroup *cgrp;

	if (use_cgroup_v2)
		return bpf_get_current_cgroup_id();

	task = bpf_get_current_task_btf();

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

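/* check all configured filters; return 1 if this contention event should be recorded */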
static inline int can_record(u64 *ctx)
{
	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = bpf_get_current_pid_tgid();

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_type) {
		__u8 *ok;
		__u32 flags = (__u32)ctx[1];

		ok = bpf_map_lookup_elem(&type_filter, &flags);
		if (!ok)
			return 0;
	}

	if (has_addr) {
		__u8 *ok;
		__u64 addr = ctx[0];

		ok = bpf_map_lookup_elem(&addr_filter, &addr);
		if (!ok && !has_slab)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp = get_current_cgroup_id();

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
		if (!ok)
			return 0;
	}

	if (has_slab && bpf_get_kmem_cache) {
		__u8 *ok;
		__u64 addr = ctx[0];
		long kmem_cache_addr;

		kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
		ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
		if (!ok)
			return 0;
	}

	return 1;
}

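/* record the task's pid and comm in the task_data map */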
static inline int update_task_data(struct task_struct *task)
{
	struct contention_task_data *p;
	int pid, err;

	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
	if (err)
		return -1;

	p = bpf_map_lookup_elem(&task_data, &pid);
	if (p == NULL && !task_map_full) {
		struct contention_task_data data = {};

		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
			task_map_full = 1;
	}

	return 0;
}

#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

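/*
 * Read the owner task from the lock word of a mutex or rw_semaphore.
 * The low bits of the owner word are flag bits and get masked off.
 */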
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
	struct task_struct *task;
	__u64 owner = 0;

	if (flags & LCB_F_MUTEX) {
		struct mutex *mutex = (void *)lock;
		owner = BPF_CORE_READ(mutex, owner.counter);
	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
	/*
	 * Support for the BPF_TYPE_MATCHES argument to the
	 * __builtin_preserve_type_info builtin was added at some point during
	 * development of clang 15 and it's what is needed for
	 * bpf_core_type_matches.
	 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
		if (bpf_core_type_matches(struct rw_semaphore___old)) {
			struct rw_semaphore___old *rwsem = (void *)lock;
			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
			struct rw_semaphore___new *rwsem = (void *)lock;
			owner = BPF_CORE_READ(rwsem, owner.counter);
		}
#else
		/* assume new struct */
		struct rw_semaphore *rwsem = (void *)lock;
		owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
	}

	if (!owner)
		return NULL;

	task = (void *)(owner & ~7UL);
	return task;
}

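/* classify well-known locks (mmap_lock, siglock) by address */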
static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
	struct task_struct *curr;
	struct mm_struct___old *mm_old;
	struct mm_struct___new *mm_new;
	struct sighand_struct *sighand;

	switch (flags) {
	case LCB_F_READ:  /* rwsem */
	case LCB_F_WRITE:
		curr = bpf_get_current_task_btf();
		if (curr->mm == NULL)
			break;
		mm_new = (void *)curr->mm;
		if (bpf_core_field_exists(mm_new->mmap_lock)) {
			if (&mm_new->mmap_lock == (void *)lock)
				return LCD_F_MMAP_LOCK;
			break;
		}
		mm_old = (void *)curr->mm;
		if (bpf_core_field_exists(mm_old->mmap_sem)) {
			if (&mm_old->mmap_sem == (void *)lock)
				return LCD_F_MMAP_LOCK;
		}
		break;
	case LCB_F_SPIN:  /* spinlock */
		curr = bpf_get_current_task_btf();
		sighand = curr->sighand;

		if (sighand && &sighand->siglock == (void *)lock)
			return LCD_F_SIGHAND_LOCK;
		break;
	default:
		break;
	}
	return 0;
}

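/*
 * Get (or create) the timestamp entry for a new contention: spinning locks
 * use the per-cpu array, sleeping locks use the hash map keyed by pid.
 */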
static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
	__u32 pid;
	struct tstamp_data *pelem;

	/* Use per-cpu array map for spinlock and rwlock */
	if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
		__u32 idx = 0;

		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
		/* Do not update the element for nested locks */
		if (pelem && pelem->lock)
			pelem = NULL;
		return pelem;
	}

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	/* Do not update the element for nested locks */
	if (pelem && pelem->lock)
		return NULL;

	if (pelem == NULL) {
		struct tstamp_data zero = {};

		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}

		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (pelem == NULL) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}
	}
	return pelem;
}

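/* return the id for an owner stacktrace, allocating a new one if needed */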
static inline s32 get_owner_stack_id(u64 *stacktrace)
{
	s32 *id, new_id;
	static s64 id_gen = 1;

	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
	if (id)
		return *id;

	new_id = (s32)__sync_fetch_and_add(&id_gen, 1);

	bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);

	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
	if (id)
		return *id;

	return -1;
}

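/* accumulate duration and count into an existing contention_data entry */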
static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
{
	__sync_fetch_and_add(&data->total_time, duration);
	__sync_fetch_and_add(&data->count, count);

	/* FIXME: need atomic operations */
	if (data->max_time < duration)
		data->max_time = duration;
	if (data->min_time > duration)
		data->min_time = duration;
}

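/* update the owner_stat map keyed by the owner stack id */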
static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
{
	struct contention_key key = {
		.stack_id = id,
		.pid = 0,
		.lock_addr_or_cgroup = 0,
	};
	struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);

	if (!data) {
		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = flags,
		};
		bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
	} else {
		update_contention_data(data, duration, 1);
	}
}

SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
	struct tstamp_data *pelem;

	if (!enabled || !can_record(ctx))
		return 0;

	pelem = get_tstamp_elem(ctx[1]);
	if (pelem == NULL)
		return 0;

	pelem->timestamp = bpf_ktime_get_ns();
	pelem->lock = (__u64)ctx[0];
	pelem->flags = (__u32)ctx[1];

	if (needs_callstack) {
		u32 i = 0;
		u32 id = 0;
		int owner_pid;
		u64 *buf;
		struct task_struct *task;
		struct owner_tracing_data *otdata;

		if (!lock_owner)
			goto skip_owner;

		task = get_lock_owner(pelem->lock, pelem->flags);
		if (!task)
			goto skip_owner;

		owner_pid = BPF_CORE_READ(task, pid);

		buf = bpf_map_lookup_elem(&stack_buf, &i);
		if (!buf)
			goto skip_owner;
		for (i = 0; i < max_stack; i++)
			buf[i] = 0x0;

		if (!bpf_task_from_pid)
			goto skip_owner;

		task = bpf_task_from_pid(owner_pid);
		if (!task)
			goto skip_owner;

		bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
		bpf_task_release(task);

		otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
		id = get_owner_stack_id(buf);

		/*
		 * Either the contention has just begun, or (corner case) `lock` is owned by a
		 * process other than `owner_pid`. We treat the corner case as an unexpected
		 * internal error and simply ignore the previous tracing record.
		 */
		if (!otdata || otdata->pid != owner_pid) {
			struct owner_tracing_data first = {
				.pid = owner_pid,
				.timestamp = pelem->timestamp,
				.count = 1,
				.stack_id = id,
			};
			bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
		}
		/* Contention is ongoing and a new waiter joins */
		else {
			__sync_fetch_and_add(&otdata->count, 1);

			/*
			 * The owner is the same, but the stacktrace might have changed.
			 * In this case we store/update `owner_stat` based on the current
			 * owner stack id.
			 */
			if (id != otdata->stack_id) {
				update_owner_stat(id, pelem->timestamp - otdata->timestamp,
						  pelem->flags);

				otdata->timestamp = pelem->timestamp;
				otdata->stack_id = id;
			}
		}
skip_owner:
		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
						  BPF_F_FAST_STACK_CMP | stack_skip);
		if (pelem->stack_id < 0)
			__sync_fetch_and_add(&stack_fail, 1);
	} else if (aggr_mode == LOCK_AGGR_TASK) {
		struct task_struct *task;

		if (lock_owner) {
			task = get_lock_owner(pelem->lock, pelem->flags);

			/* The flags field is not used anymore.  Pass the owner pid. */
			if (task)
				pelem->flags = BPF_CORE_READ(task, pid);
			else
				pelem->flags = -1U;

		} else {
			task = bpf_get_current_task_btf();
		}

		if (task) {
			if (update_task_data(task) < 0 && lock_owner)
				pelem->flags = -1U;
		}
	}

	return 0;
}

SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
	__u32 pid = 0, idx = 0;
	struct tstamp_data *pelem;
	struct contention_key key = {};
	struct contention_data *data;
	__u64 timestamp;
	__u64 duration;
	bool need_delete = false;

	if (!enabled)
		return 0;

	/*
	 * For spinlocks and rwlocks, the timestamp is kept in the per-cpu
	 * map.  However, contention_end does not receive the flags, so it
	 * cannot tell whether to read the per-cpu or the hash map.
	 *
	 * Try the per-cpu map first and check if there's an active contention.
	 * If there is, do not read the hash map because the task cannot block
	 * on sleeping locks before releasing its spinning locks.
	 */
	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
	if (pelem && pelem->lock) {
		if (pelem->lock != ctx[0])
			return 0;
	} else {
		pid = bpf_get_current_pid_tgid();
		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (!pelem || pelem->lock != ctx[0])
			return 0;
		need_delete = true;
	}

	timestamp = bpf_ktime_get_ns();
	duration = timestamp - pelem->timestamp;
	if ((__s64)duration < 0) {
		__sync_fetch_and_add(&time_fail, 1);
		goto out;
	}

	if (needs_callstack && lock_owner) {
		struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);

		if (!otdata)
			goto skip_owner;

		/* Update `owner_stat` */
		update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);

		/* No contention remains; delete the `lock` entry from `owner_data` */
		if (otdata->count <= 1)
			bpf_map_delete_elem(&owner_data, &pelem->lock);
		/*
		 * Contention is still ongoing, with a new owner (current task). `owner_data`
		 * should be updated accordingly.
		 */
		else {
			u32 i = 0;
			s32 ret = (s32)ctx[1];
			u64 *buf;

			otdata->timestamp = timestamp;
			__sync_fetch_and_add(&otdata->count, -1);

			buf = bpf_map_lookup_elem(&stack_buf, &i);
			if (!buf)
				goto skip_owner;
			for (i = 0; i < (u32)max_stack; i++)
				buf[i] = 0x0;

			/*
			 * `ret` holds the return code of the lock function.
			 * If `ret` is negative, the current task stopped waiting for the lock
			 * without acquiring it.  The owner is unchanged, but we still need to
			 * update the owner stack.
			 */
			if (ret < 0) {
				s32 id = 0;
				struct task_struct *task;

				if (!bpf_task_from_pid)
					goto skip_owner;

				task = bpf_task_from_pid(otdata->pid);
				if (!task)
					goto skip_owner;

				bpf_get_task_stack(task, buf,
						   max_stack * sizeof(unsigned long), 0);
				bpf_task_release(task);

				id = get_owner_stack_id(buf);

				/*
				 * If the owner stack has changed, update the owner stack id
				 * for this lock.
				 */
				if (id != otdata->stack_id)
					otdata->stack_id = id;
			}
			/*
			 * Otherwise, update tracing data with the current task, which is the new
			 * owner.
			 */
			else {
				otdata->pid = pid;
				/*
				 * We don't want to retrieve the callstack here, since this is
				 * where the current task acquires the lock and it provides no
				 * additional information.  Simply assign -1 to invalidate it.
				 */
				otdata->stack_id = -1;
			}
		}
	}
skip_owner:
	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_TASK:
		if (lock_owner)
			key.pid = pelem->flags;
		else {
			if (!need_delete)
				pid = bpf_get_current_pid_tgid();
			key.pid = pid;
		}
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_ADDR:
		key.lock_addr_or_cgroup = pelem->lock;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_CGROUP:
		key.lock_addr_or_cgroup = get_current_cgroup_id();
		break;
	default:
		/* should not happen */
		return 0;
	}

	data = bpf_map_lookup_elem(&lock_stat, &key);
	if (!data) {
		if (data_map_full) {
			__sync_fetch_and_add(&data_fail, 1);
			goto out;
		}

		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = pelem->flags,
		};
		int err;

		if (aggr_mode == LOCK_AGGR_ADDR) {
			first.flags |= check_lock_type(pelem->lock,
						       pelem->flags & LCB_F_TYPE_MASK);

			/* Check if it's from a slab object */
			if (bpf_get_kmem_cache) {
				struct kmem_cache *s;
				struct slab_cache_data *d;

				s = bpf_get_kmem_cache(pelem->lock);
				if (s != NULL) {
					/*
					 * Save the ID of the slab cache in the flags
					 * (instead of full address) to reduce the
					 * space in the contention_data.
					 */
					d = bpf_map_lookup_elem(&slab_caches, &s);
					if (d != NULL)
						first.flags |= d->id;
				}
			}
		}

		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
		if (err < 0) {
			if (err == -EEXIST) {
				/* it lost the race, try to get it again */
				data = bpf_map_lookup_elem(&lock_stat, &key);
				if (data != NULL)
					goto found;
			}
			if (err == -E2BIG)
				data_map_full = 1;
			__sync_fetch_and_add(&data_fail, 1);
		}
		goto out;
	}

found:
	update_contention_data(data, duration, 1);

out:
	pelem->lock = 0;
	if (need_delete)
		bpf_map_delete_elem(&tstamp, &pid);
	return 0;
}

extern struct rq runqueues __ksym;

struct rq___old {
	raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
	raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
	__u64 lock_addr, lock_off;
	__u32 lock_flag;

	if (bpf_core_field_exists(struct rq___new, __lock))
		lock_off = offsetof(struct rq___new, __lock);
	else
		lock_off = offsetof(struct rq___old, lock);

	for (int i = 0; i < MAX_CPUS; i++) {
		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

		if (rq == NULL)
			break;

		lock_addr = (__u64)(void *)rq + lock_off;
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}
	return 0;
}

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(end_timestamp)
{
	end_ts = bpf_ktime_get_ns();
	return 0;
}

/*
 * bpf_iter__kmem_cache was added recently, so old kernels don't have it in
 * their vmlinux.h.  But we cannot define it here since that would cause a
 * compiler error due to redefinition of the struct on newer kernels.
 *
 * So it uses a CO-RE trick to access the member only if the type exists.
 * This supports both old and new kernels without compiler errors.
 */
struct bpf_iter__kmem_cache___new {
	struct kmem_cache *s;
} __attribute__((preserve_access_index));

SEC("iter/kmem_cache")
int slab_cache_iter(void *ctx)
{
	struct kmem_cache *s = NULL;
	struct slab_cache_data d;
	const char *nameptr;

	if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
		struct bpf_iter__kmem_cache___new *iter = ctx;

		s = iter->s;
	}

	if (s == NULL)
		return 0;

	nameptr = s->name;
	bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);

	d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
	if (d.id >= LCB_F_SLAB_ID_END)
		return 0;

	bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";