// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */

#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/percpu.h>
#include <linux/refcount.h>
#include <linux/gfp.h>
#include <linux/memory.h>
#include <linux/local_lock.h>
#include <linux/mutex.h>

/*
 * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
 * alloc_pages_nolock()/free_pages_nolock() primitives. We allocate a page and
 * stash it in a local per-CPU variable, and bump allocate from the page
 * whenever items need to be printed to a stream. Each page holds a global
 * atomic refcount in its first 4 bytes, followed by records of variable
 * length that describe the printed messages. Once the global refcount has
 * dropped to zero, it is a signal to free the page back to the kernel's page
 * allocator, given all the individual records in it have been consumed.
 *
 * It is possible that the same page is used to serve allocations across
 * different programs, which may be consumed at different times individually,
 * hence maintaining a reference count per-page is critical for correct
 * lifetime tracking.
 *
 * The bpf_stream_page code will be replaced with kmalloc_nolock() once it
 * lands.
 */
struct bpf_stream_page {
	refcount_t ref;
	u32 consumed;
	char buf[];
};

/* Available room to add data to a refcounted page. */
#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))

static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);

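/*
 * The bump allocator may be entered from any context, including NMI, so only
 * the trylock variant of the local lock is used; callers must tolerate
 * acquisition failure.
 */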
static bool bpf_stream_page_local_lock(unsigned long *flags)
{
	return local_trylock_irqsave(&stream_local_lock, *flags);
}

static void bpf_stream_page_local_unlock(unsigned long *flags)
{
	local_unlock_irqrestore(&stream_local_lock, *flags);
}

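/* Return the page to the kernel's page allocator; usable from any context. */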
static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
{
	struct page *p;

	if (!stream_page)
		return;
	p = virt_to_page(stream_page);
	free_pages_nolock(p, 0);
}

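/*
 * Each reserved element holds a reference on its backing page; the per-CPU
 * slot holds one more for the currently active page.
 */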
static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
{
	refcount_inc(&stream_page->ref);
}

static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
{
	if (refcount_dec_and_test(&stream_page->ref))
		bpf_stream_page_free(stream_page);
}

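/* A fresh page starts with a single reference, owned by the per-CPU slot. */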
static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
{
	refcount_set(&stream_page->ref, 1);
	stream_page->consumed = 0;
}

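/*
 * Allocate a new page and install it in the per-CPU slot, dropping the
 * slot's reference on the old page, if any.
 */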
static struct bpf_stream_page *bpf_stream_page_replace(void)
{
	struct bpf_stream_page *stream_page, *old_stream_page;
	struct page *page;

	page = alloc_pages_nolock(NUMA_NO_NODE, 0);
	if (!page)
		return NULL;
	stream_page = page_address(page);
	bpf_stream_page_init(stream_page);

	old_stream_page = this_cpu_read(stream_pcpu_page);
	if (old_stream_page)
		bpf_stream_page_put(old_stream_page);
	this_cpu_write(stream_pcpu_page, stream_page);
	return stream_page;
}

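/*
 * Return how many of the requested len bytes fit in the page; anything less
 * than an element header plus 8 bytes of data counts as no room at all.
 */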
static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
{
	int min = offsetof(struct bpf_stream_elem, str[0]);
	int consumed = stream_page->consumed;
	int total = BPF_STREAM_PAGE_SZ;
	int rem = max(0, total - consumed - min);

	/* Let's give room of at least 8 bytes. */
	WARN_ON_ONCE(rem % 8 != 0);
	rem = rem < 8 ? 0 : rem;
	return min(len, rem);
}

static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
{
	init_llist_node(&elem->node);
	elem->total_len = len;
	elem->consumed_len = 0;
}

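/*
 * Elements never straddle a page boundary, so the owning page is recovered
 * by rounding the element address down to the page boundary.
 */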
static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
{
	unsigned long addr = (unsigned long)elem;

	return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
}

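/*
 * Bump-allocate an element with len bytes of string data, keeping the page's
 * consumed offset 8-byte aligned.
 */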
static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
{
	u32 consumed = stream_page->consumed;

	stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
	return (struct bpf_stream_elem *)&stream_page->buf[consumed];
}

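/*
 * Reserve an element from the current per-CPU page, replacing the page if it
 * cannot satisfy the whole request. Called with the per-CPU local lock held.
 */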
static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
{
	struct bpf_stream_elem *elem = NULL;
	struct bpf_stream_page *page;
	int room = 0;

	page = this_cpu_read(stream_pcpu_page);
	if (!page)
		page = bpf_stream_page_replace();
	if (!page)
		return NULL;

	room = bpf_stream_page_check_room(page, len);
	if (room != len)
		page = bpf_stream_page_replace();
	if (!page)
		return NULL;
	bpf_stream_page_get(page);
	room = bpf_stream_page_check_room(page, len);
	WARN_ON_ONCE(room != len);

	elem = bpf_stream_page_push_elem(page, room);
	bpf_stream_elem_init(elem, room);
	return elem;
}

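/*
 * Allocate a stream element with room for len bytes of data; works from any
 * context, failing instead of blocking when the lock or memory is unavailable.
 */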
static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
{
	const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
	struct bpf_stream_elem *elem;
	unsigned long flags;

	BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
	/*
	 * Length denotes the amount of data to be written as part of the stream
	 * element, thus includes the '\0' byte. We're capped by how much
	 * bpf_bprintf_buffers can accommodate, therefore deny allocations that
	 * won't fit into them.
	 */
	if (len < 0 || len > max_len)
		return NULL;

	if (!bpf_stream_page_local_lock(&flags))
		return NULL;
	elem = bpf_stream_page_reserve_elem(len);
	bpf_stream_page_local_unlock(&flags);
	return elem;
}

static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len)
{
	struct bpf_stream_elem *elem = NULL;

	/*
	 * Allocate a bpf_stream_elem and push it to the log; elements will be
	 * popped off at once and reversed to print the log.
	 */
	elem = bpf_stream_elem_alloc(len);
	if (!elem)
		return -ENOMEM;

	memcpy(elem->str, str, len);
	llist_add(&elem->node, log);

	return 0;
}

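/*
 * Charge len bytes against the stream's capacity budget; fails with -ENOSPC
 * once BPF_STREAM_MAX_CAPACITY is reached.
 */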
static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len)
{
	if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY)
		return -ENOSPC;
	if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) {
		atomic_sub(len, &stream->capacity);
		return -ENOSPC;
	}
	return 0;
}

static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem)
{
	int len = elem->total_len;

	atomic_sub(len, &stream->capacity);
}

static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len)
{
	int ret = bpf_stream_consume_capacity(stream, len);

	return ret ?: __bpf_stream_push_str(&stream->log, str, len);
}

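/* Map a stream ID to its slot in prog->aux; stream IDs are 1-based. */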
static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux)
{
	if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR)
		return NULL;
	return &aux->stream[stream_id - 1];
}

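/* Freeing an element drops its reference on the backing page. */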
static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
{
	struct bpf_stream_page *p;

	p = bpf_stream_page_from_elem(elem);
	bpf_stream_page_put(p);
}

static void bpf_stream_free_list(struct llist_node *list)
{
	struct bpf_stream_elem *elem, *tmp;

	llist_for_each_entry_safe(elem, tmp, list, node)
		bpf_stream_free_elem(elem);
}

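/*
 * The backlog is an oldest-first snapshot of the lock-free log, only ever
 * manipulated by the reader under stream->lock.
 */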
static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream)
{
	return stream->backlog_head;
}

static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream)
{
	struct llist_node *node;

	node = stream->backlog_head;
	if (stream->backlog_head == stream->backlog_tail)
		stream->backlog_head = stream->backlog_tail = NULL;
	else
		stream->backlog_head = node->next;
	return node;
}

static void bpf_stream_backlog_fill(struct bpf_stream *stream)
{
	struct llist_node *head, *tail;

	if (llist_empty(&stream->log))
		return;
	tail = llist_del_all(&stream->log);
	if (!tail)
		return;
	head = llist_reverse_order(tail);

	if (!stream->backlog_head) {
		stream->backlog_head = head;
		stream->backlog_tail = tail;
	} else {
		stream->backlog_tail->next = head;
		stream->backlog_tail = tail;
	}
}

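/*
 * Consume up to *len bytes from the element; returns true once the element
 * has been fully consumed.
 */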
static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len)
{
	int rem = elem->total_len - elem->consumed_len;
	int used = min(rem, *len);

	elem->consumed_len += used;
	*len -= used;

	return elem->consumed_len == elem->total_len;
}

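/*
 * Copy up to len bytes of stream data into the user buffer, releasing
 * elements (and their capacity) as they are drained. Returns the number of
 * bytes copied, or -EFAULT if a copy to userspace fails.
 */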
static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len)
{
	int rem_len = len, cons_len, ret = 0;
	struct bpf_stream_elem *elem = NULL;
	struct llist_node *node;

	mutex_lock(&stream->lock);

	while (rem_len) {
		int pos = len - rem_len;
		bool cont;

		node = bpf_stream_backlog_peek(stream);
		if (!node) {
			bpf_stream_backlog_fill(stream);
			node = bpf_stream_backlog_peek(stream);
		}
		if (!node)
			break;
		elem = container_of(node, typeof(*elem), node);

		cons_len = elem->consumed_len;
		cont = bpf_stream_consume_elem(elem, &rem_len) == false;

		ret = copy_to_user(buf + pos, elem->str + cons_len,
				   elem->consumed_len - cons_len);
		/* Restore in case of error. */
		if (ret) {
			ret = -EFAULT;
			elem->consumed_len = cons_len;
			break;
		}

		if (cont)
			continue;
		bpf_stream_backlog_pop(stream);
		bpf_stream_release_capacity(stream, elem);
		bpf_stream_free_elem(elem);
	}

	mutex_unlock(&stream->lock);
	return ret ? ret : len - rem_len;
}

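/* Entry point for reading a program's stream into a user buffer. */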
int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len)
{
	struct bpf_stream *stream;

	stream = bpf_stream_get(stream_id, prog->aux);
	if (!stream)
		return -ENOENT;
	return bpf_stream_read(stream, buf, len);
}

__bpf_kfunc_start_defs();

/*
 * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in
 * the enum in headers.
 */
__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog)
{
	struct bpf_bprintf_data data = {
		.get_bin_args	= true,
		.get_buf	= true,
	};
	struct bpf_prog_aux *aux = aux__prog;
	u32 fmt_size = strlen(fmt__str) + 1;
	struct bpf_stream *stream;
	u32 data_len = len__sz;
	int ret, num_args;

	stream = bpf_stream_get(stream_id, aux);
	if (!stream)
		return -ENOENT;

	if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
	    (data_len && !args))
		return -EINVAL;
	num_args = data_len / 8;

	ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data);
	if (ret < 0)
		return ret;

	ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args);
	/* Exclude NULL byte during push. */
	ret = bpf_stream_push_str(stream, data.buf, ret);
	bpf_bprintf_cleanup(&data);

	return ret;
}

__bpf_kfunc_end_defs();

/* Added kfunc to common_btf_ids */

void bpf_prog_stream_init(struct bpf_prog *prog)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
		atomic_set(&prog->aux->stream[i].capacity, 0);
		init_llist_head(&prog->aux->stream[i].log);
		mutex_init(&prog->aux->stream[i].lock);
		prog->aux->stream[i].backlog_head = NULL;
		prog->aux->stream[i].backlog_tail = NULL;
	}
}

void bpf_prog_stream_free(struct bpf_prog *prog)
{
	struct llist_node *list;
	int i;

	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
		list = llist_del_all(&prog->aux->stream[i].log);
		bpf_stream_free_list(list);
		bpf_stream_free_list(prog->aux->stream[i].backlog_head);
	}
}

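/*
 * Stream stages batch messages into a private log so that they can be
 * committed to a program's stream in one shot, or discarded wholesale.
 */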
void bpf_stream_stage_init(struct bpf_stream_stage *ss)
{
	init_llist_head(&ss->log);
	ss->len = 0;
}

void bpf_stream_stage_free(struct bpf_stream_stage *ss)
{
	struct llist_node *node;

	node = llist_del_all(&ss->log);
	bpf_stream_free_list(node);
}

int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...)
{
	struct bpf_bprintf_buffers *buf;
	va_list args;
	int ret;

	if (bpf_try_get_buffers(&buf))
		return -EBUSY;

	va_start(args, fmt);
	ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args);
	va_end(args);
	ss->len += ret;
	/* Exclude NULL byte during push. */
	ret = __bpf_stream_push_str(&ss->log, buf->buf, ret);
	bpf_put_buffers();
	return ret;
}

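/*
 * Splice the staged log onto the stream's lock-free log in a single batch,
 * after charging the total staged length against the stream's capacity.
 */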
int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
			    enum bpf_stream_id stream_id)
{
	struct llist_node *list, *head, *tail;
	struct bpf_stream *stream;
	int ret;

	stream = bpf_stream_get(stream_id, prog->aux);
	if (!stream)
		return -EINVAL;

	ret = bpf_stream_consume_capacity(stream, ss->len);
	if (ret)
		return ret;

	list = llist_del_all(&ss->log);
	head = tail = list;

	if (!list)
		return 0;
	while (llist_next(list)) {
		tail = llist_next(list);
		list = tail;
	}
	llist_add_batch(head, tail, &stream->log);
	return 0;
}

struct dump_stack_ctx {
	struct bpf_stream_stage *ss;
	int err;
};

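/*
 * Stack walk callback: print one frame per call, annotating BPF program
 * frames with source file/line information when available.
 */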
static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
{
	struct dump_stack_ctx *ctxp = cookie;
	const char *file = "", *line = "";
	struct bpf_prog *prog;
	int num, ret;

	rcu_read_lock();
	prog = bpf_prog_ksym_find(ip);
	rcu_read_unlock();
	if (prog) {
		ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num);
		if (ret < 0)
			goto end;
		ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n  %s @ %s:%d\n",
						    (void *)(long)ip, line, file, num);
		return !ctxp->err;
	}
end:
	ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip);
	return !ctxp->err;
}

int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss)
{
	struct dump_stack_ctx ctx = { .ss = ss };
	int ret;

	ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n",
				      raw_smp_processor_id(), __kuid_val(current_real_cred()->euid),
				      current->pid, current->comm);
	if (ret)
		return ret;
	ret = bpf_stream_stage_printk(ss, "Call trace:\n");
	if (ret)
		return ret;
	arch_bpf_stack_walk(dump_stack_cb, &ctx);
	if (ctx.err)
		return ctx.err;
	return bpf_stream_stage_printk(ss, "\n");
}
527