// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */

#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/percpu.h>
#include <linux/refcount.h>
#include <linux/gfp.h>
#include <linux/memory.h>
#include <linux/local_lock.h>
#include <linux/mutex.h>

/*
 * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
 * alloc_pages_nolock()/free_pages_nolock() primitives. We allocate a page and
 * stash it in a local per-CPU variable, and bump allocate from the page
 * whenever items need to be printed to a stream. Each page holds a global
 * atomic refcount in its first 4 bytes, followed by records of variable
 * length that describe the printed messages. Once the global refcount has
 * dropped to zero, it is a signal to free the page back to the kernel's page
 * allocator, given all the individual records in it have been consumed.
 *
 * It is possible the same page is used to serve allocations across different
 * programs, which may be consumed at different times individually, hence
 * maintaining a reference count per page is critical for correct lifetime
 * tracking.
 *
 * The bpf_stream_page code will be switched over to kmalloc_nolock() once it
 * lands.
 */
struct bpf_stream_page {
	refcount_t ref;
	u32 consumed;
	char buf[];
};

/* Available room to add data to a refcounted page. */
#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))

static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);

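/*
 * Messages can be printed from any context, including NMI, so the per-CPU
 * page is protected by a trylock only: when the lock cannot be taken, the
 * caller must give up on the allocation instead of spinning.
 */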
static bool bpf_stream_page_local_lock(unsigned long *flags)
{
	return local_trylock_irqsave(&stream_local_lock, *flags);
}

static void bpf_stream_page_local_unlock(unsigned long *flags)
{
	local_unlock_irqrestore(&stream_local_lock, *flags);
}

static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
{
	struct page *p;

	if (!stream_page)
		return;
	p = virt_to_page(stream_page);
	free_pages_nolock(p, 0);
}

static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
{
	refcount_inc(&stream_page->ref);
}

static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
{
	if (refcount_dec_and_test(&stream_page->ref))
		bpf_stream_page_free(stream_page);
}

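/*
 * A new page starts out with a single reference, owned by the per-CPU
 * stream_pcpu_page slot it is installed into; each element reserved from the
 * page takes an additional reference of its own.
 */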
static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
{
	refcount_set(&stream_page->ref, 1);
	stream_page->consumed = 0;
}

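/*
 * Install a freshly allocated page into this CPU's slot, dropping the slot's
 * reference on the old page. Outstanding records keep the old page alive via
 * their own references until they are consumed.
 */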
static struct bpf_stream_page *bpf_stream_page_replace(void)
{
	struct bpf_stream_page *stream_page, *old_stream_page;
	struct page *page;

	page = alloc_pages_nolock(NUMA_NO_NODE, 0);
	if (!page)
		return NULL;
	stream_page = page_address(page);
	bpf_stream_page_init(stream_page);

	old_stream_page = this_cpu_read(stream_pcpu_page);
	if (old_stream_page)
		bpf_stream_page_put(old_stream_page);
	this_cpu_write(stream_pcpu_page, stream_page);
	return stream_page;
}

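/*
 * Return how many bytes of a @len byte string would fit into the page, after
 * reserving space for the bpf_stream_elem header. Allocations are rounded up
 * to 8 bytes, so the remaining room must stay 8-byte aligned; anything below
 * 8 bytes counts as no room at all.
 */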
static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
{
	int min = offsetof(struct bpf_stream_elem, str[0]);
	int consumed = stream_page->consumed;
	int total = BPF_STREAM_PAGE_SZ;
	int rem = max(0, total - consumed - min);

	/* Let's give room of at least 8 bytes. */
	WARN_ON_ONCE(rem % 8 != 0);
	rem = rem < 8 ? 0 : rem;
	return min(len, rem);
}

static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
{
	init_llist_node(&elem->node);
	elem->total_len = len;
	elem->consumed_len = 0;
}

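/*
 * Elements never straddle pages, so masking an element's address down to the
 * page boundary recovers its owning bpf_stream_page.
 */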
static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
{
	unsigned long addr = (unsigned long)elem;

	return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
}

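/* Bump allocate a record for @len string bytes, keeping 8-byte alignment. */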
static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
{
	u32 consumed = stream_page->consumed;

	stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
	return (struct bpf_stream_elem *)&stream_page->buf[consumed];
}

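/*
 * Reserve an element on this CPU's page, installing a fresh page if the
 * request no longer fits, and take a page reference on behalf of the new
 * element. Runs with the local trylock held.
 */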
static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
{
	struct bpf_stream_elem *elem = NULL;
	struct bpf_stream_page *page;
	int room = 0;

	page = this_cpu_read(stream_pcpu_page);
	if (!page)
		page = bpf_stream_page_replace();
	if (!page)
		return NULL;

	room = bpf_stream_page_check_room(page, len);
	if (room != len)
		page = bpf_stream_page_replace();
	if (!page)
		return NULL;
	bpf_stream_page_get(page);
	room = bpf_stream_page_check_room(page, len);
	WARN_ON_ONCE(room != len);

	elem = bpf_stream_page_push_elem(page, room);
	bpf_stream_elem_init(elem, room);
	return elem;
}

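/*
 * Allocate a stream element of @len bytes. Safe to call from any context,
 * including NMI; instead of sleeping, fail the allocation when the per-CPU
 * trylock cannot be taken or no page is available.
 */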
static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
{
	const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
	struct bpf_stream_elem *elem;
	unsigned long flags;

	BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
	/*
	 * Length denotes the amount of data to be written as part of stream element,
	 * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can
	 * accommodate, therefore deny allocations that won't fit into them.
	 */
	if (len < 0 || len > max_len)
		return NULL;

	if (!bpf_stream_page_local_lock(&flags))
		return NULL;
	elem = bpf_stream_page_reserve_elem(len);
	bpf_stream_page_local_unlock(&flags);
	return elem;
}

static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len)
{
	struct bpf_stream_elem *elem = NULL;

	/*
	 * Allocate a bpf_stream_elem and push it to the bpf_prog_stream
	 * log; elements will be popped at once and reversed to print the log.
	 */
	elem = bpf_stream_elem_alloc(len);
	if (!elem)
		return -ENOMEM;

	memcpy(elem->str, str, len);
	llist_add(&elem->node, log);

	return 0;
}

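/*
 * Charge @len bytes against the stream's capacity budget. The first read is a
 * cheap rejection once the limit has been hit; the add/sub pair backs out the
 * charge if it would take the stream to or past BPF_STREAM_MAX_CAPACITY.
 */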
static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len)
{
	if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY)
		return -ENOSPC;
	if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) {
		atomic_sub(len, &stream->capacity);
		return -ENOSPC;
	}
	return 0;
}

static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem)
{
	int len = elem->total_len;

	atomic_sub(len, &stream->capacity);
}

static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len)
{
	int ret = bpf_stream_consume_capacity(stream, len);

	return ret ?: __bpf_stream_push_str(&stream->log, str, len);
}

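/* Valid stream IDs begin at 1 (BPF_STDOUT), so subtract one to index the array. */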
static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux)
{
	if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR)
		return NULL;
	return &aux->stream[stream_id - 1];
}

static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
{
	struct bpf_stream_page *p;

	p = bpf_stream_page_from_elem(elem);
	bpf_stream_page_put(p);
}

static void bpf_stream_free_list(struct llist_node *list)
{
	struct bpf_stream_elem *elem, *tmp;

	llist_for_each_entry_safe(elem, tmp, list, node)
		bpf_stream_free_elem(elem);
}

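/*
 * The backlog holds elements already pulled out of the lockless log, kept in
 * FIFO order. It is consumed by the reader side under stream->lock.
 */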
static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream)
{
	return stream->backlog_head;
}

static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream)
{
	struct llist_node *node;

	node = stream->backlog_head;
	if (stream->backlog_head == stream->backlog_tail)
		stream->backlog_head = stream->backlog_tail = NULL;
	else
		stream->backlog_head = node->next;
	return node;
}

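/*
 * llist_del_all() hands back the pushed elements newest-first; reverse the
 * chain to restore FIFO order before splicing it onto the backlog tail.
 */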
static void bpf_stream_backlog_fill(struct bpf_stream *stream)
{
	struct llist_node *head, *tail;

	if (llist_empty(&stream->log))
		return;
	tail = llist_del_all(&stream->log);
	if (!tail)
		return;
	head = llist_reverse_order(tail);

	if (!stream->backlog_head) {
		stream->backlog_head = head;
		stream->backlog_tail = tail;
	} else {
		stream->backlog_tail->next = head;
		stream->backlog_tail = tail;
	}
}

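/*
 * Consume up to *@len bytes from @elem, decrementing *@len accordingly.
 * Returns true once the element has been fully consumed.
 */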
static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len)
{
	int rem = elem->total_len - elem->consumed_len;
	int used = min(rem, *len);

	elem->consumed_len += used;
	*len -= used;

	return elem->consumed_len == elem->total_len;
}

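/*
 * Copy up to @len bytes to @buf under stream->lock, refilling the backlog
 * from the lockless log as it runs dry. A partially consumed element stays at
 * the head of the backlog with consumed_len advanced, so a subsequent read
 * resumes where this one stopped; on copy failure, consumed_len is restored.
 */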
static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len)
{
	int rem_len = len, cons_len, ret = 0;
	struct bpf_stream_elem *elem = NULL;
	struct llist_node *node;

	mutex_lock(&stream->lock);

	while (rem_len) {
		int pos = len - rem_len;
		bool cont;

		node = bpf_stream_backlog_peek(stream);
		if (!node) {
			bpf_stream_backlog_fill(stream);
			node = bpf_stream_backlog_peek(stream);
		}
		if (!node)
			break;
		elem = container_of(node, typeof(*elem), node);

		cons_len = elem->consumed_len;
		cont = bpf_stream_consume_elem(elem, &rem_len) == false;

		ret = copy_to_user(buf + pos, elem->str + cons_len,
				   elem->consumed_len - cons_len);
		/* Restore in case of error. */
		if (ret) {
			ret = -EFAULT;
			elem->consumed_len = cons_len;
			break;
		}

		if (cont)
			continue;
		bpf_stream_backlog_pop(stream);
		bpf_stream_release_capacity(stream, elem);
		bpf_stream_free_elem(elem);
	}

	mutex_unlock(&stream->lock);
	return ret ? ret : len - rem_len;
}

int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len)
{
	struct bpf_stream *stream;

	stream = bpf_stream_get(stream_id, prog->aux);
	if (!stream)
		return -ENOENT;
	return bpf_stream_read(stream, buf, len);
}

__bpf_kfunc_start_defs();

/*
 * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
 * enum in headers.
 */
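/*
 * @args points at an array of u64 argument slots prepared on the BPF side,
 * hence @len__sz must be a multiple of 8 and cover at most
 * MAX_BPRINTF_VARARGS arguments.
 */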
__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog)
{
	struct bpf_bprintf_data data = {
		.get_bin_args = true,
		.get_buf = true,
	};
	struct bpf_prog_aux *aux = aux__prog;
	u32 fmt_size = strlen(fmt__str) + 1;
	struct bpf_stream *stream;
	u32 data_len = len__sz;
	int ret, num_args;

	stream = bpf_stream_get(stream_id, aux);
	if (!stream)
		return -ENOENT;

	if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
	    (data_len && !args))
		return -EINVAL;
	num_args = data_len / 8;

	ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data);
	if (ret < 0)
		return ret;

	ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args);
	/* Exclude NULL byte during push. */
	ret = bpf_stream_push_str(stream, data.buf, ret);
	bpf_bprintf_cleanup(&data);

	return ret;
}

__bpf_kfunc_end_defs();

/* Added kfunc to common_btf_ids */

void bpf_prog_stream_init(struct bpf_prog *prog)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
		atomic_set(&prog->aux->stream[i].capacity, 0);
		init_llist_head(&prog->aux->stream[i].log);
		mutex_init(&prog->aux->stream[i].lock);
		prog->aux->stream[i].backlog_head = NULL;
		prog->aux->stream[i].backlog_tail = NULL;
	}
}

void bpf_prog_stream_free(struct bpf_prog *prog)
{
	struct llist_node *list;
	int i;

	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
		list = llist_del_all(&prog->aux->stream[i].log);
		bpf_stream_free_list(list);
		bpf_stream_free_list(prog->aux->stream[i].backlog_head);
	}
}

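/*
 * Stream stages batch messages in a private llist and later publish them to a
 * program's stream in one shot. A minimal usage sketch (illustrative only,
 * error handling elided):
 *
 *	struct bpf_stream_stage ss;
 *
 *	bpf_stream_stage_init(&ss);
 *	ret = bpf_stream_stage_printk(&ss, "error: %d\n", err);
 *	ret = ret ?: bpf_stream_stage_dump_stack(&ss);
 *	ret = ret ?: bpf_stream_stage_commit(&ss, prog, BPF_STDERR);
 *	bpf_stream_stage_free(&ss);
 */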
void bpf_stream_stage_init(struct bpf_stream_stage *ss)
{
	init_llist_head(&ss->log);
	ss->len = 0;
}

void bpf_stream_stage_free(struct bpf_stream_stage *ss)
{
	struct llist_node *node;

	node = llist_del_all(&ss->log);
	bpf_stream_free_list(node);
}

int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...)
{
	struct bpf_bprintf_buffers *buf;
	va_list args;
	int ret;

	if (bpf_try_get_buffers(&buf))
		return -EBUSY;

	va_start(args, fmt);
	ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args);
	va_end(args);
	ss->len += ret;
	/* Exclude NULL byte during push. */
	ret = __bpf_stream_push_str(&ss->log, buf->buf, ret);
	bpf_put_buffers();
	return ret;
}

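/*
 * Publish all staged elements to the program's stream in one batch, after
 * charging the staged length against the stream's capacity. The stage only
 * tracks the list head, so walk the chain to locate the tail needed by
 * llist_add_batch().
 */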
int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
			    enum bpf_stream_id stream_id)
{
	struct llist_node *list, *head, *tail;
	struct bpf_stream *stream;
	int ret;

	stream = bpf_stream_get(stream_id, prog->aux);
	if (!stream)
		return -EINVAL;

	ret = bpf_stream_consume_capacity(stream, ss->len);
	if (ret)
		return ret;

	list = llist_del_all(&ss->log);
	head = tail = list;

	if (!list)
		return 0;
	while (llist_next(list)) {
		tail = llist_next(list);
		list = tail;
	}
	llist_add_batch(head, tail, &stream->log);
	return 0;
}

struct dump_stack_ctx {
	struct bpf_stream_stage *ss;
	int err;
};

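/*
 * Print one frame of the stack trace; for frames belonging to a BPF program,
 * append the source line resolved from the program's line info. Returning
 * false aborts the walk after the first failed print.
 */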
static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
{
	struct dump_stack_ctx *ctxp = cookie;
	const char *file = "", *line = "";
	struct bpf_prog *prog;
	int num, ret;

	rcu_read_lock();
	prog = bpf_prog_ksym_find(ip);
	rcu_read_unlock();
	if (prog) {
		ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num);
		if (ret < 0)
			goto end;
		ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n",
						    (void *)(long)ip, line, file, num);
		return !ctxp->err;
	}
end:
	ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip);
	return !ctxp->err;
}

int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss)
{
	struct dump_stack_ctx ctx = { .ss = ss };
	int ret;

	ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n",
				      raw_smp_processor_id(), __kuid_val(current_real_cred()->euid),
				      current->pid, current->comm);
	if (ret)
		return ret;
	ret = bpf_stream_stage_printk(ss, "Call trace:\n");
	if (ret)
		return ret;
	arch_bpf_stack_walk(dump_stack_cb, &ctx);
	if (ctx.err)
		return ctx.err;
	return bpf_stream_stage_printk(ss, "\n");
}