xref: /linux/kernel/bpf/task_iter.c (revision f5ad4101009e7f5f5984ffea6923d4fcd470932a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2020 Facebook */
3 
4 #include <linux/init.h>
5 #include <linux/namei.h>
6 #include <linux/pid_namespace.h>
7 #include <linux/fs.h>
8 #include <linux/filter.h>
9 #include <linux/bpf_mem_alloc.h>
10 #include <linux/btf_ids.h>
11 #include <linux/mm_types.h>
12 #include <linux/mmap_lock.h>
13 #include <linux/sched/mm.h>
14 #include "mmap_unlock_work.h"
15 
/* Human-readable names for enum bpf_iter_task_type, indexed by the enum
 * value; printed by bpf_iter_task_show_fdinfo().
 */
static const char * const iter_task_type_names[] = {
	"ALL",
	"TID",
	"PID",
};

/* Iteration state shared by the task, task_file and task_vma iterators. */
struct bpf_iter_seq_task_common {
	struct pid_namespace *ns;	/* namespace of the creating task (ref held) */
	enum bpf_iter_task_type	type;	/* ALL, TID or TGID iteration */
	u32 pid;			/* target tid/pid for TID/TGID modes */
	u32 pid_visiting;		/* tid currently visited (TGID mode cursor) */
};

struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	u32 tid;	/* next tid to visit; doubles as the resume cursor */
};
36 
/* BPF_TASK_ITER_TGID mode: return the next thread of the process
 * common->pid with a reference held, or NULL when all threads have been
 * visited.
 *
 * The first invocation (*tid == 0) returns the thread-group leader.
 * Later invocations resume from common->pid_visiting and advance to the
 * next live thread.  Threads sharing the group leader's files table are
 * skipped when @skip_if_dup_files is set.
 *
 * Caller must hold rcu_read_lock() (see task_seq_get_next()).
 */
static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
						   u32 *tid,
						   bool skip_if_dup_files)
{
	struct task_struct *task;
	struct pid *pid;
	u32 next_tid;

	if (!*tid) {
		/* The first time, the iterator calls this function. */
		pid = find_pid_ns(common->pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_TGID);
		if (!task)
			return NULL;

		*tid = common->pid;
		common->pid_visiting = common->pid;

		return task;
	}

	/* If the control returns to user space and comes back to the
	 * kernel again, *tid and common->pid_visiting should be the
	 * same for task_seq_start() to pick up the correct task.
	 */
	if (*tid == common->pid_visiting) {
		pid = find_pid_ns(common->pid_visiting, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);

		return task;
	}

	task = find_task_by_pid_ns(common->pid_visiting, common->ns);
	if (!task)
		return NULL;

retry:
	task = __next_thread(task);
	if (!task)
		return NULL;

	/* Skip threads with no tid visible in common->ns. */
	next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
	if (!next_tid)
		goto retry;

	if (skip_if_dup_files && task->files == task->group_leader->files)
		goto retry;

	*tid = common->pid_visiting = next_tid;
	get_task_struct(task);
	return task;
}
89 
/* Return the next task to visit according to common->type, with a
 * reference held, or NULL when the iteration is exhausted.
 *
 * - BPF_TASK_ITER_TID:  visit exactly the task with tid common->pid;
 * - BPF_TASK_ITER_TGID: visit each thread of process common->pid;
 * - otherwise:          scan the whole pid space, resuming at *tid.
 *
 * *tid is both input (resume point) and output (the tid returned).
 */
static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	if (common->type == BPF_TASK_ITER_TID) {
		/* Single-task mode: once task_seq_next() has advanced *tid
		 * past common->pid, the iteration is over.
		 */
		if (*tid && *tid != common->pid)
			return NULL;
		rcu_read_lock();
		pid = find_pid_ns(common->pid, common->ns);
		if (pid) {
			task = get_pid_task(pid, PIDTYPE_PID);
			*tid = common->pid;
		}
		rcu_read_unlock();

		return task;
	}

	if (common->type == BPF_TASK_ITER_TGID) {
		rcu_read_lock();
		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
		rcu_read_unlock();

		return task;
	}

	/* BPF_TASK_ITER_ALL: walk the pid space from *tid upwards. */
	rcu_read_lock();
retry:
	pid = find_ge_pid(*tid, common->ns);
	if (pid) {
		*tid = pid_nr_ns(pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			++*tid;
			goto retry;
		} else if (skip_if_dup_files && !thread_group_leader(task) &&
			   task->files == task->group_leader->files) {
			put_task_struct(task);
			task = NULL;
			++*tid;
			goto retry;
		}
	}
	rcu_read_unlock();

	return task;
}
140 
141 static void *task_seq_start(struct seq_file *seq, loff_t *pos)
142 {
143 	struct bpf_iter_seq_task_info *info = seq->private;
144 	struct task_struct *task;
145 
146 	task = task_seq_get_next(&info->common, &info->tid, false);
147 	if (!task)
148 		return NULL;
149 
150 	if (*pos == 0)
151 		++*pos;
152 	return task;
153 }
154 
155 static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
156 {
157 	struct bpf_iter_seq_task_info *info = seq->private;
158 	struct task_struct *task;
159 
160 	++*pos;
161 	++info->tid;
162 	put_task_struct((struct task_struct *)v);
163 	task = task_seq_get_next(&info->common, &info->tid, false);
164 	if (!task)
165 		return NULL;
166 
167 	return task;
168 }
169 
/* Context passed to BPF programs attached to a task iterator. */
struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

/* Run the attached BPF program for @task.  @in_stop is true when called
 * from ->stop(), which invokes the program once more with a NULL task.
 */
static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
			   bool in_stop)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__task ctx;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = task;
	return bpf_iter_run_prog(prog, &ctx);
}
193 
static int task_seq_show(struct seq_file *seq, void *v)
{
	return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
	/* v == NULL means the iteration completed: give the program a
	 * final invocation (with a NULL task) so it can finalize.
	 * Otherwise release the task reference taken by
	 * task_seq_get_next().
	 */
	if (!v)
		(void)__task_seq_show(seq, v, true);
	else
		put_task_struct((struct task_struct *)v);
}
206 
/* Parse link-creation parameters shared by the task, task_file and
 * task_vma iterators.  At most one of tid, pid and pid_fd may be set:
 *   - tid:    iterate one thread (BPF_TASK_ITER_TID);
 *   - pid:    iterate all threads of a process (BPF_TASK_ITER_TGID);
 *   - pid_fd: like pid, but the process is named by a pidfd;
 *   - none:   iterate every task (BPF_TASK_ITER_ALL).
 * Returns 0 on success or a negative errno.
 */
static int bpf_iter_attach_task(struct bpf_prog *prog,
				union bpf_iter_link_info *linfo,
				struct bpf_iter_aux_info *aux)
{
	unsigned int flags;
	struct pid *pid;
	pid_t tgid;

	if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
		return -EINVAL;

	aux->task.type = BPF_TASK_ITER_ALL;
	if (linfo->task.tid != 0) {
		aux->task.type = BPF_TASK_ITER_TID;
		aux->task.pid = linfo->task.tid;
	}
	if (linfo->task.pid != 0) {
		aux->task.type = BPF_TASK_ITER_TGID;
		aux->task.pid = linfo->task.pid;
	}
	if (linfo->task.pid_fd != 0) {
		aux->task.type = BPF_TASK_ITER_TGID;

		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		/* Translate the pidfd into a pid number in the caller's
		 * namespace, then drop the pid reference.
		 */
		tgid = pid_nr_ns(pid, task_active_pid_ns(current));
		aux->task.pid = tgid;
		put_pid(pid);
	}

	return 0;
}
241 
static const struct seq_operations task_seq_ops = {
	.start	= task_seq_start,
	.next	= task_seq_next,
	.stop	= task_seq_stop,
	.show	= task_seq_show,
};

struct bpf_iter_seq_task_file_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;	/* task being scanned (ref held), or NULL */
	u32 tid;			/* tid of that task; resume cursor */
	u32 fd;				/* fd cursor within the current task */
};
258 
259 static struct file *
260 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
261 {
262 	u32 saved_tid = info->tid;
263 	struct task_struct *curr_task;
264 	unsigned int curr_fd = info->fd;
265 	struct file *f;
266 
267 	/* If this function returns a non-NULL file object,
268 	 * it held a reference to the task/file.
269 	 * Otherwise, it does not hold any reference.
270 	 */
271 again:
272 	if (info->task) {
273 		curr_task = info->task;
274 		curr_fd = info->fd;
275 	} else {
276 		curr_task = task_seq_get_next(&info->common, &info->tid, true);
277                 if (!curr_task) {
278                         info->task = NULL;
279                         return NULL;
280                 }
281 
282 		/* set info->task */
283 		info->task = curr_task;
284 		if (saved_tid == info->tid)
285 			curr_fd = info->fd;
286 		else
287 			curr_fd = 0;
288 	}
289 
290 	f = fget_task_next(curr_task, &curr_fd);
291 	if (f) {
292 		/* set info->fd */
293 		info->fd = curr_fd;
294 		return f;
295 	}
296 
297 	/* the current task is done, go to the next task */
298 	put_task_struct(curr_task);
299 
300 	if (info->common.type == BPF_TASK_ITER_TID) {
301 		info->task = NULL;
302 		return NULL;
303 	}
304 
305 	info->task = NULL;
306 	info->fd = 0;
307 	saved_tid = ++(info->tid);
308 	goto again;
309 }
310 
311 static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
312 {
313 	struct bpf_iter_seq_task_file_info *info = seq->private;
314 	struct file *file;
315 
316 	info->task = NULL;
317 	file = task_file_seq_get_next(info);
318 	if (file && *pos == 0)
319 		++*pos;
320 
321 	return file;
322 }
323 
324 static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
325 {
326 	struct bpf_iter_seq_task_file_info *info = seq->private;
327 
328 	++*pos;
329 	++info->fd;
330 	fput((struct file *)v);
331 	return task_file_seq_get_next(info);
332 }
333 
/* Context passed to BPF programs attached to a task_file iterator. */
struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
		     struct task_struct *task, u32 fd,
		     struct file *file)

/* Run the attached BPF program for the (task, fd, file) triple; task
 * and fd come from seq->private.  @in_stop is true for the final call
 * from ->stop().
 */
static int __task_file_seq_show(struct seq_file *seq, struct file *file,
				bool in_stop)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct bpf_iter__task_file ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.fd = info->fd;
	ctx.file = file;
	return bpf_iter_run_prog(prog, &ctx);
}
364 
static int task_file_seq_show(struct seq_file *seq, void *v)
{
	return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	if (!v) {
		/* Iteration completed: final program invocation with a
		 * NULL file so it can finalize.
		 */
		(void)__task_file_seq_show(seq, v, true);
	} else {
		/* Release the file and task references taken by
		 * task_file_seq_get_next().
		 */
		fput((struct file *)v);
		put_task_struct(info->task);
		info->task = NULL;
	}
}
382 
/* Seq-file private-data init shared by all three iterators: record the
 * creating task's pid namespace (reference held) and the attach-time
 * type/target.  priv_data starts with struct bpf_iter_seq_task_common.
 */
static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	common->type = aux->task.type;
	common->pid = aux->task.pid;

	return 0;
}

/* Release the namespace reference taken by init_seq_pidns(). */
static void fini_seq_pidns(void *priv_data)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	put_pid_ns(common->ns);
}
400 
static const struct seq_operations task_file_seq_ops = {
	.start	= task_file_seq_start,
	.next	= task_file_seq_next,
	.stop	= task_file_seq_stop,
	.show	= task_file_seq_show,
};

struct bpf_iter_seq_task_vma_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;	/* task being scanned (ref held), or NULL */
	struct mm_struct *mm;		/* its mm; mm_users reference held */
	struct vm_area_struct *vma;	/* vma last handed to the seq_file */
	u32 tid;			/* tid of that task; resume cursor */
	unsigned long prev_vm_start;	/* range processed before dropping */
	unsigned long prev_vm_end;	/*   mmap_lock; see task_vma_seq_get_next() */
};

/* How task_vma_seq_get_next() should locate the next vma. */
enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,   /* use find_vma() with addr 0 */
	task_vma_iter_next_vma,    /* use vma_next() with curr_vma */
	task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};
426 
/* Return the next vma of the selected tasks, or NULL when done.
 *
 * On non-NULL return the caller owns: a reference on info->task, an
 * mm_users reference on info->mm, and mmap_lock held for read.  Under
 * lock contention the mmap_lock is dropped and re-acquired, using
 * prev_vm_start/prev_vm_end to avoid revisiting an already-processed
 * range (details in the comment below).
 */
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	struct mm_struct *curr_mm;
	u32 saved_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, holds a refcount on mm->mm_users, and holds
	 * read lock on vma->mm->mmap_lock.
	 * If this function returns NULL, it does not hold any reference or
	 * lock.
	 */
	if (info->task) {
		curr_task = info->task;
		curr_vma = info->vma;
		curr_mm = info->mm;
		/* In case of lock contention, drop mmap_lock to unblock
		 * the writer.
		 *
		 * After relock, call find(mm, prev_vm_end - 1) to find
		 * new vma to process.
		 *
		 *   +------+------+-----------+
		 *   | VMA1 | VMA2 | VMA3      |
		 *   +------+------+-----------+
		 *   |      |      |           |
		 *  4k     8k     16k         400k
		 *
		 * For example, curr_vma == VMA2. Before unlock, we set
		 *
		 *    prev_vm_start = 8k
		 *    prev_vm_end   = 16k
		 *
		 * There are a few cases:
		 *
		 * 1) VMA2 is freed, but VMA3 exists.
		 *
		 *    find_vma() will return VMA3, just process VMA3.
		 *
		 * 2) VMA2 still exists.
		 *
		 *    find_vma() will return VMA2, process VMA2->next.
		 *
		 * 3) no more vma in this mm.
		 *
		 *    Process the next task.
		 *
		 * 4) find_vma() returns a different vma, VMA2'.
		 *
		 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
		 *         because we already covered the range;
		 *    4.2) VMA2 and VMA2' covers different ranges, process
		 *         VMA2'.
		 */
		if (mmap_lock_is_contended(curr_mm)) {
			info->prev_vm_start = curr_vma->vm_start;
			info->prev_vm_end = curr_vma->vm_end;
			op = task_vma_iter_find_vma;
			mmap_read_unlock(curr_mm);
			if (mmap_read_lock_killable(curr_mm)) {
				mmput(curr_mm);
				goto finish;
			}
		} else {
			op = task_vma_iter_next_vma;
		}
	} else {
again:
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->tid++;
			goto finish;
		}

		if (saved_tid != info->tid) {
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
			/* Found the same tid, which means the user space
			 * finished data in previous buffer and read more.
			 * We dropped mmap_lock before returning to user
			 * space, so it is necessary to use find_vma() to
			 * find the next vma to process.
			 */
			op = task_vma_iter_find_vma;
		}

		curr_mm = get_task_mm(curr_task);
		if (!curr_mm)
			goto next_task;

		if (mmap_read_lock_killable(curr_mm)) {
			mmput(curr_mm);
			goto finish;
		}
	}

	switch (op) {
	case task_vma_iter_first_vma:
		curr_vma = find_vma(curr_mm, 0);
		break;
	case task_vma_iter_next_vma:
		curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	case task_vma_iter_find_vma:
		/* We dropped mmap_lock so it is necessary to use find_vma
		 * to find the next vma. This is similar to the  mechanism
		 * in show_smaps_rollup().
		 */
		curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
		/* case 1) and 4.2) above just use curr_vma */

		/* check for case 2) or case 4.1) above */
		if (curr_vma &&
		    curr_vma->vm_start == info->prev_vm_start &&
		    curr_vma->vm_end == info->prev_vm_end)
			curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	}
	if (!curr_vma) {
		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
		mmap_read_unlock(curr_mm);
		mmput(curr_mm);
		goto next_task;
	}
	info->task = curr_task;
	info->vma = curr_vma;
	info->mm = curr_mm;
	return curr_vma;

next_task:
	/* Single-task mode: do not advance to other tasks. */
	if (info->common.type == BPF_TASK_ITER_TID)
		goto finish;

	put_task_struct(curr_task);
	info->task = NULL;
	info->mm = NULL;
	info->tid++;
	goto again;

finish:
	if (curr_task)
		put_task_struct(curr_task);
	info->task = NULL;
	info->vma = NULL;
	info->mm = NULL;
	return NULL;
}
578 
579 static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
580 {
581 	struct bpf_iter_seq_task_vma_info *info = seq->private;
582 	struct vm_area_struct *vma;
583 
584 	vma = task_vma_seq_get_next(info);
585 	if (vma && *pos == 0)
586 		++*pos;
587 
588 	return vma;
589 }
590 
591 static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
592 {
593 	struct bpf_iter_seq_task_vma_info *info = seq->private;
594 
595 	++*pos;
596 	return task_vma_seq_get_next(info);
597 }
598 
/* Context passed to BPF programs attached to a task_vma iterator. */
struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
		     struct task_struct *task, struct vm_area_struct *vma)

/* Run the attached BPF program for the (task, vma) pair held in
 * seq->private.  @in_stop is true for the final call from ->stop().
 */
static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct bpf_iter__task_vma ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.vma = info->vma;
	return bpf_iter_run_prog(prog, &ctx);
}
625 
static int task_vma_seq_show(struct seq_file *seq, void *v)
{
	return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	if (!v) {
		/* Iteration completed: final program invocation. */
		(void)__task_vma_seq_show(seq, true);
	} else {
		/* info->vma has not been seen by the BPF program. If the
		 * user space reads more, task_vma_seq_get_next should
		 * return this vma again. Set prev_vm_start to ~0UL,
		 * so that we don't skip the vma returned by the next
		 * find_vma() (case task_vma_iter_find_vma in
		 * task_vma_seq_get_next()).
		 */
		info->prev_vm_start = ~0UL;
		info->prev_vm_end = info->vma->vm_end;
		/* Drop the lock and references held across the read. */
		mmap_read_unlock(info->mm);
		mmput(info->mm);
		info->mm = NULL;
		put_task_struct(info->task);
		info->task = NULL;
	}
}
654 
static const struct seq_operations task_vma_seq_ops = {
	.start	= task_vma_seq_start,
	.next	= task_vma_seq_next,
	.stop	= task_vma_seq_stop,
	.show	= task_vma_seq_show,
};

static const struct bpf_iter_seq_info task_seq_info = {
	.seq_ops		= &task_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
};

/* Report the attach-time tid/pid via bpf_link_info; shared by the task,
 * task_file and task_vma iterators.
 */
static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
{
	switch (aux->task.type) {
	case BPF_TASK_ITER_TID:
		info->iter.task.tid = aux->task.pid;
		break;
	case BPF_TASK_ITER_TGID:
		info->iter.task.pid = aux->task.pid;
		break;
	default:
		break;
	}
	return 0;
}

/* Print the iteration type and target into the link's fdinfo. */
static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
{
	seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
	if (aux->task.type == BPF_TASK_ITER_TID)
		seq_printf(seq, "tid:\t%u\n", aux->task.pid);
	else if (aux->task.type == BPF_TASK_ITER_TGID)
		seq_printf(seq, "pid:\t%u\n", aux->task.pid);
}

/* bpf_iter registration for "task"; the ctx-arg btf_id is resolved at
 * boot by task_iter_init().
 */
static struct bpf_iter_reg task_reg_info = {
	.target			= "task",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.seq_info		= &task_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};
706 
static const struct bpf_iter_seq_info task_file_seq_info = {
	.seq_ops		= &task_file_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
};

/* bpf_iter registration for "task_file"; ctx-arg btf_ids are resolved
 * at boot by task_iter_init().
 */
static struct bpf_iter_reg task_file_reg_info = {
	.target			= "task_file",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_file, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_file, file),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_file_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops		= &task_vma_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
};

/* bpf_iter registration for "task_vma"; ctx-arg btf_ids are resolved
 * at boot by task_iter_init().
 */
static struct bpf_iter_reg task_vma_reg_info = {
	.target			= "task_vma",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_vma_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};
752 
/* bpf_find_vma helper: find the vma of @task containing @start and run
 * @callback_fn on it while mmap_lock is held for read.
 *
 * Returns 0 if a vma was found and the callback ran, -ENOENT if not,
 * -EINVAL for non-zero flags, -EBUSY if the lock (or the per-CPU
 * unlock irq_work) is unavailable.
 */
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct mm_struct *mm;
	int ret = -ENOENT;

	if (flags)
		return -EINVAL;

	if (!task)
		return -ENOENT;

	mm = task->mm;
	if (!mm)
		return -ENOENT;

	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	/* Only a trylock: never block the BPF caller on mmap_lock. */
	if (irq_work_busy || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);

	/* find_vma() returns the first vma ending above @start; confirm
	 * it actually contains @start before invoking the callback.
	 */
	if (vma && vma->vm_start <= start && vma->vm_end > start) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	}
	bpf_mmap_unlock_mm(work, mm);
	return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
	.func		= bpf_find_vma,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_FUNC,
	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg5_type	= ARG_ANYTHING,
};
798 
/* Drop an mm reference; uses mmput_async() where available (!CONFIG_MMU
 * has no async variant) so the final teardown does not run in the
 * caller's context.
 */
static inline void bpf_iter_mmput_async(struct mm_struct *mm)
{
#ifdef CONFIG_MMU
	mmput_async(mm);
#else
	mmput(mm);
#endif
}

struct bpf_iter_task_vma_kern_data {
	struct task_struct *task;	/* task being iterated (ref held) */
	struct mm_struct *mm;		/* its mm; mm_users reference held */
	struct vm_area_struct snapshot;	/* copy of the vma last returned */
	u64 next_addr;			/* address to resume the walk from */
};

struct bpf_iter_task_vma {
	/* opaque iterator state; having __u64 here allows to preserve correct
	 * alignment requirements in vmlinux.h, generated from BTF
	 */
	__u64 __opaque[1];
} __attribute__((aligned(8)));

/* Non-opaque version of bpf_iter_task_vma */
struct bpf_iter_task_vma_kern {
	struct bpf_iter_task_vma_kern_data *data;
} __attribute__((aligned(8)));
826 
__bpf_kfunc_start_defs();

/* Initialize an open-coded task-vma iterator starting at @addr.
 *
 * Takes references on @task and its mm; both are dropped by
 * bpf_iter_task_vma_destroy().  kit->data == NULL marks a failed
 * initialization for _next()/_destroy().
 *
 * Returns 0 or a negative errno: -EOPNOTSUPP without CONFIG_PER_VMA_LOCK,
 * -EBUSY with IRQs disabled or when the task's alloc_lock is contended,
 * -ENOMEM on allocation failure, -ENOENT for kernel threads or tasks
 * without an mm.
 */
__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
				      struct task_struct *task, u64 addr)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;
	int err;

	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));

	if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
		kit->data = NULL;
		return -EOPNOTSUPP;
	}

	/*
	 * Reject irqs-disabled contexts including NMI. Operations used
	 * by _next() and _destroy() (vma_end_read, fput, bpf_iter_mmput_async)
	 * can take spinlocks with IRQs disabled (pi_lock, pool->lock).
	 * Running from NMI or from a tracepoint that fires with those
	 * locks held could deadlock.
	 */
	if (irqs_disabled()) {
		kit->data = NULL;
		return -EBUSY;
	}

	/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
	 * before, so non-NULL kit->data doesn't point to previously
	 * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
	 */
	kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
	if (!kit->data)
		return -ENOMEM;

	kit->data->task = get_task_struct(task);
	/*
	 * Safely read task->mm and acquire an mm reference.
	 *
	 * Cannot use get_task_mm() because its task_lock() is a
	 * blocking spin_lock that would deadlock if the target task
	 * already holds alloc_lock on this CPU (e.g. a softirq BPF
	 * program iterating a task interrupted while holding its
	 * alloc_lock).
	 */
	if (!spin_trylock(&task->alloc_lock)) {
		err = -EBUSY;
		goto err_cleanup_iter;
	}
	kit->data->mm = task->mm;
	/* Kernel threads borrow mms; treat them as having no mm. */
	if (kit->data->mm && !(task->flags & PF_KTHREAD))
		mmget(kit->data->mm);
	else
		kit->data->mm = NULL;
	spin_unlock(&task->alloc_lock);
	if (!kit->data->mm) {
		err = -ENOENT;
		goto err_cleanup_iter;
	}

	kit->data->snapshot.vm_file = NULL;
	kit->data->next_addr = addr;
	return 0;

err_cleanup_iter:
	put_task_struct(kit->data->task);
	bpf_mem_free(&bpf_global_ma, kit->data);
	/* NULL kit->data signals failed bpf_iter_task_vma initialization */
	kit->data = NULL;
	return err;
}
899 
/*
 * Find and lock the next VMA at or after data->next_addr.
 *
 * lock_vma_under_rcu() is a point lookup (mas_walk): it finds the VMA
 * containing a given address but cannot iterate. An RCU-protected
 * maple tree walk with vma_next() (mas_find) is needed first to locate
 * the next VMA's vm_start across any gap.
 *
 * Between the RCU walk and the lock, the VMA may be removed, shrunk,
 * or write-locked. On failure, advance past it using vm_end from the
 * RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall back
 * to PAGE_SIZE advancement to guarantee forward progress.
 */
static struct vm_area_struct *
bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
{
	struct vm_area_struct *vma;
	struct vma_iterator vmi;
	unsigned long start, end;

retry:
	rcu_read_lock();
	vma_iter_init(&vmi, data->mm, data->next_addr);
	vma = vma_next(&vmi);
	if (!vma) {
		rcu_read_unlock();
		return NULL;
	}
	/* Capture the bounds before leaving the RCU section. */
	start = vma->vm_start;
	end = vma->vm_end;
	rcu_read_unlock();

	vma = lock_vma_under_rcu(data->mm, start);
	if (!vma) {
		/* Lock failed: step past the range seen in the RCU walk,
		 * or by one page if that range would not advance us.
		 */
		if (end <= data->next_addr)
			data->next_addr += PAGE_SIZE;
		else
			data->next_addr = end;
		goto retry;
	}

	/* Locked a vma that ends before our cursor (stale reuse); skip. */
	if (unlikely(vma->vm_end <= data->next_addr)) {
		data->next_addr += PAGE_SIZE;
		vma_end_read(vma);
		goto retry;
	}

	return vma;
}
949 
950 static void bpf_iter_task_vma_snapshot_reset(struct vm_area_struct *snap)
951 {
952 	if (snap->vm_file) {
953 		fput(snap->vm_file);
954 		snap->vm_file = NULL;
955 	}
956 }
957 
/* Return a snapshot of the next vma, or NULL when the walk is done.
 *
 * The vma is copied into kit->data->snapshot, so no vma lock is held
 * after this returns; only a reference on the snapshot's vm_file (if
 * any) is kept, dropped on the next _next()/_destroy() call.
 */
__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;
	struct vm_area_struct *snap, *vma;

	if (!kit->data) /* bpf_iter_task_vma_new failed */
		return NULL;

	snap = &kit->data->snapshot;

	/* Drop the vm_file reference from the previous snapshot. */
	bpf_iter_task_vma_snapshot_reset(snap);

	vma = bpf_iter_task_vma_find_next(kit->data);
	if (!vma)
		return NULL;

	memcpy(snap, vma, sizeof(*snap));

	/*
	 * The verifier only trusts vm_mm and vm_file (see
	 * BTF_TYPE_SAFE_TRUSTED_OR_NULL in verifier.c). Take a reference
	 * on vm_file; vm_mm is already correct because lock_vma_under_rcu()
	 * verifies vma->vm_mm == mm. All other pointers are untrusted by
	 * the verifier and left as-is.
	 */
	if (snap->vm_file)
		get_file(snap->vm_file);

	/* Advance the cursor, then release the per-VMA lock. */
	kit->data->next_addr = vma->vm_end;
	vma_end_read(vma);
	return snap;
}
990 
/* Release everything held by a successfully initialized iterator: the
 * snapshot's vm_file reference, the task reference, and the mm
 * reference (via bpf_iter_mmput_async()).
 */
__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;

	if (kit->data) {
		bpf_iter_task_vma_snapshot_reset(&kit->data->snapshot);
		put_task_struct(kit->data->task);
		bpf_iter_mmput_async(kit->data->mm);
		bpf_mem_free(&bpf_global_ma, kit->data);
	}
}

__bpf_kfunc_end_defs();
1004 
#ifdef CONFIG_CGROUPS

/* Opaque BPF-side view of a css task iterator. */
struct bpf_iter_css_task {
	__u64 __opaque[1];
} __attribute__((aligned(8)));

/* Kernel-side view; must match bpf_iter_css_task in size and alignment. */
struct bpf_iter_css_task_kern {
	struct css_task_iter *css_it;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

/* Initialize an iterator over the tasks of @css.  @flags is passed
 * straight to css_task_iter_start(); only 0, CSS_TASK_ITER_PROCS and
 * CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED are accepted.
 * kit->css_it == NULL marks a failed init for _next()/_destroy().
 */
__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
		struct cgroup_subsys_state *css, unsigned int flags)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
					__alignof__(struct bpf_iter_css_task));
	kit->css_it = NULL;
	switch (flags) {
	case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
	case CSS_TASK_ITER_PROCS:
	case 0:
		break;
	default:
		return -EINVAL;
	}

	kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
	if (!kit->css_it)
		return -ENOMEM;
	css_task_iter_start(css, flags, kit->css_it);
	return 0;
}
1041 
1042 __bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
1043 {
1044 	struct bpf_iter_css_task_kern *kit = (void *)it;
1045 
1046 	if (!kit->css_it)
1047 		return NULL;
1048 	return css_task_iter_next(kit->css_it);
1049 }
1050 
/* End the css iteration (if it was started) and free the iterator. */
__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	if (!kit->css_it)
		return;
	css_task_iter_end(kit->css_it);
	bpf_mem_free(&bpf_global_ma, kit->css_it);
}

__bpf_kfunc_end_defs();

#endif /* CONFIG_CGROUPS */
1064 
/* Opaque BPF-side view of a task iterator. */
struct bpf_iter_task {
	__u64 __opaque[3];
} __attribute__((aligned(8)));

/* Kernel-side view; must fit within bpf_iter_task. */
struct bpf_iter_task_kern {
	struct task_struct *task;	/* current process (walk anchor) */
	struct task_struct *pos;	/* next task to return; NULL = done */
	unsigned int flags;		/* one of the BPF_TASK_ITER_* modes */
} __attribute__((aligned(8)));

enum {
	/* all process in the system */
	BPF_TASK_ITER_ALL_PROCS,
	/* all threads in the system */
	BPF_TASK_ITER_ALL_THREADS,
	/* all threads of a specific process */
	BPF_TASK_ITER_PROC_THREADS
};

__bpf_kfunc_start_defs();

/* Initialize an open-coded task iterator.
 *
 * @task__nullable is required (and used as the starting process) only
 * for BPF_TASK_ITER_PROC_THREADS; the other modes start at init_task.
 * Returns 0, or -EINVAL for a bad flags/task combination.
 */
__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
		struct task_struct *task__nullable, unsigned int flags)
{
	struct bpf_iter_task_kern *kit = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
					__alignof__(struct bpf_iter_task));

	/* Mark the iterator exhausted until setup succeeds. */
	kit->pos = NULL;

	switch (flags) {
	case BPF_TASK_ITER_ALL_THREADS:
	case BPF_TASK_ITER_ALL_PROCS:
		break;
	case BPF_TASK_ITER_PROC_THREADS:
		if (!task__nullable)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (flags == BPF_TASK_ITER_PROC_THREADS)
		kit->task = task__nullable;
	else
		kit->task = &init_task;
	kit->pos = kit->task;
	kit->flags = flags;
	return 0;
}
1117 
/* Return the current task and advance the iterator.
 *
 * ALL_PROCS walks the process list only; ALL_THREADS additionally walks
 * each process's threads; PROC_THREADS stops after the threads of the
 * starting process.  The walk terminates when it wraps back around to
 * init_task (kit->pos becomes NULL).
 */
__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
{
	struct bpf_iter_task_kern *kit = (void *)it;
	struct task_struct *pos;
	unsigned int flags;

	flags = kit->flags;
	pos = kit->pos;

	if (!pos)
		return pos;

	if (flags == BPF_TASK_ITER_ALL_PROCS)
		goto get_next_task;

	/* Advance within the current thread group first. */
	kit->pos = __next_thread(kit->pos);
	if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
		return pos;

get_next_task:
	kit->task = next_task(kit->task);
	if (kit->task == &init_task)
		kit->pos = NULL;
	else
		kit->pos = kit->task;

	return pos;
}
1146 
/* Nothing to release: bpf_iter_task_new() takes no references. */
__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
{
}

__bpf_kfunc_end_defs();

/* Per-CPU irq_work used to release mmap_lock on behalf of a non-owner
 * (see do_mmap_read_unlock()).
 */
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
1154 
1155 static void do_mmap_read_unlock(struct irq_work *entry)
1156 {
1157 	struct mmap_unlock_irq_work *work;
1158 
1159 	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
1160 		return;
1161 
1162 	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
1163 	mmap_read_unlock_non_owner(work->mm);
1164 }
1165 
1166 static int __init task_iter_init(void)
1167 {
1168 	struct mmap_unlock_irq_work *work;
1169 	int ret, cpu;
1170 
1171 	for_each_possible_cpu(cpu) {
1172 		work = per_cpu_ptr(&mmap_unlock_work, cpu);
1173 		init_irq_work(&work->irq_work, do_mmap_read_unlock);
1174 	}
1175 
1176 	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
1177 	ret = bpf_iter_reg_target(&task_reg_info);
1178 	if (ret)
1179 		return ret;
1180 
1181 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
1182 	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
1183 	ret =  bpf_iter_reg_target(&task_file_reg_info);
1184 	if (ret)
1185 		return ret;
1186 
1187 	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
1188 	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
1189 	return bpf_iter_reg_target(&task_vma_reg_info);
1190 }
1191 late_initcall(task_iter_init);
1192