// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/sched/mm.h>
#include "mmap_unlock_work.h"

/* Printable names for enum bpf_iter_task_type, indexed by type in
 * bpf_iter_task_show_fdinfo().
 */
static const char * const iter_task_type_names[] = {
	"ALL",
	"TID",
	"PID",
};

/* State shared by the task, task_file and task_vma seq iterators. */
struct bpf_iter_seq_task_common {
	struct pid_namespace *ns;	/* pid namespace of the iterator opener */
	enum bpf_iter_task_type type;	/* ALL, TID or TGID iteration */
	u32 pid;			/* target tid/pid for TID/TGID modes */
	u32 pid_visiting;		/* thread being visited in TGID mode */
};

struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	u32 tid;
};

/* Walk the threads of the thread group common->pid, resuming at *tid.
 * Returns a task with a reference held, or NULL when the group is
 * exhausted.  Called under rcu_read_lock() (see task_seq_get_next()).
 */
static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
						   u32 *tid,
						   bool skip_if_dup_files)
{
	struct task_struct *task;
	struct pid *pid;
	u32 next_tid;

	if (!*tid) {
		/* The first time, the iterator calls this function. */
		pid = find_pid_ns(common->pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_TGID);
		if (!task)
			return NULL;

		*tid = common->pid;
		common->pid_visiting = common->pid;

		return task;
	}

	/* If the control returns to user space and comes back to the
	 * kernel again, *tid and common->pid_visiting should be the
	 * same for task_seq_start() to pick up the correct task.
	 */
	if (*tid == common->pid_visiting) {
		pid = find_pid_ns(common->pid_visiting, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);

		return task;
	}

	task = find_task_by_pid_ns(common->pid_visiting, common->ns);
	if (!task)
		return NULL;

retry:
	task = __next_thread(task);
	if (!task)
		return NULL;

	next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
	if (!next_tid)
		goto retry;

	/* A thread sharing ->files with its group leader would show the
	 * same files again; skip it when the caller asked to dedup.
	 */
	if (skip_if_dup_files && task->files == task->group_leader->files)
		goto retry;

	*tid = common->pid_visiting = next_tid;
	get_task_struct(task);
	return task;
}

/* Return the next task for this iterator according to common->type.
 * On success a reference to the task is held; returns NULL when the
 * iteration is exhausted.
 */
static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	if (common->type == BPF_TASK_ITER_TID) {
		/* Single-task mode: once *tid is set past the target,
		 * there is nothing more to visit.
		 */
		if (*tid && *tid != common->pid)
			return NULL;
		rcu_read_lock();
		pid = find_pid_ns(common->pid, common->ns);
		if (pid) {
			task = get_pid_task(pid, PIDTYPE_PID);
			*tid = common->pid;
		}
		rcu_read_unlock();

		return task;
	}

	if (common->type == BPF_TASK_ITER_TGID) {
		rcu_read_lock();
		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
		rcu_read_unlock();

		return task;
	}

	/* BPF_TASK_ITER_ALL: scan the pid namespace from *tid upward. */
	rcu_read_lock();
retry:
	pid = find_ge_pid(*tid, common->ns);
	if (pid) {
		*tid = pid_nr_ns(pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			++*tid;
			goto retry;
		} else if (skip_if_dup_files && !thread_group_leader(task) &&
			   task->files == task->group_leader->files) {
			put_task_struct(task);
			task = NULL;
			++*tid;
			goto retry;
		}
	}
	rcu_read_unlock();

	return task;
}

static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	task = task_seq_get_next(&info->common, &info->tid, false);
	if (!task)
		return NULL;

	if (*pos == 0)
		++*pos;
	return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	++*pos;
	++info->tid;
	put_task_struct((struct task_struct *)v);
	task = task_seq_get_next(&info->common, &info->tid, false);
	if (!task)
		return NULL;

	return task;
}

struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

/* Run the attached BPF program for one task; task is NULL in the final
 * in_stop invocation.
 */
static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
			   bool in_stop)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__task ctx;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = task;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
	return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__task_seq_show(seq, v, true);
	else
		put_task_struct((struct task_struct *)v);
}

/* Validate link creation parameters: at most one of tid, pid, pid_fd
 * may be set; record the resulting iteration type/target in aux.
 */
static int bpf_iter_attach_task(struct bpf_prog *prog,
				union bpf_iter_link_info *linfo,
				struct bpf_iter_aux_info *aux)
{
	unsigned int flags;
	struct pid *pid;
	pid_t tgid;

	if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
		return -EINVAL;

	aux->task.type = BPF_TASK_ITER_ALL;
	if (linfo->task.tid != 0) {
		aux->task.type = BPF_TASK_ITER_TID;
		aux->task.pid = linfo->task.tid;
	}
	if (linfo->task.pid != 0) {
		aux->task.type = BPF_TASK_ITER_TGID;
		aux->task.pid = linfo->task.pid;
	}
	if (linfo->task.pid_fd != 0) {
		aux->task.type = BPF_TASK_ITER_TGID;

		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		tgid = pid_nr_ns(pid, task_active_pid_ns(current));
		aux->task.pid = tgid;
		put_pid(pid);
	}

	return 0;
}

static const struct seq_operations task_seq_ops = {
	.start = task_seq_start,
	.next = task_seq_next,
	.stop = task_seq_stop,
	.show = task_seq_show,
};

struct bpf_iter_seq_task_file_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;	/* task currently being walked */
	u32 tid;
	u32 fd;				/* next fd to try in that task */
};

static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
	u32 saved_tid = info->tid;
	struct task_struct *curr_task;
	unsigned int curr_fd = info->fd;
	struct file *f;

	/* If this function returns a non-NULL file object,
	 * it held a reference to the task/file.
	 * Otherwise, it does not hold any reference.
	 */
again:
	if (info->task) {
		curr_task = info->task;
		curr_fd = info->fd;
	} else {
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->task = NULL;
			return NULL;
		}

		/* set info->task */
		info->task = curr_task;
		if (saved_tid == info->tid)
			curr_fd = info->fd;
		else
			curr_fd = 0;
	}

	f = fget_task_next(curr_task, &curr_fd);
	if (f) {
		/* set info->fd */
		info->fd = curr_fd;
		return f;
	}

	/* the current task is done, go to the next task */
	put_task_struct(curr_task);

	if (info->common.type == BPF_TASK_ITER_TID) {
		info->task = NULL;
		return NULL;
	}

	info->task = NULL;
	info->fd = 0;
	saved_tid = ++(info->tid);
	goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct file *file;

	info->task = NULL;
	file = task_file_seq_get_next(info);
	if (file && *pos == 0)
		++*pos;

	return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	++*pos;
	++info->fd;
	fput((struct file *)v);
	return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
		     struct task_struct *task, u32 fd,
		     struct file *file)

static int __task_file_seq_show(struct seq_file *seq, struct file *file,
				bool in_stop)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct bpf_iter__task_file ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.fd = info->fd;
	ctx.file = file;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
	return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	if (!v) {
		(void)__task_file_seq_show(seq, v, true);
	} else {
		/* Drop the file/task references held by
		 * task_file_seq_get_next().
		 */
		fput((struct file *)v);
		put_task_struct(info->task);
		info->task = NULL;
	}
}

/* Shared init: pin the opener's pid namespace and copy the attach
 * parameters into the seq private data.
 */
static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	common->type = aux->task.type;
	common->pid = aux->task.pid;

	return 0;
}

static void fini_seq_pidns(void *priv_data)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
	.start = task_file_seq_start,
	.next = task_file_seq_next,
	.stop = task_file_seq_stop,
	.show = task_file_seq_show,
};

struct bpf_iter_seq_task_vma_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * this is assumed by {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	u32 tid;
	unsigned long prev_vm_start;
	unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,   /* use find_vma() with addr 0 */
	task_vma_iter_next_vma,    /* use vma_next() with curr_vma */
	task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	struct mm_struct *curr_mm;
	u32 saved_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, holds a refcount on mm->mm_users, and holds
	 * read lock on vma->mm->mmap_lock.
	 * If this function returns NULL, it does not hold any reference or
	 * lock.
	 */
	if (info->task) {
		curr_task = info->task;
		curr_vma = info->vma;
		curr_mm = info->mm;
		/* In case of lock contention, drop mmap_lock to unblock
		 * the writer.
		 *
		 * After relock, call find(mm, prev_vm_end - 1) to find
		 * new vma to process.
		 *
		 * +------+------+-----------+
		 * | VMA1 | VMA2 | VMA3      |
		 * +------+------+-----------+
		 * |      |      |           |
		 * 4k     8k     16k         400k
		 *
		 * For example, curr_vma == VMA2. Before unlock, we set
		 *
		 *    prev_vm_start = 8k
		 *    prev_vm_end   = 16k
		 *
		 * There are a few cases:
		 *
		 * 1) VMA2 is freed, but VMA3 exists.
		 *
		 *    find_vma() will return VMA3, just process VMA3.
		 *
		 * 2) VMA2 still exists.
		 *
		 *    find_vma() will return VMA2, process VMA2->next.
		 *
		 * 3) no more vma in this mm.
		 *
		 *    Process the next task.
		 *
		 * 4) find_vma() returns a different vma, VMA2'.
		 *
		 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
		 *         because we already covered the range;
		 *    4.2) VMA2 and VMA2' covers different ranges, process
		 *         VMA2'.
		 */
		if (mmap_lock_is_contended(curr_mm)) {
			info->prev_vm_start = curr_vma->vm_start;
			info->prev_vm_end = curr_vma->vm_end;
			op = task_vma_iter_find_vma;
			mmap_read_unlock(curr_mm);
			if (mmap_read_lock_killable(curr_mm)) {
				mmput(curr_mm);
				goto finish;
			}
		} else {
			op = task_vma_iter_next_vma;
		}
	} else {
again:
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->tid++;
			goto finish;
		}

		if (saved_tid != info->tid) {
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
			/* Found the same tid, which means the user space
			 * finished data in previous buffer and read more.
			 * We dropped mmap_lock before returning to user
			 * space, so it is necessary to use find_vma() to
			 * find the next vma to process.
			 */
			op = task_vma_iter_find_vma;
		}

		curr_mm = get_task_mm(curr_task);
		if (!curr_mm)
			goto next_task;

		if (mmap_read_lock_killable(curr_mm)) {
			mmput(curr_mm);
			goto finish;
		}
	}

	switch (op) {
	case task_vma_iter_first_vma:
		curr_vma = find_vma(curr_mm, 0);
		break;
	case task_vma_iter_next_vma:
		curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	case task_vma_iter_find_vma:
		/* We dropped mmap_lock so it is necessary to use find_vma
		 * to find the next vma. This is similar to the  mechanism
		 * in show_smaps_rollup().
		 */
		curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
		/* case 1) and 4.2) above just use curr_vma */

		/* check for case 2) or case 4.1) above */
		if (curr_vma &&
		    curr_vma->vm_start == info->prev_vm_start &&
		    curr_vma->vm_end == info->prev_vm_end)
			curr_vma = find_vma(curr_mm, curr_vma->vm_end);
		break;
	}
	if (!curr_vma) {
		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
		mmap_read_unlock(curr_mm);
		mmput(curr_mm);
		goto next_task;
	}
	info->task = curr_task;
	info->vma = curr_vma;
	info->mm = curr_mm;
	return curr_vma;

next_task:
	if (info->common.type == BPF_TASK_ITER_TID)
		goto finish;

	put_task_struct(curr_task);
	info->task = NULL;
	info->mm = NULL;
	info->tid++;
	goto again;

finish:
	if (curr_task)
		put_task_struct(curr_task);
	info->task = NULL;
	info->vma = NULL;
	info->mm = NULL;
	return NULL;
}

static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct vm_area_struct *vma;

	vma = task_vma_seq_get_next(info);
	if (vma && *pos == 0)
		++*pos;

	return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	++*pos;
	return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
		     struct task_struct *task, struct vm_area_struct *vma)

static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct bpf_iter__task_vma ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.vma = info->vma;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
	return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	if (!v) {
		(void)__task_vma_seq_show(seq, true);
	} else {
		/* info->vma has not been seen by the BPF program. If the
		 * user space reads more, task_vma_seq_get_next should
		 * return this vma again. Set prev_vm_start to ~0UL,
		 * so that we don't skip the vma returned by the next
		 * find_vma() (case task_vma_iter_find_vma in
		 * task_vma_seq_get_next()).
		 */
		info->prev_vm_start = ~0UL;
		info->prev_vm_end = info->vma->vm_end;
		mmap_read_unlock(info->mm);
		mmput(info->mm);
		info->mm = NULL;
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static const struct seq_operations task_vma_seq_ops = {
	.start = task_vma_seq_start,
	.next = task_vma_seq_next,
	.stop = task_vma_seq_stop,
	.show = task_vma_seq_show,
};

static const struct bpf_iter_seq_info task_seq_info = {
	.seq_ops = &task_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};

/* Report the attach target (tid or pid) back to user space via
 * bpf_link_info; nothing to report for BPF_TASK_ITER_ALL.
 */
static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
{
	switch (aux->task.type) {
	case BPF_TASK_ITER_TID:
		info->iter.task.tid = aux->task.pid;
		break;
	case BPF_TASK_ITER_TGID:
		info->iter.task.pid = aux->task.pid;
		break;
	default:
		break;
	}
	return 0;
}

static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
{
	seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
	if (aux->task.type == BPF_TASK_ITER_TID)
		seq_printf(seq, "tid:\t%u\n", aux->task.pid);
	else if (aux->task.type == BPF_TASK_ITER_TGID)
		seq_printf(seq, "pid:\t%u\n", aux->task.pid);
}

static struct bpf_iter_reg task_reg_info = {
	.target			= "task",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.seq_info		= &task_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
	.seq_ops = &task_file_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
	.target			= "task_file",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_file, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_file, file),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_file_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops = &task_vma_seq_ops,
	.init_seq_private = init_seq_pidns,
	.fini_seq_private = fini_seq_pidns,
	.seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
	.target			= "task_vma",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_vma_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

/* bpf_find_vma helper: look up the vma containing @start in @task's mm
 * and invoke @callback_fn on it.  Uses mmap_read_trylock() plus an
 * irq_work-based unlock so it is usable from contexts that cannot
 * block on mmap_lock; returns -EBUSY if the lock cannot be taken.
 */
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct mm_struct *mm;
	int ret = -ENOENT;

	if (flags)
		return -EINVAL;

	if (!task)
		return -ENOENT;

	mm = task->mm;
	if (!mm)
		return -ENOENT;

	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	if (irq_work_busy || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);

	if (vma && vma->vm_start <= start && vma->vm_end > start) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	}
	bpf_mmap_unlock_mm(work, mm);
	return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
	.func		= bpf_find_vma,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_FUNC,
	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg5_type	= ARG_ANYTHING,
};

/* mmput() variant safe for the iterator teardown path; async flavor is
 * only available with CONFIG_MMU.
 */
static inline void bpf_iter_mmput_async(struct mm_struct *mm)
{
#ifdef CONFIG_MMU
	mmput_async(mm);
#else
	mmput(mm);
#endif
}

struct bpf_iter_task_vma_kern_data {
	struct task_struct *task;	/* referenced target task */
	struct mm_struct *mm;		/* mm with mm_users reference held */
	struct vm_area_struct snapshot;	/* copy of the last returned vma */
	u64 next_addr;			/* resume address for the next walk */
};

struct bpf_iter_task_vma {
	/* opaque iterator state; having __u64 here allows to preserve correct
	 * alignment requirements in vmlinux.h, generated from BTF
	 */
	__u64 __opaque[1];
} __attribute__((aligned(8)));

/* Non-opaque version of bpf_iter_task_vma */
struct bpf_iter_task_vma_kern {
	struct bpf_iter_task_vma_kern_data *data;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

/* Initialize an open-coded task_vma iterator starting at @addr.
 * On failure kit->data is left NULL so _next()/_destroy() become no-ops.
 */
__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
				      struct task_struct *task, u64 addr)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;
	int err;

	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));

	/* The iterator relies on lock_vma_under_rcu(), which needs
	 * per-VMA locking support.
	 */
	if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
		kit->data = NULL;
		return -EOPNOTSUPP;
	}

	/*
	 * Reject irqs-disabled contexts including NMI. Operations used
	 * by _next() and _destroy() (vma_end_read, fput, bpf_iter_mmput_async)
	 * can take spinlocks with IRQs disabled (pi_lock, pool->lock).
	 * Running from NMI or from a tracepoint that fires with those
	 * locks held could deadlock.
	 */
	if (irqs_disabled()) {
		kit->data = NULL;
		return -EBUSY;
	}

	/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
	 * before, so non-NULL kit->data doesn't point to previously
	 * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
	 */
	kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
	if (!kit->data)
		return -ENOMEM;

	kit->data->task = get_task_struct(task);
	/*
	 * Safely read task->mm and acquire an mm reference.
	 *
	 * Cannot use get_task_mm() because its task_lock() is a
	 * blocking spin_lock that would deadlock if the target task
	 * already holds alloc_lock on this CPU (e.g. a softirq BPF
	 * program iterating a task interrupted while holding its
	 * alloc_lock).
	 */
	if (!spin_trylock(&task->alloc_lock)) {
		err = -EBUSY;
		goto err_cleanup_iter;
	}
	kit->data->mm = task->mm;
	if (kit->data->mm && !(task->flags & PF_KTHREAD))
		mmget(kit->data->mm);
	else
		kit->data->mm = NULL;
	spin_unlock(&task->alloc_lock);
	if (!kit->data->mm) {
		err = -ENOENT;
		goto err_cleanup_iter;
	}

	/* NULL vm_file marks the snapshot as "nothing to fput yet". */
	kit->data->snapshot.vm_file = NULL;
	kit->data->next_addr = addr;
	return 0;

err_cleanup_iter:
	put_task_struct(kit->data->task);
	bpf_mem_free(&bpf_global_ma, kit->data);
	/* NULL kit->data signals failed bpf_iter_task_vma initialization */
	kit->data = NULL;
	return err;
}

/*
 * Find and lock the next VMA at or after data->next_addr.
 *
 * lock_vma_under_rcu() is a point lookup (mas_walk): it finds the VMA
 * containing a given address but cannot iterate. An RCU-protected
 * maple tree walk with vma_next() (mas_find) is needed first to locate
 * the next VMA's vm_start across any gap.
 *
 * Between the RCU walk and the lock, the VMA may be removed, shrunk,
 * or write-locked. On failure, advance past it using vm_end from the
 * RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall back
 * to PAGE_SIZE advancement to guarantee forward progress.
 */
static struct vm_area_struct *
bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
{
	struct vm_area_struct *vma;
	struct vma_iterator vmi;
	unsigned long start, end;

retry:
	rcu_read_lock();
	vma_iter_init(&vmi, data->mm, data->next_addr);
	vma = vma_next(&vmi);
	if (!vma) {
		rcu_read_unlock();
		return NULL;
	}
	start = vma->vm_start;
	end = vma->vm_end;
	rcu_read_unlock();

	vma = lock_vma_under_rcu(data->mm, start);
	if (!vma) {
		/* Lock failed: skip the range seen during the RCU walk,
		 * or a single page if that range would not move us
		 * forward (stale vm_end).
		 */
		if (end <= data->next_addr)
			data->next_addr += PAGE_SIZE;
		else
			data->next_addr = end;
		goto retry;
	}

	if (unlikely(vma->vm_end <= data->next_addr)) {
		data->next_addr += PAGE_SIZE;
		vma_end_read(vma);
		goto retry;
	}

	return vma;
}

/* Drop the vm_file reference held by the previous snapshot, if any. */
static void bpf_iter_task_vma_snapshot_reset(struct vm_area_struct *snap)
{
	if (snap->vm_file) {
		fput(snap->vm_file);
		snap->vm_file = NULL;
	}
}

/* Return a snapshot of the next vma, or NULL at the end of the address
 * space.  The returned pointer is to the iterator-owned snapshot, not
 * the live vma; the per-VMA lock is dropped before returning.
 */
__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;
	struct vm_area_struct *snap, *vma;

	if (!kit->data) /* bpf_iter_task_vma_new failed */
		return NULL;

	snap = &kit->data->snapshot;

	bpf_iter_task_vma_snapshot_reset(snap);

	vma = bpf_iter_task_vma_find_next(kit->data);
	if (!vma)
		return NULL;

	memcpy(snap, vma, sizeof(*snap));

	/*
	 * The verifier only trusts vm_mm and vm_file (see
	 * BTF_TYPE_SAFE_TRUSTED_OR_NULL in verifier.c). Take a reference
	 * on vm_file; vm_mm is already correct because lock_vma_under_rcu()
	 * verifies vma->vm_mm == mm. All other pointers are untrusted by
	 * the verifier and left as-is.
	 */
	if (snap->vm_file)
		get_file(snap->vm_file);

	kit->data->next_addr = vma->vm_end;
	vma_end_read(vma);
	return snap;
}

__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
{
	struct bpf_iter_task_vma_kern *kit = (void *)it;

	if (kit->data) {
		bpf_iter_task_vma_snapshot_reset(&kit->data->snapshot);
		put_task_struct(kit->data->task);
		bpf_iter_mmput_async(kit->data->mm);
		bpf_mem_free(&bpf_global_ma, kit->data);
	}
}

__bpf_kfunc_end_defs();

#ifdef CONFIG_CGROUPS

struct bpf_iter_css_task {
	__u64 __opaque[1];
} __attribute__((aligned(8)));

struct bpf_iter_css_task_kern {
	struct css_task_iter *css_it;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

/* Initialize an open-coded css task iterator over @css.  Only the
 * css_task_iter flag combinations listed below are accepted.
 */
__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
		struct cgroup_subsys_state *css, unsigned int flags)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
					__alignof__(struct bpf_iter_css_task));
	kit->css_it = NULL;
	switch (flags) {
	case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
	case CSS_TASK_ITER_PROCS:
	case 0:
		break;
	default:
		return -EINVAL;
	}

	kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
	if (!kit->css_it)
		return -ENOMEM;
	css_task_iter_start(css, flags, kit->css_it);
	return 0;
}

__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	if (!kit->css_it)
		return NULL;
	return css_task_iter_next(kit->css_it);
}

__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
{
	struct bpf_iter_css_task_kern *kit = (void *)it;

	if (!kit->css_it)
		return;
	css_task_iter_end(kit->css_it);
	bpf_mem_free(&bpf_global_ma, kit->css_it);
}

__bpf_kfunc_end_defs();

#endif /* CONFIG_CGROUPS */

struct bpf_iter_task {
	__u64 __opaque[3];
} __attribute__((aligned(8)));

struct bpf_iter_task_kern {
	struct task_struct *task;
	struct task_struct *pos;
	unsigned int flags;
} __attribute__((aligned(8)));

enum {
	/* all process in the system */
	BPF_TASK_ITER_ALL_PROCS,
	/* all threads in the system */
	BPF_TASK_ITER_ALL_THREADS,
	/* all threads of a specific process */
	BPF_TASK_ITER_PROC_THREADS
};

__bpf_kfunc_start_defs();

/* Initialize an open-coded task iterator.  task__nullable is required
 * (and used as the starting group leader) only for
 * BPF_TASK_ITER_PROC_THREADS; the system-wide modes start at init_task.
 */
__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
		struct task_struct *task__nullable, unsigned int flags)
{
	struct bpf_iter_task_kern *kit = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
				     __alignof__(struct bpf_iter_task));

	kit->pos = NULL;

	switch (flags) {
	case BPF_TASK_ITER_ALL_THREADS:
	case BPF_TASK_ITER_ALL_PROCS:
		break;
	case BPF_TASK_ITER_PROC_THREADS:
		if (!task__nullable)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (flags == BPF_TASK_ITER_PROC_THREADS)
		kit->task = task__nullable;
	else
		kit->task = &init_task;
	kit->pos = kit->task;
	kit->flags = flags;
	return 0;
}

/* Advance the iterator and return the task at the current position,
 * or NULL once the walk wraps back to init_task.
 */
__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
{
	struct bpf_iter_task_kern *kit = (void *)it;
	struct task_struct *pos;
	unsigned int flags;

	flags = kit->flags;
	pos = kit->pos;

	if (!pos)
		return pos;

	if (flags == BPF_TASK_ITER_ALL_PROCS)
		goto get_next_task;

	/* Thread modes: walk threads of the current group first. */
	kit->pos = __next_thread(kit->pos);
	if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
		return pos;

get_next_task:
	kit->task = next_task(kit->task);
	if (kit->task == &init_task)
		kit->pos = NULL;
	else
		kit->pos = kit->task;

	return pos;
}

__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
{
}

__bpf_kfunc_end_defs();

DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

/* irq_work callback used by bpf_find_vma()'s deferred unlock path. */
static void do_mmap_read_unlock(struct irq_work *entry)
{
	struct mmap_unlock_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

/* Register the task, task_file and task_vma iterator targets and set
 * up the per-cpu deferred-unlock irq_work.
 */
static int __init task_iter_init(void)
{
	struct mmap_unlock_irq_work *work;
	int ret, cpu;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&mmap_unlock_work, cpu);
		init_irq_work(&work->irq_work, do_mmap_read_unlock);
	}

	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	ret = bpf_iter_reg_target(&task_reg_info);
	if (ret)
		return ret;

	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
	ret = bpf_iter_reg_target(&task_file_reg_info);
	if (ret)
		return ret;

	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
	return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);