1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Performance events core code: 4 * 5 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 7 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra 8 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 9 */ 10 11 #include <linux/fs.h> 12 #include <linux/mm.h> 13 #include <linux/cpu.h> 14 #include <linux/smp.h> 15 #include <linux/idr.h> 16 #include <linux/file.h> 17 #include <linux/poll.h> 18 #include <linux/slab.h> 19 #include <linux/hash.h> 20 #include <linux/tick.h> 21 #include <linux/sysfs.h> 22 #include <linux/dcache.h> 23 #include <linux/percpu.h> 24 #include <linux/ptrace.h> 25 #include <linux/reboot.h> 26 #include <linux/vmstat.h> 27 #include <linux/device.h> 28 #include <linux/export.h> 29 #include <linux/vmalloc.h> 30 #include <linux/hardirq.h> 31 #include <linux/hugetlb.h> 32 #include <linux/rculist.h> 33 #include <linux/uaccess.h> 34 #include <linux/syscalls.h> 35 #include <linux/anon_inodes.h> 36 #include <linux/kernel_stat.h> 37 #include <linux/cgroup.h> 38 #include <linux/perf_event.h> 39 #include <linux/trace_events.h> 40 #include <linux/hw_breakpoint.h> 41 #include <linux/mm_types.h> 42 #include <linux/module.h> 43 #include <linux/mman.h> 44 #include <linux/compat.h> 45 #include <linux/bpf.h> 46 #include <linux/filter.h> 47 #include <linux/namei.h> 48 #include <linux/parser.h> 49 #include <linux/sched/clock.h> 50 #include <linux/sched/mm.h> 51 #include <linux/proc_ns.h> 52 #include <linux/mount.h> 53 #include <linux/min_heap.h> 54 #include <linux/highmem.h> 55 #include <linux/pgtable.h> 56 #include <linux/buildid.h> 57 #include <linux/task_work.h> 58 #include <linux/percpu-rwsem.h> 59 60 #include "internal.h" 61 62 #include <asm/irq_regs.h> 63 64 typedef int (*remote_function_f)(void *); 65 66 struct remote_function_call { 67 struct task_struct *p; 68 remote_function_f func; 69 void *info; 70 int ret; 71 }; 72 73 static void remote_function(void *data) 74 { 75 struct remote_function_call *tfc = data; 76 struct task_struct *p = tfc->p; 77 78 if (p) { 79 /* -EAGAIN */ 80 if (task_cpu(p) != smp_processor_id()) 81 return; 82 83 /* 84 * Now that we're on right CPU with IRQs disabled, we can test 85 * if we hit the right task without races. 86 */ 87 88 tfc->ret = -ESRCH; /* No such (running) process */ 89 if (p != current) 90 return; 91 } 92 93 tfc->ret = tfc->func(tfc->info); 94 } 95 96 /** 97 * task_function_call - call a function on the cpu on which a task runs 98 * @p: the task to evaluate 99 * @func: the function to be called 100 * @info: the function call argument 101 * 102 * Calls the function @func when the task is currently running. This might 103 * be on the current CPU, which just calls the function directly. This will 104 * retry due to any failures in smp_call_function_single(), such as if the 105 * task_cpu() goes offline concurrently. 
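 * (This is the workhorse behind event_function_call() below, which needs its
 * updates to run on the CPU where the target task is currently running.)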
106 * 107 * returns @func return value or -ESRCH or -ENXIO when the process isn't running 108 */ 109 static int 110 task_function_call(struct task_struct *p, remote_function_f func, void *info) 111 { 112 struct remote_function_call data = { 113 .p = p, 114 .func = func, 115 .info = info, 116 .ret = -EAGAIN, 117 }; 118 int ret; 119 120 for (;;) { 121 ret = smp_call_function_single(task_cpu(p), remote_function, 122 &data, 1); 123 if (!ret) 124 ret = data.ret; 125 126 if (ret != -EAGAIN) 127 break; 128 129 cond_resched(); 130 } 131 132 return ret; 133 } 134 135 /** 136 * cpu_function_call - call a function on the cpu 137 * @cpu: target cpu to queue this function 138 * @func: the function to be called 139 * @info: the function call argument 140 * 141 * Calls the function @func on the remote cpu. 142 * 143 * returns: @func return value or -ENXIO when the cpu is offline 144 */ 145 static int cpu_function_call(int cpu, remote_function_f func, void *info) 146 { 147 struct remote_function_call data = { 148 .p = NULL, 149 .func = func, 150 .info = info, 151 .ret = -ENXIO, /* No such CPU */ 152 }; 153 154 smp_call_function_single(cpu, remote_function, &data, 1); 155 156 return data.ret; 157 } 158 159 enum event_type_t { 160 EVENT_FLEXIBLE = 0x01, 161 EVENT_PINNED = 0x02, 162 EVENT_TIME = 0x04, 163 EVENT_FROZEN = 0x08, 164 /* see ctx_resched() for details */ 165 EVENT_CPU = 0x10, 166 EVENT_CGROUP = 0x20, 167 168 /* compound helpers */ 169 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 170 EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, 171 }; 172 173 static inline void __perf_ctx_lock(struct perf_event_context *ctx) 174 { 175 raw_spin_lock(&ctx->lock); 176 WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); 177 } 178 179 static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 180 struct perf_event_context *ctx) 181 { 182 __perf_ctx_lock(&cpuctx->ctx); 183 if (ctx) 184 __perf_ctx_lock(ctx); 185 } 186 187 static inline void __perf_ctx_unlock(struct perf_event_context *ctx) 188 { 189 /* 190 * If ctx_sched_in() didn't again set any ALL flags, clean up 191 * after ctx_sched_out() by clearing is_active. 192 */ 193 if (ctx->is_active & EVENT_FROZEN) { 194 if (!(ctx->is_active & EVENT_ALL)) 195 ctx->is_active = 0; 196 else 197 ctx->is_active &= ~EVENT_FROZEN; 198 } 199 raw_spin_unlock(&ctx->lock); 200 } 201 202 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 203 struct perf_event_context *ctx) 204 { 205 if (ctx) 206 __perf_ctx_unlock(ctx); 207 __perf_ctx_unlock(&cpuctx->ctx); 208 } 209 210 #define TASK_TOMBSTONE ((void *)-1L) 211 212 static bool is_kernel_event(struct perf_event *event) 213 { 214 return READ_ONCE(event->owner) == TASK_TOMBSTONE; 215 } 216 217 static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 218 219 struct perf_event_context *perf_cpu_task_ctx(void) 220 { 221 lockdep_assert_irqs_disabled(); 222 return this_cpu_ptr(&perf_cpu_context)->task_ctx; 223 } 224 225 /* 226 * On task ctx scheduling... 227 * 228 * When !ctx->nr_events a task context will not be scheduled. This means 229 * we can disable the scheduler hooks (for performance) without leaving 230 * pending task ctx state. 231 * 232 * This however results in two special cases: 233 * 234 * - removing the last event from a task ctx; this is relatively straight 235 * forward and is done in __perf_remove_from_context. 236 * 237 * - adding the first event to a task ctx; this is tricky because we cannot 238 * rely on ctx->is_active and therefore cannot use event_function_call(). 239 * See perf_install_in_context(). 
240 * 241 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 242 */ 243 244 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, 245 struct perf_event_context *, void *); 246 247 struct event_function_struct { 248 struct perf_event *event; 249 event_f func; 250 void *data; 251 }; 252 253 static int event_function(void *info) 254 { 255 struct event_function_struct *efs = info; 256 struct perf_event *event = efs->event; 257 struct perf_event_context *ctx = event->ctx; 258 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 259 struct perf_event_context *task_ctx = cpuctx->task_ctx; 260 int ret = 0; 261 262 lockdep_assert_irqs_disabled(); 263 264 perf_ctx_lock(cpuctx, task_ctx); 265 /* 266 * Since we do the IPI call without holding ctx->lock things can have 267 * changed, double check we hit the task we set out to hit. 268 */ 269 if (ctx->task) { 270 if (ctx->task != current) { 271 ret = -ESRCH; 272 goto unlock; 273 } 274 275 /* 276 * We only use event_function_call() on established contexts, 277 * and event_function() is only ever called when active (or 278 * rather, we'll have bailed in task_function_call() or the 279 * above ctx->task != current test), therefore we must have 280 * ctx->is_active here. 281 */ 282 WARN_ON_ONCE(!ctx->is_active); 283 /* 284 * And since we have ctx->is_active, cpuctx->task_ctx must 285 * match. 286 */ 287 WARN_ON_ONCE(task_ctx != ctx); 288 } else { 289 WARN_ON_ONCE(&cpuctx->ctx != ctx); 290 } 291 292 efs->func(event, cpuctx, ctx, efs->data); 293 unlock: 294 perf_ctx_unlock(cpuctx, task_ctx); 295 296 return ret; 297 } 298 299 static void event_function_call(struct perf_event *event, event_f func, void *data) 300 { 301 struct perf_event_context *ctx = event->ctx; 302 struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ 303 struct perf_cpu_context *cpuctx; 304 struct event_function_struct efs = { 305 .event = event, 306 .func = func, 307 .data = data, 308 }; 309 310 if (!event->parent) { 311 /* 312 * If this is a !child event, we must hold ctx::mutex to 313 * stabilize the event->ctx relation. See 314 * perf_event_ctx_lock(). 315 */ 316 lockdep_assert_held(&ctx->mutex); 317 } 318 319 if (!task) { 320 cpu_function_call(event->cpu, event_function, &efs); 321 return; 322 } 323 324 if (task == TASK_TOMBSTONE) 325 return; 326 327 again: 328 if (!task_function_call(task, event_function, &efs)) 329 return; 330 331 local_irq_disable(); 332 cpuctx = this_cpu_ptr(&perf_cpu_context); 333 perf_ctx_lock(cpuctx, ctx); 334 /* 335 * Reload the task pointer, it might have been changed by 336 * a concurrent perf_event_context_sched_out(). 337 */ 338 task = ctx->task; 339 if (task == TASK_TOMBSTONE) 340 goto unlock; 341 if (ctx->is_active) { 342 perf_ctx_unlock(cpuctx, ctx); 343 local_irq_enable(); 344 goto again; 345 } 346 func(event, NULL, ctx, data); 347 unlock: 348 perf_ctx_unlock(cpuctx, ctx); 349 local_irq_enable(); 350 } 351 352 /* 353 * Similar to event_function_call() + event_function(), but hard assumes IRQs 354 * are already disabled and we're on the right CPU. 
355 */ 356 static void event_function_local(struct perf_event *event, event_f func, void *data) 357 { 358 struct perf_event_context *ctx = event->ctx; 359 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 360 struct task_struct *task = READ_ONCE(ctx->task); 361 struct perf_event_context *task_ctx = NULL; 362 363 lockdep_assert_irqs_disabled(); 364 365 if (task) { 366 if (task == TASK_TOMBSTONE) 367 return; 368 369 task_ctx = ctx; 370 } 371 372 perf_ctx_lock(cpuctx, task_ctx); 373 374 task = ctx->task; 375 if (task == TASK_TOMBSTONE) 376 goto unlock; 377 378 if (task) { 379 /* 380 * We must be either inactive or active and the right task, 381 * otherwise we're screwed, since we cannot IPI to somewhere 382 * else. 383 */ 384 if (ctx->is_active) { 385 if (WARN_ON_ONCE(task != current)) 386 goto unlock; 387 388 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) 389 goto unlock; 390 } 391 } else { 392 WARN_ON_ONCE(&cpuctx->ctx != ctx); 393 } 394 395 func(event, cpuctx, ctx, data); 396 unlock: 397 perf_ctx_unlock(cpuctx, task_ctx); 398 } 399 400 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 401 PERF_FLAG_FD_OUTPUT |\ 402 PERF_FLAG_PID_CGROUP |\ 403 PERF_FLAG_FD_CLOEXEC) 404 405 /* 406 * branch priv levels that need permission checks 407 */ 408 #define PERF_SAMPLE_BRANCH_PERM_PLM \ 409 (PERF_SAMPLE_BRANCH_KERNEL |\ 410 PERF_SAMPLE_BRANCH_HV) 411 412 /* 413 * perf_sched_events : >0 events exist 414 */ 415 416 static void perf_sched_delayed(struct work_struct *work); 417 DEFINE_STATIC_KEY_FALSE(perf_sched_events); 418 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); 419 static DEFINE_MUTEX(perf_sched_mutex); 420 static atomic_t perf_sched_count; 421 422 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); 423 424 static atomic_t nr_mmap_events __read_mostly; 425 static atomic_t nr_comm_events __read_mostly; 426 static atomic_t nr_namespaces_events __read_mostly; 427 static atomic_t nr_task_events __read_mostly; 428 static atomic_t nr_freq_events __read_mostly; 429 static atomic_t nr_switch_events __read_mostly; 430 static atomic_t nr_ksymbol_events __read_mostly; 431 static atomic_t nr_bpf_events __read_mostly; 432 static atomic_t nr_cgroup_events __read_mostly; 433 static atomic_t nr_text_poke_events __read_mostly; 434 static atomic_t nr_build_id_events __read_mostly; 435 436 static LIST_HEAD(pmus); 437 static DEFINE_MUTEX(pmus_lock); 438 static struct srcu_struct pmus_srcu; 439 static cpumask_var_t perf_online_mask; 440 static cpumask_var_t perf_online_core_mask; 441 static cpumask_var_t perf_online_die_mask; 442 static cpumask_var_t perf_online_cluster_mask; 443 static cpumask_var_t perf_online_pkg_mask; 444 static cpumask_var_t perf_online_sys_mask; 445 static struct kmem_cache *perf_event_cache; 446 447 /* 448 * perf event paranoia level: 449 * -1 - not paranoid at all 450 * 0 - disallow raw tracepoint access for unpriv 451 * 1 - disallow cpu events for unpriv 452 * 2 - disallow kernel profiling for unpriv 453 */ 454 int sysctl_perf_event_paranoid __read_mostly = 2; 455 456 /* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. 
*/ 457 static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); 458 459 /* 460 * max perf event sample rate 461 */ 462 #define DEFAULT_MAX_SAMPLE_RATE 100000 463 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) 464 #define DEFAULT_CPU_TIME_MAX_PERCENT 25 465 466 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 467 static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; 468 469 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 470 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 471 472 static int perf_sample_allowed_ns __read_mostly = 473 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; 474 475 static void update_perf_cpu_limits(void) 476 { 477 u64 tmp = perf_sample_period_ns; 478 479 tmp *= sysctl_perf_cpu_time_max_percent; 480 tmp = div_u64(tmp, 100); 481 if (!tmp) 482 tmp = 1; 483 484 WRITE_ONCE(perf_sample_allowed_ns, tmp); 485 } 486 487 static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); 488 489 static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, 490 void *buffer, size_t *lenp, loff_t *ppos) 491 { 492 int ret; 493 int perf_cpu = sysctl_perf_cpu_time_max_percent; 494 /* 495 * If throttling is disabled don't allow the write: 496 */ 497 if (write && (perf_cpu == 100 || perf_cpu == 0)) 498 return -EINVAL; 499 500 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 501 if (ret || !write) 502 return ret; 503 504 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 505 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 506 update_perf_cpu_limits(); 507 508 return 0; 509 } 510 511 static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, 512 void *buffer, size_t *lenp, loff_t *ppos) 513 { 514 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 515 516 if (ret || !write) 517 return ret; 518 519 if (sysctl_perf_cpu_time_max_percent == 100 || 520 sysctl_perf_cpu_time_max_percent == 0) { 521 printk(KERN_WARNING 522 "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); 523 WRITE_ONCE(perf_sample_allowed_ns, 0); 524 } else { 525 update_perf_cpu_limits(); 526 } 527 528 return 0; 529 } 530 531 static const struct ctl_table events_core_sysctl_table[] = { 532 /* 533 * User-space relies on this file as a feature check for 534 * perf_events being enabled. It's an ABI, do not remove! 
535 */ 536 { 537 .procname = "perf_event_paranoid", 538 .data = &sysctl_perf_event_paranoid, 539 .maxlen = sizeof(sysctl_perf_event_paranoid), 540 .mode = 0644, 541 .proc_handler = proc_dointvec, 542 }, 543 { 544 .procname = "perf_event_mlock_kb", 545 .data = &sysctl_perf_event_mlock, 546 .maxlen = sizeof(sysctl_perf_event_mlock), 547 .mode = 0644, 548 .proc_handler = proc_dointvec, 549 }, 550 { 551 .procname = "perf_event_max_sample_rate", 552 .data = &sysctl_perf_event_sample_rate, 553 .maxlen = sizeof(sysctl_perf_event_sample_rate), 554 .mode = 0644, 555 .proc_handler = perf_event_max_sample_rate_handler, 556 .extra1 = SYSCTL_ONE, 557 }, 558 { 559 .procname = "perf_cpu_time_max_percent", 560 .data = &sysctl_perf_cpu_time_max_percent, 561 .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), 562 .mode = 0644, 563 .proc_handler = perf_cpu_time_max_percent_handler, 564 .extra1 = SYSCTL_ZERO, 565 .extra2 = SYSCTL_ONE_HUNDRED, 566 }, 567 }; 568 569 static int __init init_events_core_sysctls(void) 570 { 571 register_sysctl_init("kernel", events_core_sysctl_table); 572 return 0; 573 } 574 core_initcall(init_events_core_sysctls); 575 576 577 /* 578 * perf samples are done in some very critical code paths (NMIs). 579 * If they take too much CPU time, the system can lock up and not 580 * get any real work done. This will drop the sample rate when 581 * we detect that events are taking too long. 582 */ 583 #define NR_ACCUMULATED_SAMPLES 128 584 static DEFINE_PER_CPU(u64, running_sample_length); 585 586 static u64 __report_avg; 587 static u64 __report_allowed; 588 589 static void perf_duration_warn(struct irq_work *w) 590 { 591 printk_ratelimited(KERN_INFO 592 "perf: interrupt took too long (%lld > %lld), lowering " 593 "kernel.perf_event_max_sample_rate to %d\n", 594 __report_avg, __report_allowed, 595 sysctl_perf_event_sample_rate); 596 } 597 598 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); 599 600 void perf_sample_event_took(u64 sample_len_ns) 601 { 602 u64 max_len = READ_ONCE(perf_sample_allowed_ns); 603 u64 running_len; 604 u64 avg_len; 605 u32 max; 606 607 if (max_len == 0) 608 return; 609 610 /* Decay the counter by 1 average sample. */ 611 running_len = __this_cpu_read(running_sample_length); 612 running_len -= running_len/NR_ACCUMULATED_SAMPLES; 613 running_len += sample_len_ns; 614 __this_cpu_write(running_sample_length, running_len); 615 616 /* 617 * Note: this will be biased artificially low until we have 618 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us 619 * from having to maintain a count. 620 */ 621 avg_len = running_len/NR_ACCUMULATED_SAMPLES; 622 if (avg_len <= max_len) 623 return; 624 625 __report_avg = avg_len; 626 __report_allowed = max_len; 627 628 /* 629 * Compute a throttle threshold 25% below the current duration. 
630 */ 631 avg_len += avg_len / 4; 632 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; 633 if (avg_len < max) 634 max /= (u32)avg_len; 635 else 636 max = 1; 637 638 WRITE_ONCE(perf_sample_allowed_ns, avg_len); 639 WRITE_ONCE(max_samples_per_tick, max); 640 641 sysctl_perf_event_sample_rate = max * HZ; 642 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 643 644 if (!irq_work_queue(&perf_duration_work)) { 645 early_printk("perf: interrupt took too long (%lld > %lld), lowering " 646 "kernel.perf_event_max_sample_rate to %d\n", 647 __report_avg, __report_allowed, 648 sysctl_perf_event_sample_rate); 649 } 650 } 651 652 static atomic64_t perf_event_id; 653 654 static void update_context_time(struct perf_event_context *ctx); 655 static u64 perf_event_time(struct perf_event *event); 656 657 void __weak perf_event_print_debug(void) { } 658 659 static inline u64 perf_clock(void) 660 { 661 return local_clock(); 662 } 663 664 static inline u64 perf_event_clock(struct perf_event *event) 665 { 666 return event->clock(); 667 } 668 669 /* 670 * State based event timekeeping... 671 * 672 * The basic idea is to use event->state to determine which (if any) time 673 * fields to increment with the current delta. This means we only need to 674 * update timestamps when we change state or when they are explicitly requested 675 * (read). 676 * 677 * Event groups make things a little more complicated, but not terribly so. The 678 * rules for a group are that if the group leader is OFF the entire group is 679 * OFF, irrespective of what the group member states are. This results in 680 * __perf_effective_state(). 681 * 682 * A further ramification is that when a group leader flips between OFF and 683 * !OFF, we need to update all group member times. 684 * 685 * 686 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we 687 * need to make sure the relevant context time is updated before we try and 688 * update our timestamps. 689 */ 690 691 static __always_inline enum perf_event_state 692 __perf_effective_state(struct perf_event *event) 693 { 694 struct perf_event *leader = event->group_leader; 695 696 if (leader->state <= PERF_EVENT_STATE_OFF) 697 return leader->state; 698 699 return event->state; 700 } 701 702 static __always_inline void 703 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) 704 { 705 enum perf_event_state state = __perf_effective_state(event); 706 u64 delta = now - event->tstamp; 707 708 *enabled = event->total_time_enabled; 709 if (state >= PERF_EVENT_STATE_INACTIVE) 710 *enabled += delta; 711 712 *running = event->total_time_running; 713 if (state >= PERF_EVENT_STATE_ACTIVE) 714 *running += delta; 715 } 716 717 static void perf_event_update_time(struct perf_event *event) 718 { 719 u64 now = perf_event_time(event); 720 721 __perf_update_times(event, now, &event->total_time_enabled, 722 &event->total_time_running); 723 event->tstamp = now; 724 } 725 726 static void perf_event_update_sibling_time(struct perf_event *leader) 727 { 728 struct perf_event *sibling; 729 730 for_each_sibling_event(sibling, leader) 731 perf_event_update_time(sibling); 732 } 733 734 static void 735 perf_event_set_state(struct perf_event *event, enum perf_event_state state) 736 { 737 if (event->state == state) 738 return; 739 740 perf_event_update_time(event); 741 /* 742 * If a group leader gets enabled/disabled all its siblings 743 * are affected too. 
744 */ 745 if ((event->state < 0) ^ (state < 0)) 746 perf_event_update_sibling_time(event); 747 748 WRITE_ONCE(event->state, state); 749 } 750 751 /* 752 * UP store-release, load-acquire 753 */ 754 755 #define __store_release(ptr, val) \ 756 do { \ 757 barrier(); \ 758 WRITE_ONCE(*(ptr), (val)); \ 759 } while (0) 760 761 #define __load_acquire(ptr) \ 762 ({ \ 763 __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ 764 barrier(); \ 765 ___p; \ 766 }) 767 768 #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ 769 list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ 770 if (_cgroup && !_epc->nr_cgroups) \ 771 continue; \ 772 else if (_pmu && _epc->pmu != _pmu) \ 773 continue; \ 774 else 775 776 static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) 777 { 778 struct perf_event_pmu_context *pmu_ctx; 779 780 for_each_epc(pmu_ctx, ctx, NULL, cgroup) 781 perf_pmu_disable(pmu_ctx->pmu); 782 } 783 784 static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) 785 { 786 struct perf_event_pmu_context *pmu_ctx; 787 788 for_each_epc(pmu_ctx, ctx, NULL, cgroup) 789 perf_pmu_enable(pmu_ctx->pmu); 790 } 791 792 static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); 793 static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); 794 795 #ifdef CONFIG_CGROUP_PERF 796 797 static inline bool 798 perf_cgroup_match(struct perf_event *event) 799 { 800 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 801 802 /* @event doesn't care about cgroup */ 803 if (!event->cgrp) 804 return true; 805 806 /* wants specific cgroup scope but @cpuctx isn't associated with any */ 807 if (!cpuctx->cgrp) 808 return false; 809 810 /* 811 * Cgroup scoping is recursive. An event enabled for a cgroup is 812 * also enabled for all its descendant cgroups. If @cpuctx's 813 * cgroup is a descendant of @event's (the test covers identity 814 * case), it's a match. 
815 */ 816 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, 817 event->cgrp->css.cgroup); 818 } 819 820 static inline void perf_detach_cgroup(struct perf_event *event) 821 { 822 css_put(&event->cgrp->css); 823 event->cgrp = NULL; 824 } 825 826 static inline int is_cgroup_event(struct perf_event *event) 827 { 828 return event->cgrp != NULL; 829 } 830 831 static inline u64 perf_cgroup_event_time(struct perf_event *event) 832 { 833 struct perf_cgroup_info *t; 834 835 t = per_cpu_ptr(event->cgrp->info, event->cpu); 836 return t->time; 837 } 838 839 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) 840 { 841 struct perf_cgroup_info *t; 842 843 t = per_cpu_ptr(event->cgrp->info, event->cpu); 844 if (!__load_acquire(&t->active)) 845 return t->time; 846 now += READ_ONCE(t->timeoffset); 847 return now; 848 } 849 850 static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) 851 { 852 if (adv) 853 info->time += now - info->timestamp; 854 info->timestamp = now; 855 /* 856 * see update_context_time() 857 */ 858 WRITE_ONCE(info->timeoffset, info->time - info->timestamp); 859 } 860 861 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) 862 { 863 struct perf_cgroup *cgrp = cpuctx->cgrp; 864 struct cgroup_subsys_state *css; 865 struct perf_cgroup_info *info; 866 867 if (cgrp) { 868 u64 now = perf_clock(); 869 870 for (css = &cgrp->css; css; css = css->parent) { 871 cgrp = container_of(css, struct perf_cgroup, css); 872 info = this_cpu_ptr(cgrp->info); 873 874 __update_cgrp_time(info, now, true); 875 if (final) 876 __store_release(&info->active, 0); 877 } 878 } 879 } 880 881 static inline void update_cgrp_time_from_event(struct perf_event *event) 882 { 883 struct perf_cgroup_info *info; 884 885 /* 886 * ensure we access cgroup data only when needed and 887 * when we know the cgroup is pinned (css_get) 888 */ 889 if (!is_cgroup_event(event)) 890 return; 891 892 info = this_cpu_ptr(event->cgrp->info); 893 /* 894 * Do not update time when cgroup is not active 895 */ 896 if (info->active) 897 __update_cgrp_time(info, perf_clock(), true); 898 } 899 900 static inline void 901 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 902 { 903 struct perf_event_context *ctx = &cpuctx->ctx; 904 struct perf_cgroup *cgrp = cpuctx->cgrp; 905 struct perf_cgroup_info *info; 906 struct cgroup_subsys_state *css; 907 908 /* 909 * ctx->lock held by caller 910 * ensure we do not access cgroup data 911 * unless we have the cgroup pinned (css_get) 912 */ 913 if (!cgrp) 914 return; 915 916 WARN_ON_ONCE(!ctx->nr_cgroups); 917 918 for (css = &cgrp->css; css; css = css->parent) { 919 cgrp = container_of(css, struct perf_cgroup, css); 920 info = this_cpu_ptr(cgrp->info); 921 __update_cgrp_time(info, ctx->timestamp, false); 922 __store_release(&info->active, 1); 923 } 924 } 925 926 /* 927 * reschedule events based on the cgroup constraint of task. 928 */ 929 static void perf_cgroup_switch(struct task_struct *task) 930 { 931 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 932 struct perf_cgroup *cgrp; 933 934 /* 935 * cpuctx->cgrp is set when the first cgroup event enabled, 936 * and is cleared when the last cgroup event disabled. 
937 */ 938 if (READ_ONCE(cpuctx->cgrp) == NULL) 939 return; 940 941 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); 942 943 cgrp = perf_cgroup_from_task(task, NULL); 944 if (READ_ONCE(cpuctx->cgrp) == cgrp) 945 return; 946 947 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 948 perf_ctx_disable(&cpuctx->ctx, true); 949 950 ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); 951 /* 952 * must not be done before ctxswout due 953 * to update_cgrp_time_from_cpuctx() in 954 * ctx_sched_out() 955 */ 956 cpuctx->cgrp = cgrp; 957 /* 958 * set cgrp before ctxsw in to allow 959 * perf_cgroup_set_timestamp() in ctx_sched_in() 960 * to not have to pass task around 961 */ 962 ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); 963 964 perf_ctx_enable(&cpuctx->ctx, true); 965 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 966 } 967 968 static int perf_cgroup_ensure_storage(struct perf_event *event, 969 struct cgroup_subsys_state *css) 970 { 971 struct perf_cpu_context *cpuctx; 972 struct perf_event **storage; 973 int cpu, heap_size, ret = 0; 974 975 /* 976 * Allow storage to have sufficient space for an iterator for each 977 * possibly nested cgroup plus an iterator for events with no cgroup. 978 */ 979 for (heap_size = 1; css; css = css->parent) 980 heap_size++; 981 982 for_each_possible_cpu(cpu) { 983 cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); 984 if (heap_size <= cpuctx->heap_size) 985 continue; 986 987 storage = kmalloc_node(heap_size * sizeof(struct perf_event *), 988 GFP_KERNEL, cpu_to_node(cpu)); 989 if (!storage) { 990 ret = -ENOMEM; 991 break; 992 } 993 994 raw_spin_lock_irq(&cpuctx->ctx.lock); 995 if (cpuctx->heap_size < heap_size) { 996 swap(cpuctx->heap, storage); 997 if (storage == cpuctx->heap_default) 998 storage = NULL; 999 cpuctx->heap_size = heap_size; 1000 } 1001 raw_spin_unlock_irq(&cpuctx->ctx.lock); 1002 1003 kfree(storage); 1004 } 1005 1006 return ret; 1007 } 1008 1009 static inline int perf_cgroup_connect(int fd, struct perf_event *event, 1010 struct perf_event_attr *attr, 1011 struct perf_event *group_leader) 1012 { 1013 struct perf_cgroup *cgrp; 1014 struct cgroup_subsys_state *css; 1015 CLASS(fd, f)(fd); 1016 int ret = 0; 1017 1018 if (fd_empty(f)) 1019 return -EBADF; 1020 1021 css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, 1022 &perf_event_cgrp_subsys); 1023 if (IS_ERR(css)) 1024 return PTR_ERR(css); 1025 1026 ret = perf_cgroup_ensure_storage(event, css); 1027 if (ret) 1028 return ret; 1029 1030 cgrp = container_of(css, struct perf_cgroup, css); 1031 event->cgrp = cgrp; 1032 1033 /* 1034 * all events in a group must monitor 1035 * the same cgroup because a task belongs 1036 * to only one perf cgroup at a time 1037 */ 1038 if (group_leader && group_leader->cgrp != cgrp) { 1039 perf_detach_cgroup(event); 1040 ret = -EINVAL; 1041 } 1042 return ret; 1043 } 1044 1045 static inline void 1046 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) 1047 { 1048 struct perf_cpu_context *cpuctx; 1049 1050 if (!is_cgroup_event(event)) 1051 return; 1052 1053 event->pmu_ctx->nr_cgroups++; 1054 1055 /* 1056 * Because cgroup events are always per-cpu events, 1057 * @ctx == &cpuctx->ctx. 
1058 */ 1059 cpuctx = container_of(ctx, struct perf_cpu_context, ctx); 1060 1061 if (ctx->nr_cgroups++) 1062 return; 1063 1064 cpuctx->cgrp = perf_cgroup_from_task(current, ctx); 1065 } 1066 1067 static inline void 1068 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) 1069 { 1070 struct perf_cpu_context *cpuctx; 1071 1072 if (!is_cgroup_event(event)) 1073 return; 1074 1075 event->pmu_ctx->nr_cgroups--; 1076 1077 /* 1078 * Because cgroup events are always per-cpu events, 1079 * @ctx == &cpuctx->ctx. 1080 */ 1081 cpuctx = container_of(ctx, struct perf_cpu_context, ctx); 1082 1083 if (--ctx->nr_cgroups) 1084 return; 1085 1086 cpuctx->cgrp = NULL; 1087 } 1088 1089 #else /* !CONFIG_CGROUP_PERF */ 1090 1091 static inline bool 1092 perf_cgroup_match(struct perf_event *event) 1093 { 1094 return true; 1095 } 1096 1097 static inline void perf_detach_cgroup(struct perf_event *event) 1098 {} 1099 1100 static inline int is_cgroup_event(struct perf_event *event) 1101 { 1102 return 0; 1103 } 1104 1105 static inline void update_cgrp_time_from_event(struct perf_event *event) 1106 { 1107 } 1108 1109 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, 1110 bool final) 1111 { 1112 } 1113 1114 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, 1115 struct perf_event_attr *attr, 1116 struct perf_event *group_leader) 1117 { 1118 return -EINVAL; 1119 } 1120 1121 static inline void 1122 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 1123 { 1124 } 1125 1126 static inline u64 perf_cgroup_event_time(struct perf_event *event) 1127 { 1128 return 0; 1129 } 1130 1131 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) 1132 { 1133 return 0; 1134 } 1135 1136 static inline void 1137 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) 1138 { 1139 } 1140 1141 static inline void 1142 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) 1143 { 1144 } 1145 1146 static void perf_cgroup_switch(struct task_struct *task) 1147 { 1148 } 1149 #endif 1150 1151 /* 1152 * set default to be dependent on timer tick just 1153 * like original code 1154 */ 1155 #define PERF_CPU_HRTIMER (1000 / HZ) 1156 /* 1157 * function must be called with interrupts disabled 1158 */ 1159 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) 1160 { 1161 struct perf_cpu_pmu_context *cpc; 1162 bool rotations; 1163 1164 lockdep_assert_irqs_disabled(); 1165 1166 cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); 1167 rotations = perf_rotate_context(cpc); 1168 1169 raw_spin_lock(&cpc->hrtimer_lock); 1170 if (rotations) 1171 hrtimer_forward_now(hr, cpc->hrtimer_interval); 1172 else 1173 cpc->hrtimer_active = 0; 1174 raw_spin_unlock(&cpc->hrtimer_lock); 1175 1176 return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; 1177 } 1178 1179 static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) 1180 { 1181 struct hrtimer *timer = &cpc->hrtimer; 1182 struct pmu *pmu = cpc->epc.pmu; 1183 u64 interval; 1184 1185 /* 1186 * check default is sane, if not set then force to 1187 * default interval (1/tick) 1188 */ 1189 interval = pmu->hrtimer_interval_ms; 1190 if (interval < 1) 1191 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; 1192 1193 cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); 1194 1195 raw_spin_lock_init(&cpc->hrtimer_lock); 1196 hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, 1197 HRTIMER_MODE_ABS_PINNED_HARD); 1198 } 1199 1200 static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) 1201 { 1202 struct hrtimer *timer = &cpc->hrtimer; 1203 unsigned long flags; 1204 1205 raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); 1206 if (!cpc->hrtimer_active) { 1207 cpc->hrtimer_active = 1; 1208 hrtimer_forward_now(timer, cpc->hrtimer_interval); 1209 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); 1210 } 1211 raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); 1212 1213 return 0; 1214 } 1215 1216 static int perf_mux_hrtimer_restart_ipi(void *arg) 1217 { 1218 return perf_mux_hrtimer_restart(arg); 1219 } 1220 1221 static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) 1222 { 1223 return *this_cpu_ptr(pmu->cpu_pmu_context); 1224 } 1225 1226 void perf_pmu_disable(struct pmu *pmu) 1227 { 1228 int *count = &this_cpc(pmu)->pmu_disable_count; 1229 if (!(*count)++) 1230 pmu->pmu_disable(pmu); 1231 } 1232 1233 void perf_pmu_enable(struct pmu *pmu) 1234 { 1235 int *count = &this_cpc(pmu)->pmu_disable_count; 1236 if (!--(*count)) 1237 pmu->pmu_enable(pmu); 1238 } 1239 1240 static void perf_assert_pmu_disabled(struct pmu *pmu) 1241 { 1242 int *count = &this_cpc(pmu)->pmu_disable_count; 1243 WARN_ON_ONCE(*count == 0); 1244 } 1245 1246 static inline void perf_pmu_read(struct perf_event *event) 1247 { 1248 if (event->state == PERF_EVENT_STATE_ACTIVE) 1249 event->pmu->read(event); 1250 } 1251 1252 static void get_ctx(struct perf_event_context *ctx) 1253 { 1254 refcount_inc(&ctx->refcount); 1255 } 1256 1257 static void free_ctx(struct rcu_head *head) 1258 { 1259 struct perf_event_context *ctx; 1260 1261 ctx = container_of(head, struct perf_event_context, rcu_head); 1262 kfree(ctx); 1263 } 1264 1265 static void put_ctx(struct perf_event_context *ctx) 1266 { 1267 if (refcount_dec_and_test(&ctx->refcount)) { 1268 if (ctx->parent_ctx) 1269 put_ctx(ctx->parent_ctx); 1270 if (ctx->task && ctx->task != TASK_TOMBSTONE) 1271 put_task_struct(ctx->task); 1272 call_rcu(&ctx->rcu_head, free_ctx); 1273 } 1274 } 1275 1276 /* 1277 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and 1278 * perf_pmu_migrate_context() we need some magic. 1279 * 1280 * Those places that change perf_event::ctx will hold both 1281 * perf_event_ctx::mutex of the 'old' and 'new' ctx value. 1282 * 1283 * Lock ordering is by mutex address. 
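 * (i.e. the two mutexes are always taken in a fixed order derived from their
 * addresses, so the order does not depend on which context is the 'old' one
 * and which the 'new'.)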
There are two other sites where 1284 * perf_event_context::mutex nests and those are: 1285 * 1286 * - perf_event_exit_task_context() [ child , 0 ] 1287 * perf_event_exit_event() 1288 * put_event() [ parent, 1 ] 1289 * 1290 * - perf_event_init_context() [ parent, 0 ] 1291 * inherit_task_group() 1292 * inherit_group() 1293 * inherit_event() 1294 * perf_event_alloc() 1295 * perf_init_event() 1296 * perf_try_init_event() [ child , 1 ] 1297 * 1298 * While it appears there is an obvious deadlock here -- the parent and child 1299 * nesting levels are inverted between the two -- this is in fact safe because 1300 * life-time rules separate them. That is, an exiting task cannot fork, and a 1301 * spawning task cannot (yet) exit. 1302 * 1303 * But remember that these are parent<->child context relations, and 1304 * migration does not affect children, therefore these two orderings should not 1305 * interact. 1306 * 1307 * The change in perf_event::ctx does not affect children (as claimed above) 1308 * because the sys_perf_event_open() case will install a new event and break 1309 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only 1310 * concerned with cpuctx and that doesn't have children. 1311 * 1312 * The places that change perf_event::ctx will issue: 1313 * 1314 * perf_remove_from_context(); 1315 * synchronize_rcu(); 1316 * perf_install_in_context(); 1317 * 1318 * to effect the change. The remove_from_context() + synchronize_rcu() should 1319 * quiesce the event, after which we can install it in the new location. This 1320 * means that only external vectors (perf_fops, prctl) can perturb the event 1321 * while in transit. Therefore all such accessors should also acquire 1322 * perf_event_context::mutex to serialize against this. 1323 * 1324 * However, because event->ctx can change while we're waiting to acquire 1325 * ctx->mutex we must be careful and use the below perf_event_ctx_lock() 1326 * function. 1327 * 1328 * Lock order: 1329 * exec_update_lock 1330 * task_struct::perf_event_mutex 1331 * perf_event_context::mutex 1332 * perf_event::child_mutex; 1333 * perf_event_context::lock 1334 * mmap_lock 1335 * perf_event::mmap_mutex 1336 * perf_buffer::aux_mutex 1337 * perf_addr_filters_head::lock 1338 * 1339 * cpu_hotplug_lock 1340 * pmus_lock 1341 * cpuctx->mutex / perf_event_context::mutex 1342 */ 1343 static struct perf_event_context * 1344 perf_event_ctx_lock_nested(struct perf_event *event, int nesting) 1345 { 1346 struct perf_event_context *ctx; 1347 1348 again: 1349 rcu_read_lock(); 1350 ctx = READ_ONCE(event->ctx); 1351 if (!refcount_inc_not_zero(&ctx->refcount)) { 1352 rcu_read_unlock(); 1353 goto again; 1354 } 1355 rcu_read_unlock(); 1356 1357 mutex_lock_nested(&ctx->mutex, nesting); 1358 if (event->ctx != ctx) { 1359 mutex_unlock(&ctx->mutex); 1360 put_ctx(ctx); 1361 goto again; 1362 } 1363 1364 return ctx; 1365 } 1366 1367 static inline struct perf_event_context * 1368 perf_event_ctx_lock(struct perf_event *event) 1369 { 1370 return perf_event_ctx_lock_nested(event, 0); 1371 } 1372 1373 static void perf_event_ctx_unlock(struct perf_event *event, 1374 struct perf_event_context *ctx) 1375 { 1376 mutex_unlock(&ctx->mutex); 1377 put_ctx(ctx); 1378 } 1379 1380 /* 1381 * This must be done under the ctx->lock, such as to serialize against 1382 * context_equiv(), therefore we cannot call put_ctx() since that might end up 1383 * calling scheduler related locks and ctx->lock nests inside those.
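 * Hence unclone_ctx() below only detaches the parent and hands it back; the
 * caller is expected to drop ctx->lock before calling put_ctx() on the
 * returned parent context.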
1384 */ 1385 static __must_check struct perf_event_context * 1386 unclone_ctx(struct perf_event_context *ctx) 1387 { 1388 struct perf_event_context *parent_ctx = ctx->parent_ctx; 1389 1390 lockdep_assert_held(&ctx->lock); 1391 1392 if (parent_ctx) 1393 ctx->parent_ctx = NULL; 1394 ctx->generation++; 1395 1396 return parent_ctx; 1397 } 1398 1399 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, 1400 enum pid_type type) 1401 { 1402 u32 nr; 1403 /* 1404 * only top level events have the pid namespace they were created in 1405 */ 1406 if (event->parent) 1407 event = event->parent; 1408 1409 nr = __task_pid_nr_ns(p, type, event->ns); 1410 /* avoid -1 if it is idle thread or runs in another ns */ 1411 if (!nr && !pid_alive(p)) 1412 nr = -1; 1413 return nr; 1414 } 1415 1416 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 1417 { 1418 return perf_event_pid_type(event, p, PIDTYPE_TGID); 1419 } 1420 1421 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) 1422 { 1423 return perf_event_pid_type(event, p, PIDTYPE_PID); 1424 } 1425 1426 /* 1427 * If we inherit events we want to return the parent event id 1428 * to userspace. 1429 */ 1430 static u64 primary_event_id(struct perf_event *event) 1431 { 1432 u64 id = event->id; 1433 1434 if (event->parent) 1435 id = event->parent->id; 1436 1437 return id; 1438 } 1439 1440 /* 1441 * Get the perf_event_context for a task and lock it. 1442 * 1443 * This has to cope with the fact that until it is locked, 1444 * the context could get moved to another task. 1445 */ 1446 static struct perf_event_context * 1447 perf_lock_task_context(struct task_struct *task, unsigned long *flags) 1448 { 1449 struct perf_event_context *ctx; 1450 1451 retry: 1452 /* 1453 * One of the few rules of preemptible RCU is that one cannot do 1454 * rcu_read_unlock() while holding a scheduler (or nested) lock when 1455 * part of the read side critical section was irqs-enabled -- see 1456 * rcu_read_unlock_special(). 1457 * 1458 * Since ctx->lock nests under rq->lock we must ensure the entire read 1459 * side critical section has interrupts disabled. 1460 */ 1461 local_irq_save(*flags); 1462 rcu_read_lock(); 1463 ctx = rcu_dereference(task->perf_event_ctxp); 1464 if (ctx) { 1465 /* 1466 * If this context is a clone of another, it might 1467 * get swapped for another underneath us by 1468 * perf_event_task_sched_out, though the 1469 * rcu_read_lock() protects us from any context 1470 * getting freed. Lock the context and check if it 1471 * got swapped before we could get the lock, and retry 1472 * if so. If we locked the right context, then it 1473 * can't get swapped on us any more. 1474 */ 1475 raw_spin_lock(&ctx->lock); 1476 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 1477 raw_spin_unlock(&ctx->lock); 1478 rcu_read_unlock(); 1479 local_irq_restore(*flags); 1480 goto retry; 1481 } 1482 1483 if (ctx->task == TASK_TOMBSTONE || 1484 !refcount_inc_not_zero(&ctx->refcount)) { 1485 raw_spin_unlock(&ctx->lock); 1486 ctx = NULL; 1487 } else { 1488 WARN_ON_ONCE(ctx->task != task); 1489 } 1490 } 1491 rcu_read_unlock(); 1492 if (!ctx) 1493 local_irq_restore(*flags); 1494 return ctx; 1495 } 1496 1497 /* 1498 * Get the context for a task and increment its pin_count so it 1499 * can't get swapped to another task. This also increments its 1500 * reference count so that the context can't get freed. 
1501 */ 1502 static struct perf_event_context * 1503 perf_pin_task_context(struct task_struct *task) 1504 { 1505 struct perf_event_context *ctx; 1506 unsigned long flags; 1507 1508 ctx = perf_lock_task_context(task, &flags); 1509 if (ctx) { 1510 ++ctx->pin_count; 1511 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1512 } 1513 return ctx; 1514 } 1515 1516 static void perf_unpin_context(struct perf_event_context *ctx) 1517 { 1518 unsigned long flags; 1519 1520 raw_spin_lock_irqsave(&ctx->lock, flags); 1521 --ctx->pin_count; 1522 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1523 } 1524 1525 /* 1526 * Update the record of the current time in a context. 1527 */ 1528 static void __update_context_time(struct perf_event_context *ctx, bool adv) 1529 { 1530 u64 now = perf_clock(); 1531 1532 lockdep_assert_held(&ctx->lock); 1533 1534 if (adv) 1535 ctx->time += now - ctx->timestamp; 1536 ctx->timestamp = now; 1537 1538 /* 1539 * The above: time' = time + (now - timestamp), can be re-arranged 1540 * into: time` = now + (time - timestamp), which gives a single value 1541 * offset to compute future time without locks on. 1542 * 1543 * See perf_event_time_now(), which can be used from NMI context where 1544 * it's (obviously) not possible to acquire ctx->lock in order to read 1545 * both the above values in a consistent manner. 1546 */ 1547 WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); 1548 } 1549 1550 static void update_context_time(struct perf_event_context *ctx) 1551 { 1552 __update_context_time(ctx, true); 1553 } 1554 1555 static u64 perf_event_time(struct perf_event *event) 1556 { 1557 struct perf_event_context *ctx = event->ctx; 1558 1559 if (unlikely(!ctx)) 1560 return 0; 1561 1562 if (is_cgroup_event(event)) 1563 return perf_cgroup_event_time(event); 1564 1565 return ctx->time; 1566 } 1567 1568 static u64 perf_event_time_now(struct perf_event *event, u64 now) 1569 { 1570 struct perf_event_context *ctx = event->ctx; 1571 1572 if (unlikely(!ctx)) 1573 return 0; 1574 1575 if (is_cgroup_event(event)) 1576 return perf_cgroup_event_time_now(event, now); 1577 1578 if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) 1579 return ctx->time; 1580 1581 now += READ_ONCE(ctx->timeoffset); 1582 return now; 1583 } 1584 1585 static enum event_type_t get_event_type(struct perf_event *event) 1586 { 1587 struct perf_event_context *ctx = event->ctx; 1588 enum event_type_t event_type; 1589 1590 lockdep_assert_held(&ctx->lock); 1591 1592 /* 1593 * It's 'group type', really, because if our group leader is 1594 * pinned, so are we. 1595 */ 1596 if (event->group_leader != event) 1597 event = event->group_leader; 1598 1599 event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; 1600 if (!ctx->task) 1601 event_type |= EVENT_CPU; 1602 1603 return event_type; 1604 } 1605 1606 /* 1607 * Helper function to initialize event group nodes. 1608 */ 1609 static void init_event_group(struct perf_event *event) 1610 { 1611 RB_CLEAR_NODE(&event->group_node); 1612 event->group_index = 0; 1613 } 1614 1615 /* 1616 * Extract pinned or flexible groups from the context 1617 * based on event attrs bits. 1618 */ 1619 static struct perf_event_groups * 1620 get_event_groups(struct perf_event *event, struct perf_event_context *ctx) 1621 { 1622 if (event->attr.pinned) 1623 return &ctx->pinned_groups; 1624 else 1625 return &ctx->flexible_groups; 1626 } 1627 1628 /* 1629 * Helper function to initializes perf_event_group trees. 
1630 */ 1631 static void perf_event_groups_init(struct perf_event_groups *groups) 1632 { 1633 groups->tree = RB_ROOT; 1634 groups->index = 0; 1635 } 1636 1637 static inline struct cgroup *event_cgroup(const struct perf_event *event) 1638 { 1639 struct cgroup *cgroup = NULL; 1640 1641 #ifdef CONFIG_CGROUP_PERF 1642 if (event->cgrp) 1643 cgroup = event->cgrp->css.cgroup; 1644 #endif 1645 1646 return cgroup; 1647 } 1648 1649 /* 1650 * Compare function for event groups; 1651 * 1652 * Implements complex key that first sorts by CPU and then by virtual index 1653 * which provides ordering when rotating groups for the same CPU. 1654 */ 1655 static __always_inline int 1656 perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, 1657 const struct cgroup *left_cgroup, const u64 left_group_index, 1658 const struct perf_event *right) 1659 { 1660 if (left_cpu < right->cpu) 1661 return -1; 1662 if (left_cpu > right->cpu) 1663 return 1; 1664 1665 if (left_pmu) { 1666 if (left_pmu < right->pmu_ctx->pmu) 1667 return -1; 1668 if (left_pmu > right->pmu_ctx->pmu) 1669 return 1; 1670 } 1671 1672 #ifdef CONFIG_CGROUP_PERF 1673 { 1674 const struct cgroup *right_cgroup = event_cgroup(right); 1675 1676 if (left_cgroup != right_cgroup) { 1677 if (!left_cgroup) { 1678 /* 1679 * Left has no cgroup but right does, no 1680 * cgroups come first. 1681 */ 1682 return -1; 1683 } 1684 if (!right_cgroup) { 1685 /* 1686 * Right has no cgroup but left does, no 1687 * cgroups come first. 1688 */ 1689 return 1; 1690 } 1691 /* Two dissimilar cgroups, order by id. */ 1692 if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) 1693 return -1; 1694 1695 return 1; 1696 } 1697 } 1698 #endif 1699 1700 if (left_group_index < right->group_index) 1701 return -1; 1702 if (left_group_index > right->group_index) 1703 return 1; 1704 1705 return 0; 1706 } 1707 1708 #define __node_2_pe(node) \ 1709 rb_entry((node), struct perf_event, group_node) 1710 1711 static inline bool __group_less(struct rb_node *a, const struct rb_node *b) 1712 { 1713 struct perf_event *e = __node_2_pe(a); 1714 return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), 1715 e->group_index, __node_2_pe(b)) < 0; 1716 } 1717 1718 struct __group_key { 1719 int cpu; 1720 struct pmu *pmu; 1721 struct cgroup *cgroup; 1722 }; 1723 1724 static inline int __group_cmp(const void *key, const struct rb_node *node) 1725 { 1726 const struct __group_key *a = key; 1727 const struct perf_event *b = __node_2_pe(node); 1728 1729 /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ 1730 return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); 1731 } 1732 1733 static inline int 1734 __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) 1735 { 1736 const struct __group_key *a = key; 1737 const struct perf_event *b = __node_2_pe(node); 1738 1739 /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ 1740 return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), 1741 b->group_index, b); 1742 } 1743 1744 /* 1745 * Insert @event into @groups' tree; using 1746 * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} 1747 * as key. This places it last inside the {cpu,pmu,cgroup} subtree. 
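 * (each insertion takes a fresh ++index, so events within one
 * {cpu,pmu,cgroup} subtree remain ordered by insertion time)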
1748 */ 1749 static void 1750 perf_event_groups_insert(struct perf_event_groups *groups, 1751 struct perf_event *event) 1752 { 1753 event->group_index = ++groups->index; 1754 1755 rb_add(&event->group_node, &groups->tree, __group_less); 1756 } 1757 1758 /* 1759 * Helper function to insert event into the pinned or flexible groups. 1760 */ 1761 static void 1762 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) 1763 { 1764 struct perf_event_groups *groups; 1765 1766 groups = get_event_groups(event, ctx); 1767 perf_event_groups_insert(groups, event); 1768 } 1769 1770 /* 1771 * Delete a group from a tree. 1772 */ 1773 static void 1774 perf_event_groups_delete(struct perf_event_groups *groups, 1775 struct perf_event *event) 1776 { 1777 WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || 1778 RB_EMPTY_ROOT(&groups->tree)); 1779 1780 rb_erase(&event->group_node, &groups->tree); 1781 init_event_group(event); 1782 } 1783 1784 /* 1785 * Helper function to delete event from its groups. 1786 */ 1787 static void 1788 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) 1789 { 1790 struct perf_event_groups *groups; 1791 1792 groups = get_event_groups(event, ctx); 1793 perf_event_groups_delete(groups, event); 1794 } 1795 1796 /* 1797 * Get the leftmost event in the {cpu,pmu,cgroup} subtree. 1798 */ 1799 static struct perf_event * 1800 perf_event_groups_first(struct perf_event_groups *groups, int cpu, 1801 struct pmu *pmu, struct cgroup *cgrp) 1802 { 1803 struct __group_key key = { 1804 .cpu = cpu, 1805 .pmu = pmu, 1806 .cgroup = cgrp, 1807 }; 1808 struct rb_node *node; 1809 1810 node = rb_find_first(&key, &groups->tree, __group_cmp); 1811 if (node) 1812 return __node_2_pe(node); 1813 1814 return NULL; 1815 } 1816 1817 static struct perf_event * 1818 perf_event_groups_next(struct perf_event *event, struct pmu *pmu) 1819 { 1820 struct __group_key key = { 1821 .cpu = event->cpu, 1822 .pmu = pmu, 1823 .cgroup = event_cgroup(event), 1824 }; 1825 struct rb_node *next; 1826 1827 next = rb_next_match(&key, &event->group_node, __group_cmp); 1828 if (next) 1829 return __node_2_pe(next); 1830 1831 return NULL; 1832 } 1833 1834 #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ 1835 for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ 1836 event; event = perf_event_groups_next(event, pmu)) 1837 1838 /* 1839 * Iterate through the whole groups tree. 1840 */ 1841 #define perf_event_groups_for_each(event, groups) \ 1842 for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ 1843 typeof(*event), group_node); event; \ 1844 event = rb_entry_safe(rb_next(&event->group_node), \ 1845 typeof(*event), group_node)) 1846 1847 /* 1848 * Does the event attribute request inherit with PERF_SAMPLE_READ 1849 */ 1850 static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) 1851 { 1852 return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); 1853 } 1854 1855 /* 1856 * Add an event from the lists for its context. 1857 * Must be called with ctx->mutex and ctx->lock held. 
1858 */ 1859 static void 1860 list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1861 { 1862 lockdep_assert_held(&ctx->lock); 1863 1864 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); 1865 event->attach_state |= PERF_ATTACH_CONTEXT; 1866 1867 event->tstamp = perf_event_time(event); 1868 1869 /* 1870 * If we're a stand alone event or group leader, we go to the context 1871 * list, group events are kept attached to the group so that 1872 * perf_group_detach can, at all times, locate all siblings. 1873 */ 1874 if (event->group_leader == event) { 1875 event->group_caps = event->event_caps; 1876 add_event_to_groups(event, ctx); 1877 } 1878 1879 list_add_rcu(&event->event_entry, &ctx->event_list); 1880 ctx->nr_events++; 1881 if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) 1882 ctx->nr_user++; 1883 if (event->attr.inherit_stat) 1884 ctx->nr_stat++; 1885 if (has_inherit_and_sample_read(&event->attr)) 1886 local_inc(&ctx->nr_no_switch_fast); 1887 1888 if (event->state > PERF_EVENT_STATE_OFF) 1889 perf_cgroup_event_enable(event, ctx); 1890 1891 ctx->generation++; 1892 event->pmu_ctx->nr_events++; 1893 } 1894 1895 /* 1896 * Initialize event state based on the perf_event_attr::disabled. 1897 */ 1898 static inline void perf_event__state_init(struct perf_event *event) 1899 { 1900 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : 1901 PERF_EVENT_STATE_INACTIVE; 1902 } 1903 1904 static int __perf_event_read_size(u64 read_format, int nr_siblings) 1905 { 1906 int entry = sizeof(u64); /* value */ 1907 int size = 0; 1908 int nr = 1; 1909 1910 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1911 size += sizeof(u64); 1912 1913 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 1914 size += sizeof(u64); 1915 1916 if (read_format & PERF_FORMAT_ID) 1917 entry += sizeof(u64); 1918 1919 if (read_format & PERF_FORMAT_LOST) 1920 entry += sizeof(u64); 1921 1922 if (read_format & PERF_FORMAT_GROUP) { 1923 nr += nr_siblings; 1924 size += sizeof(u64); 1925 } 1926 1927 /* 1928 * Since perf_event_validate_size() limits this to 16k and inhibits 1929 * adding more siblings, this will never overflow. 1930 */ 1931 return size + nr * entry; 1932 } 1933 1934 static void __perf_event_header_size(struct perf_event *event, u64 sample_type) 1935 { 1936 struct perf_sample_data *data; 1937 u16 size = 0; 1938 1939 if (sample_type & PERF_SAMPLE_IP) 1940 size += sizeof(data->ip); 1941 1942 if (sample_type & PERF_SAMPLE_ADDR) 1943 size += sizeof(data->addr); 1944 1945 if (sample_type & PERF_SAMPLE_PERIOD) 1946 size += sizeof(data->period); 1947 1948 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) 1949 size += sizeof(data->weight.full); 1950 1951 if (sample_type & PERF_SAMPLE_READ) 1952 size += event->read_size; 1953 1954 if (sample_type & PERF_SAMPLE_DATA_SRC) 1955 size += sizeof(data->data_src.val); 1956 1957 if (sample_type & PERF_SAMPLE_TRANSACTION) 1958 size += sizeof(data->txn); 1959 1960 if (sample_type & PERF_SAMPLE_PHYS_ADDR) 1961 size += sizeof(data->phys_addr); 1962 1963 if (sample_type & PERF_SAMPLE_CGROUP) 1964 size += sizeof(data->cgroup); 1965 1966 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) 1967 size += sizeof(data->data_page_size); 1968 1969 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) 1970 size += sizeof(data->code_page_size); 1971 1972 event->header_size = size; 1973 } 1974 1975 /* 1976 * Called at perf_event creation and when events are attached/detached from a 1977 * group. 
1978 */ 1979 static void perf_event__header_size(struct perf_event *event) 1980 { 1981 event->read_size = 1982 __perf_event_read_size(event->attr.read_format, 1983 event->group_leader->nr_siblings); 1984 __perf_event_header_size(event, event->attr.sample_type); 1985 } 1986 1987 static void perf_event__id_header_size(struct perf_event *event) 1988 { 1989 struct perf_sample_data *data; 1990 u64 sample_type = event->attr.sample_type; 1991 u16 size = 0; 1992 1993 if (sample_type & PERF_SAMPLE_TID) 1994 size += sizeof(data->tid_entry); 1995 1996 if (sample_type & PERF_SAMPLE_TIME) 1997 size += sizeof(data->time); 1998 1999 if (sample_type & PERF_SAMPLE_IDENTIFIER) 2000 size += sizeof(data->id); 2001 2002 if (sample_type & PERF_SAMPLE_ID) 2003 size += sizeof(data->id); 2004 2005 if (sample_type & PERF_SAMPLE_STREAM_ID) 2006 size += sizeof(data->stream_id); 2007 2008 if (sample_type & PERF_SAMPLE_CPU) 2009 size += sizeof(data->cpu_entry); 2010 2011 event->id_header_size = size; 2012 } 2013 2014 /* 2015 * Check that adding an event to the group does not result in anybody 2016 * overflowing the 64k event limit imposed by the output buffer. 2017 * 2018 * Specifically, check that the read_size for the event does not exceed 16k, 2019 * read_size being the one term that grows with groups size. Since read_size 2020 * depends on per-event read_format, also (re)check the existing events. 2021 * 2022 * This leaves 48k for the constant size fields and things like callchains, 2023 * branch stacks and register sets. 2024 */ 2025 static bool perf_event_validate_size(struct perf_event *event) 2026 { 2027 struct perf_event *sibling, *group_leader = event->group_leader; 2028 2029 if (__perf_event_read_size(event->attr.read_format, 2030 group_leader->nr_siblings + 1) > 16*1024) 2031 return false; 2032 2033 if (__perf_event_read_size(group_leader->attr.read_format, 2034 group_leader->nr_siblings + 1) > 16*1024) 2035 return false; 2036 2037 /* 2038 * When creating a new group leader, group_leader->ctx is initialized 2039 * after the size has been validated, but we cannot safely use 2040 * for_each_sibling_event() until group_leader->ctx is set. A new group 2041 * leader cannot have any siblings yet, so we can safely skip checking 2042 * the non-existent siblings. 2043 */ 2044 if (event == group_leader) 2045 return true; 2046 2047 for_each_sibling_event(sibling, group_leader) { 2048 if (__perf_event_read_size(sibling->attr.read_format, 2049 group_leader->nr_siblings + 1) > 16*1024) 2050 return false; 2051 } 2052 2053 return true; 2054 } 2055 2056 static void perf_group_attach(struct perf_event *event) 2057 { 2058 struct perf_event *group_leader = event->group_leader, *pos; 2059 2060 lockdep_assert_held(&event->ctx->lock); 2061 2062 /* 2063 * We can have double attach due to group movement (move_group) in 2064 * perf_event_open(). 2065 */ 2066 if (event->attach_state & PERF_ATTACH_GROUP) 2067 return; 2068 2069 event->attach_state |= PERF_ATTACH_GROUP; 2070 2071 if (group_leader == event) 2072 return; 2073 2074 WARN_ON_ONCE(group_leader->ctx != event->ctx); 2075 2076 group_leader->group_caps &= event->event_caps; 2077 2078 list_add_tail(&event->sibling_list, &group_leader->sibling_list); 2079 group_leader->nr_siblings++; 2080 group_leader->group_generation++; 2081 2082 perf_event__header_size(group_leader); 2083 2084 for_each_sibling_event(pos, group_leader) 2085 perf_event__header_size(pos); 2086 } 2087 2088 /* 2089 * Remove an event from the lists for its context. 
2090 * Must be called with ctx->mutex and ctx->lock held.
2091 */
2092 static void
2093 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
2094 {
2095 WARN_ON_ONCE(event->ctx != ctx);
2096 lockdep_assert_held(&ctx->lock);
2097
2098 /*
2099 * We can have double detach due to exit/hot-unplug + close.
2100 */
2101 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2102 return;
2103
2104 event->attach_state &= ~PERF_ATTACH_CONTEXT;
2105
2106 ctx->nr_events--;
2107 if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
2108 ctx->nr_user--;
2109 if (event->attr.inherit_stat)
2110 ctx->nr_stat--;
2111 if (has_inherit_and_sample_read(&event->attr))
2112 local_dec(&ctx->nr_no_switch_fast);
2113
2114 list_del_rcu(&event->event_entry);
2115
2116 if (event->group_leader == event)
2117 del_event_from_groups(event, ctx);
2118
2119 /*
2120 * If event was in error state, then keep it
2121 * that way, otherwise bogus counts will be
2122 * returned on read(). The only way to get out
2123 * of error state is by explicit re-enabling
2124 * of the event.
2125 */
2126 if (event->state > PERF_EVENT_STATE_OFF) {
2127 perf_cgroup_event_disable(event, ctx);
2128 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2129 }
2130
2131 ctx->generation++;
2132 event->pmu_ctx->nr_events--;
2133 }
2134
2135 static int
2136 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2137 {
2138 if (!has_aux(aux_event))
2139 return 0;
2140
2141 if (!event->pmu->aux_output_match)
2142 return 0;
2143
2144 return event->pmu->aux_output_match(aux_event);
2145 }
2146
2147 static void put_event(struct perf_event *event);
2148 static void event_sched_out(struct perf_event *event,
2149 struct perf_event_context *ctx);
2150
2151 static void perf_put_aux_event(struct perf_event *event)
2152 {
2153 struct perf_event_context *ctx = event->ctx;
2154 struct perf_event *iter;
2155
2156 /*
2157 * If event uses aux_event, tear down the link.
2158 */
2159 if (event->aux_event) {
2160 iter = event->aux_event;
2161 event->aux_event = NULL;
2162 put_event(iter);
2163 return;
2164 }
2165
2166 /*
2167 * If the event is an aux_event, tear down all links to
2168 * it from other events.
2169 */
2170 for_each_sibling_event(iter, event->group_leader) {
2171 if (iter->aux_event != event)
2172 continue;
2173
2174 iter->aux_event = NULL;
2175 put_event(event);
2176
2177 /*
2178 * If it's ACTIVE, schedule it out and put it into ERROR
2179 * state so that we don't try to schedule it again. Note
2180 * that perf_event_enable() will clear the ERROR status.
2181 */
2182 event_sched_out(iter, ctx);
2183 perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
2184 }
2185 }
2186
2187 static bool perf_need_aux_event(struct perf_event *event)
2188 {
2189 return event->attr.aux_output || has_aux_action(event);
2190 }
2191
2192 static int perf_get_aux_event(struct perf_event *event,
2193 struct perf_event *group_leader)
2194 {
2195 /*
2196 * Our group leader must be an aux event if we want to be
2197 * an aux_output. This way, the aux event will precede its
2198 * aux_output events in the group, and therefore will always
2199 * schedule first.
2200 */
2201 if (!group_leader)
2202 return 0;
2203
2204 /*
2205 * aux_output and aux_sample_size are mutually exclusive.
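 * (e.g., a hypothetical sibling of an AUX-capable group leader can either
 *  redirect its output into the leader's AUX buffer via attr.aux_output, or
 *  snapshot up to attr.aux_sample_size bytes of AUX data into its own
 *  samples, but not both; the check below rejects the combination.)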
2206 */
2207 if (event->attr.aux_output && event->attr.aux_sample_size)
2208 return 0;
2209
2210 if (event->attr.aux_output &&
2211 !perf_aux_output_match(event, group_leader))
2212 return 0;
2213
2214 if ((event->attr.aux_pause || event->attr.aux_resume) &&
2215 !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
2216 return 0;
2217
2218 if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2219 return 0;
2220
2221 if (!atomic_long_inc_not_zero(&group_leader->refcount))
2222 return 0;
2223
2224 /*
2225 * Link aux_outputs to their aux event; this is undone in
2226 * perf_group_detach() by perf_put_aux_event(). When the
2227 * group is torn down, the aux_output events lose their
2228 * link to the aux_event and can't schedule any more.
2229 */
2230 event->aux_event = group_leader;
2231
2232 return 1;
2233 }
2234
2235 static inline struct list_head *get_event_list(struct perf_event *event)
2236 {
2237 return event->attr.pinned ? &event->pmu_ctx->pinned_active :
2238 &event->pmu_ctx->flexible_active;
2239 }
2240
2241 /*
2242 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2243 * cannot exist on their own; schedule them out and move them into the ERROR
2244 * state. Also see _perf_event_enable(); it will not be able to recover
2245 * this ERROR state.
2246 */
2247 static inline void perf_remove_sibling_event(struct perf_event *event)
2248 {
2249 event_sched_out(event, event->ctx);
2250 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2251 }
2252
2253 static void perf_group_detach(struct perf_event *event)
2254 {
2255 struct perf_event *leader = event->group_leader;
2256 struct perf_event *sibling, *tmp;
2257 struct perf_event_context *ctx = event->ctx;
2258
2259 lockdep_assert_held(&ctx->lock);
2260
2261 /*
2262 * We can have double detach due to exit/hot-unplug + close.
2263 */
2264 if (!(event->attach_state & PERF_ATTACH_GROUP))
2265 return;
2266
2267 event->attach_state &= ~PERF_ATTACH_GROUP;
2268
2269 perf_put_aux_event(event);
2270
2271 /*
2272 * If this is a sibling, remove it from its group.
2273 */
2274 if (leader != event) {
2275 list_del_init(&event->sibling_list);
2276 event->group_leader->nr_siblings--;
2277 event->group_leader->group_generation++;
2278 goto out;
2279 }
2280
2281 /*
2282 * If this was a group event with sibling events then
2283 * upgrade the siblings to singleton events by adding them
2284 * to whatever list we are on.
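 *
 * As an illustrative example: for a hypothetical leader L with siblings A
 * and B, detaching L makes A and B leaders of their own singleton groups;
 * each is re-inserted into the context's group trees below and, if still
 * ACTIVE, put back on the matching pinned/flexible active list.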
2285 */ 2286 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { 2287 2288 if (sibling->event_caps & PERF_EV_CAP_SIBLING) 2289 perf_remove_sibling_event(sibling); 2290 2291 sibling->group_leader = sibling; 2292 list_del_init(&sibling->sibling_list); 2293 2294 /* Inherit group flags from the previous leader */ 2295 sibling->group_caps = event->group_caps; 2296 2297 if (sibling->attach_state & PERF_ATTACH_CONTEXT) { 2298 add_event_to_groups(sibling, event->ctx); 2299 2300 if (sibling->state == PERF_EVENT_STATE_ACTIVE) 2301 list_add_tail(&sibling->active_list, get_event_list(sibling)); 2302 } 2303 2304 WARN_ON_ONCE(sibling->ctx != event->ctx); 2305 } 2306 2307 out: 2308 for_each_sibling_event(tmp, leader) 2309 perf_event__header_size(tmp); 2310 2311 perf_event__header_size(leader); 2312 } 2313 2314 static void sync_child_event(struct perf_event *child_event); 2315 2316 static void perf_child_detach(struct perf_event *event) 2317 { 2318 struct perf_event *parent_event = event->parent; 2319 2320 if (!(event->attach_state & PERF_ATTACH_CHILD)) 2321 return; 2322 2323 event->attach_state &= ~PERF_ATTACH_CHILD; 2324 2325 if (WARN_ON_ONCE(!parent_event)) 2326 return; 2327 2328 lockdep_assert_held(&parent_event->child_mutex); 2329 2330 sync_child_event(event); 2331 list_del_init(&event->child_list); 2332 } 2333 2334 static bool is_orphaned_event(struct perf_event *event) 2335 { 2336 return event->state == PERF_EVENT_STATE_DEAD; 2337 } 2338 2339 static inline int 2340 event_filter_match(struct perf_event *event) 2341 { 2342 return (event->cpu == -1 || event->cpu == smp_processor_id()) && 2343 perf_cgroup_match(event); 2344 } 2345 2346 static void 2347 event_sched_out(struct perf_event *event, struct perf_event_context *ctx) 2348 { 2349 struct perf_event_pmu_context *epc = event->pmu_ctx; 2350 struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); 2351 enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; 2352 2353 // XXX cpc serialization, probably per-cpu IRQ disabled 2354 2355 WARN_ON_ONCE(event->ctx != ctx); 2356 lockdep_assert_held(&ctx->lock); 2357 2358 if (event->state != PERF_EVENT_STATE_ACTIVE) 2359 return; 2360 2361 /* 2362 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but 2363 * we can schedule events _OUT_ individually through things like 2364 * __perf_remove_from_context(). 
2365 */ 2366 list_del_init(&event->active_list); 2367 2368 perf_pmu_disable(event->pmu); 2369 2370 event->pmu->del(event, 0); 2371 event->oncpu = -1; 2372 2373 if (event->pending_disable) { 2374 event->pending_disable = 0; 2375 perf_cgroup_event_disable(event, ctx); 2376 state = PERF_EVENT_STATE_OFF; 2377 } 2378 2379 perf_event_set_state(event, state); 2380 2381 if (!is_software_event(event)) 2382 cpc->active_oncpu--; 2383 if (event->attr.freq && event->attr.sample_freq) { 2384 ctx->nr_freq--; 2385 epc->nr_freq--; 2386 } 2387 if (event->attr.exclusive || !cpc->active_oncpu) 2388 cpc->exclusive = 0; 2389 2390 perf_pmu_enable(event->pmu); 2391 } 2392 2393 static void 2394 group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) 2395 { 2396 struct perf_event *event; 2397 2398 if (group_event->state != PERF_EVENT_STATE_ACTIVE) 2399 return; 2400 2401 perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); 2402 2403 event_sched_out(group_event, ctx); 2404 2405 /* 2406 * Schedule out siblings (if any): 2407 */ 2408 for_each_sibling_event(event, group_event) 2409 event_sched_out(event, ctx); 2410 } 2411 2412 static inline void 2413 __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) 2414 { 2415 if (ctx->is_active & EVENT_TIME) { 2416 if (ctx->is_active & EVENT_FROZEN) 2417 return; 2418 update_context_time(ctx); 2419 update_cgrp_time_from_cpuctx(cpuctx, final); 2420 } 2421 } 2422 2423 static inline void 2424 ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) 2425 { 2426 __ctx_time_update(cpuctx, ctx, false); 2427 } 2428 2429 /* 2430 * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). 2431 */ 2432 static inline void 2433 ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) 2434 { 2435 ctx_time_update(cpuctx, ctx); 2436 if (ctx->is_active & EVENT_TIME) 2437 ctx->is_active |= EVENT_FROZEN; 2438 } 2439 2440 static inline void 2441 ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) 2442 { 2443 if (ctx->is_active & EVENT_TIME) { 2444 if (ctx->is_active & EVENT_FROZEN) 2445 return; 2446 update_context_time(ctx); 2447 update_cgrp_time_from_event(event); 2448 } 2449 } 2450 2451 #define DETACH_GROUP 0x01UL 2452 #define DETACH_CHILD 0x02UL 2453 #define DETACH_DEAD 0x04UL 2454 #define DETACH_EXIT 0x08UL 2455 2456 /* 2457 * Cross CPU call to remove a performance event 2458 * 2459 * We disable the event on the hardware level first. After that we 2460 * remove it from the context list. 2461 */ 2462 static void 2463 __perf_remove_from_context(struct perf_event *event, 2464 struct perf_cpu_context *cpuctx, 2465 struct perf_event_context *ctx, 2466 void *info) 2467 { 2468 struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; 2469 enum perf_event_state state = PERF_EVENT_STATE_OFF; 2470 unsigned long flags = (unsigned long)info; 2471 2472 ctx_time_update(cpuctx, ctx); 2473 2474 /* 2475 * Ensure event_sched_out() switches to OFF, at the very least 2476 * this avoids raising perf_pending_task() at this time. 
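 *
 * DETACH_EXIT lowers the target state to EXIT and DETACH_DEAD to DEAD;
 * the min() below makes sure the event state is only ever lowered, never
 * raised.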
2477 */ 2478 if (flags & DETACH_EXIT) 2479 state = PERF_EVENT_STATE_EXIT; 2480 if (flags & DETACH_DEAD) { 2481 event->pending_disable = 1; 2482 state = PERF_EVENT_STATE_DEAD; 2483 } 2484 event_sched_out(event, ctx); 2485 perf_event_set_state(event, min(event->state, state)); 2486 if (flags & DETACH_GROUP) 2487 perf_group_detach(event); 2488 if (flags & DETACH_CHILD) 2489 perf_child_detach(event); 2490 list_del_event(event, ctx); 2491 2492 if (!pmu_ctx->nr_events) { 2493 pmu_ctx->rotate_necessary = 0; 2494 2495 if (ctx->task && ctx->is_active) { 2496 struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); 2497 2498 WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); 2499 cpc->task_epc = NULL; 2500 } 2501 } 2502 2503 if (!ctx->nr_events && ctx->is_active) { 2504 if (ctx == &cpuctx->ctx) 2505 update_cgrp_time_from_cpuctx(cpuctx, true); 2506 2507 ctx->is_active = 0; 2508 if (ctx->task) { 2509 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2510 cpuctx->task_ctx = NULL; 2511 } 2512 } 2513 } 2514 2515 /* 2516 * Remove the event from a task's (or a CPU's) list of events. 2517 * 2518 * If event->ctx is a cloned context, callers must make sure that 2519 * every task struct that event->ctx->task could possibly point to 2520 * remains valid. This is OK when called from perf_release since 2521 * that only calls us on the top-level context, which can't be a clone. 2522 * When called from perf_event_exit_task, it's OK because the 2523 * context has been detached from its task. 2524 */ 2525 static void perf_remove_from_context(struct perf_event *event, unsigned long flags) 2526 { 2527 struct perf_event_context *ctx = event->ctx; 2528 2529 lockdep_assert_held(&ctx->mutex); 2530 2531 /* 2532 * Because of perf_event_exit_task(), perf_remove_from_context() ought 2533 * to work in the face of TASK_TOMBSTONE, unlike every other 2534 * event_function_call() user. 2535 */ 2536 raw_spin_lock_irq(&ctx->lock); 2537 if (!ctx->is_active) { 2538 __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), 2539 ctx, (void *)flags); 2540 raw_spin_unlock_irq(&ctx->lock); 2541 return; 2542 } 2543 raw_spin_unlock_irq(&ctx->lock); 2544 2545 event_function_call(event, __perf_remove_from_context, (void *)flags); 2546 } 2547 2548 /* 2549 * Cross CPU call to disable a performance event 2550 */ 2551 static void __perf_event_disable(struct perf_event *event, 2552 struct perf_cpu_context *cpuctx, 2553 struct perf_event_context *ctx, 2554 void *info) 2555 { 2556 if (event->state < PERF_EVENT_STATE_INACTIVE) 2557 return; 2558 2559 perf_pmu_disable(event->pmu_ctx->pmu); 2560 ctx_time_update_event(ctx, event); 2561 2562 if (event == event->group_leader) 2563 group_sched_out(event, ctx); 2564 else 2565 event_sched_out(event, ctx); 2566 2567 perf_event_set_state(event, PERF_EVENT_STATE_OFF); 2568 perf_cgroup_event_disable(event, ctx); 2569 2570 perf_pmu_enable(event->pmu_ctx->pmu); 2571 } 2572 2573 /* 2574 * Disable an event. 2575 * 2576 * If event->ctx is a cloned context, callers must make sure that 2577 * every task struct that event->ctx->task could possibly point to 2578 * remains valid. This condition is satisfied when called through 2579 * perf_event_for_each_child or perf_event_for_each because they 2580 * hold the top-level event's child_mutex, so any descendant that 2581 * goes to exit will block in perf_event_exit_event(). 
2582 * 2583 * When called from perf_pending_disable it's OK because event->ctx 2584 * is the current context on this CPU and preemption is disabled, 2585 * hence we can't get into perf_event_task_sched_out for this context. 2586 */ 2587 static void _perf_event_disable(struct perf_event *event) 2588 { 2589 struct perf_event_context *ctx = event->ctx; 2590 2591 raw_spin_lock_irq(&ctx->lock); 2592 if (event->state <= PERF_EVENT_STATE_OFF) { 2593 raw_spin_unlock_irq(&ctx->lock); 2594 return; 2595 } 2596 raw_spin_unlock_irq(&ctx->lock); 2597 2598 event_function_call(event, __perf_event_disable, NULL); 2599 } 2600 2601 void perf_event_disable_local(struct perf_event *event) 2602 { 2603 event_function_local(event, __perf_event_disable, NULL); 2604 } 2605 2606 /* 2607 * Strictly speaking kernel users cannot create groups and therefore this 2608 * interface does not need the perf_event_ctx_lock() magic. 2609 */ 2610 void perf_event_disable(struct perf_event *event) 2611 { 2612 struct perf_event_context *ctx; 2613 2614 ctx = perf_event_ctx_lock(event); 2615 _perf_event_disable(event); 2616 perf_event_ctx_unlock(event, ctx); 2617 } 2618 EXPORT_SYMBOL_GPL(perf_event_disable); 2619 2620 void perf_event_disable_inatomic(struct perf_event *event) 2621 { 2622 event->pending_disable = 1; 2623 irq_work_queue(&event->pending_disable_irq); 2624 } 2625 2626 #define MAX_INTERRUPTS (~0ULL) 2627 2628 static void perf_log_throttle(struct perf_event *event, int enable); 2629 static void perf_log_itrace_start(struct perf_event *event); 2630 2631 static int 2632 event_sched_in(struct perf_event *event, struct perf_event_context *ctx) 2633 { 2634 struct perf_event_pmu_context *epc = event->pmu_ctx; 2635 struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); 2636 int ret = 0; 2637 2638 WARN_ON_ONCE(event->ctx != ctx); 2639 2640 lockdep_assert_held(&ctx->lock); 2641 2642 if (event->state <= PERF_EVENT_STATE_OFF) 2643 return 0; 2644 2645 WRITE_ONCE(event->oncpu, smp_processor_id()); 2646 /* 2647 * Order event::oncpu write to happen before the ACTIVE state is 2648 * visible. This allows perf_event_{stop,read}() to observe the correct 2649 * ->oncpu if it sees ACTIVE. 2650 */ 2651 smp_wmb(); 2652 perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); 2653 2654 /* 2655 * Unthrottle events, since we scheduled we might have missed several 2656 * ticks already, also for a heavily scheduling task there is little 2657 * guarantee it'll get a tick in a timely manner. 
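 *
 * (hw.interrupts == MAX_INTERRUPTS is the marker left behind when the event
 *  was throttled; clearing it below unthrottles the event and logs an
 *  unthrottle record via perf_log_throttle().)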
2658 */ 2659 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { 2660 perf_log_throttle(event, 1); 2661 event->hw.interrupts = 0; 2662 } 2663 2664 perf_pmu_disable(event->pmu); 2665 2666 perf_log_itrace_start(event); 2667 2668 if (event->pmu->add(event, PERF_EF_START)) { 2669 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 2670 event->oncpu = -1; 2671 ret = -EAGAIN; 2672 goto out; 2673 } 2674 2675 if (!is_software_event(event)) 2676 cpc->active_oncpu++; 2677 if (event->attr.freq && event->attr.sample_freq) { 2678 ctx->nr_freq++; 2679 epc->nr_freq++; 2680 } 2681 if (event->attr.exclusive) 2682 cpc->exclusive = 1; 2683 2684 out: 2685 perf_pmu_enable(event->pmu); 2686 2687 return ret; 2688 } 2689 2690 static int 2691 group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) 2692 { 2693 struct perf_event *event, *partial_group = NULL; 2694 struct pmu *pmu = group_event->pmu_ctx->pmu; 2695 2696 if (group_event->state == PERF_EVENT_STATE_OFF) 2697 return 0; 2698 2699 pmu->start_txn(pmu, PERF_PMU_TXN_ADD); 2700 2701 if (event_sched_in(group_event, ctx)) 2702 goto error; 2703 2704 /* 2705 * Schedule in siblings as one group (if any): 2706 */ 2707 for_each_sibling_event(event, group_event) { 2708 if (event_sched_in(event, ctx)) { 2709 partial_group = event; 2710 goto group_error; 2711 } 2712 } 2713 2714 if (!pmu->commit_txn(pmu)) 2715 return 0; 2716 2717 group_error: 2718 /* 2719 * Groups can be scheduled in as one unit only, so undo any 2720 * partial group before returning: 2721 * The events up to the failed event are scheduled out normally. 2722 */ 2723 for_each_sibling_event(event, group_event) { 2724 if (event == partial_group) 2725 break; 2726 2727 event_sched_out(event, ctx); 2728 } 2729 event_sched_out(group_event, ctx); 2730 2731 error: 2732 pmu->cancel_txn(pmu); 2733 return -EAGAIN; 2734 } 2735 2736 /* 2737 * Work out whether we can put this event group on the CPU now. 2738 */ 2739 static int group_can_go_on(struct perf_event *event, int can_add_hw) 2740 { 2741 struct perf_event_pmu_context *epc = event->pmu_ctx; 2742 struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); 2743 2744 /* 2745 * Groups consisting entirely of software events can always go on. 2746 */ 2747 if (event->group_caps & PERF_EV_CAP_SOFTWARE) 2748 return 1; 2749 /* 2750 * If an exclusive group is already on, no other hardware 2751 * events can go on. 2752 */ 2753 if (cpc->exclusive) 2754 return 0; 2755 /* 2756 * If this group is exclusive and there are already 2757 * events on the CPU, it can't go on. 2758 */ 2759 if (event->attr.exclusive && !list_empty(get_event_list(event))) 2760 return 0; 2761 /* 2762 * Otherwise, try to add it if all previous groups were able 2763 * to go on. 
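 *
 * Note that merge_sched_in() clears its can_add_hw flag as soon as one
 * group fails to (or cannot) schedule, so the remaining hardware groups of
 * that pass are not attempted at all.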
2764 */ 2765 return can_add_hw; 2766 } 2767 2768 static void add_event_to_ctx(struct perf_event *event, 2769 struct perf_event_context *ctx) 2770 { 2771 list_add_event(event, ctx); 2772 perf_group_attach(event); 2773 } 2774 2775 static void task_ctx_sched_out(struct perf_event_context *ctx, 2776 struct pmu *pmu, 2777 enum event_type_t event_type) 2778 { 2779 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 2780 2781 if (!cpuctx->task_ctx) 2782 return; 2783 2784 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2785 return; 2786 2787 ctx_sched_out(ctx, pmu, event_type); 2788 } 2789 2790 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2791 struct perf_event_context *ctx, 2792 struct pmu *pmu) 2793 { 2794 ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); 2795 if (ctx) 2796 ctx_sched_in(ctx, pmu, EVENT_PINNED); 2797 ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); 2798 if (ctx) 2799 ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); 2800 } 2801 2802 /* 2803 * We want to maintain the following priority of scheduling: 2804 * - CPU pinned (EVENT_CPU | EVENT_PINNED) 2805 * - task pinned (EVENT_PINNED) 2806 * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) 2807 * - task flexible (EVENT_FLEXIBLE). 2808 * 2809 * In order to avoid unscheduling and scheduling back in everything every 2810 * time an event is added, only do it for the groups of equal priority and 2811 * below. 2812 * 2813 * This can be called after a batch operation on task events, in which case 2814 * event_type is a bit mask of the types of events involved. For CPU events, 2815 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. 2816 */ 2817 static void ctx_resched(struct perf_cpu_context *cpuctx, 2818 struct perf_event_context *task_ctx, 2819 struct pmu *pmu, enum event_type_t event_type) 2820 { 2821 bool cpu_event = !!(event_type & EVENT_CPU); 2822 struct perf_event_pmu_context *epc; 2823 2824 /* 2825 * If pinned groups are involved, flexible groups also need to be 2826 * scheduled out. 2827 */ 2828 if (event_type & EVENT_PINNED) 2829 event_type |= EVENT_FLEXIBLE; 2830 2831 event_type &= EVENT_ALL; 2832 2833 for_each_epc(epc, &cpuctx->ctx, pmu, false) 2834 perf_pmu_disable(epc->pmu); 2835 2836 if (task_ctx) { 2837 for_each_epc(epc, task_ctx, pmu, false) 2838 perf_pmu_disable(epc->pmu); 2839 2840 task_ctx_sched_out(task_ctx, pmu, event_type); 2841 } 2842 2843 /* 2844 * Decide which cpu ctx groups to schedule out based on the types 2845 * of events that caused rescheduling: 2846 * - EVENT_CPU: schedule out corresponding groups; 2847 * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; 2848 * - otherwise, do nothing more. 
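 *
 * For example (illustrative): installing a task pinned event schedules out
 * the task's pinned and flexible groups and the CPU's flexible groups, then
 * reschedules them in priority order, while CPU pinned groups keep running
 * undisturbed.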
2849 */ 2850 if (cpu_event) 2851 ctx_sched_out(&cpuctx->ctx, pmu, event_type); 2852 else if (event_type & EVENT_PINNED) 2853 ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); 2854 2855 perf_event_sched_in(cpuctx, task_ctx, pmu); 2856 2857 for_each_epc(epc, &cpuctx->ctx, pmu, false) 2858 perf_pmu_enable(epc->pmu); 2859 2860 if (task_ctx) { 2861 for_each_epc(epc, task_ctx, pmu, false) 2862 perf_pmu_enable(epc->pmu); 2863 } 2864 } 2865 2866 void perf_pmu_resched(struct pmu *pmu) 2867 { 2868 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 2869 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2870 2871 perf_ctx_lock(cpuctx, task_ctx); 2872 ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); 2873 perf_ctx_unlock(cpuctx, task_ctx); 2874 } 2875 2876 /* 2877 * Cross CPU call to install and enable a performance event 2878 * 2879 * Very similar to remote_function() + event_function() but cannot assume that 2880 * things like ctx->is_active and cpuctx->task_ctx are set. 2881 */ 2882 static int __perf_install_in_context(void *info) 2883 { 2884 struct perf_event *event = info; 2885 struct perf_event_context *ctx = event->ctx; 2886 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 2887 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2888 bool reprogram = true; 2889 int ret = 0; 2890 2891 raw_spin_lock(&cpuctx->ctx.lock); 2892 if (ctx->task) { 2893 raw_spin_lock(&ctx->lock); 2894 task_ctx = ctx; 2895 2896 reprogram = (ctx->task == current); 2897 2898 /* 2899 * If the task is running, it must be running on this CPU, 2900 * otherwise we cannot reprogram things. 2901 * 2902 * If its not running, we don't care, ctx->lock will 2903 * serialize against it becoming runnable. 2904 */ 2905 if (task_curr(ctx->task) && !reprogram) { 2906 ret = -ESRCH; 2907 goto unlock; 2908 } 2909 2910 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); 2911 } else if (task_ctx) { 2912 raw_spin_lock(&task_ctx->lock); 2913 } 2914 2915 #ifdef CONFIG_CGROUP_PERF 2916 if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { 2917 /* 2918 * If the current cgroup doesn't match the event's 2919 * cgroup, we should not try to schedule it. 2920 */ 2921 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); 2922 reprogram = cgroup_is_descendant(cgrp->css.cgroup, 2923 event->cgrp->css.cgroup); 2924 } 2925 #endif 2926 2927 if (reprogram) { 2928 ctx_time_freeze(cpuctx, ctx); 2929 add_event_to_ctx(event, ctx); 2930 ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, 2931 get_event_type(event)); 2932 } else { 2933 add_event_to_ctx(event, ctx); 2934 } 2935 2936 unlock: 2937 perf_ctx_unlock(cpuctx, task_ctx); 2938 2939 return ret; 2940 } 2941 2942 static bool exclusive_event_installable(struct perf_event *event, 2943 struct perf_event_context *ctx); 2944 2945 /* 2946 * Attach a performance event to a context. 2947 * 2948 * Very similar to event_function_call, see comment there. 2949 */ 2950 static void 2951 perf_install_in_context(struct perf_event_context *ctx, 2952 struct perf_event *event, 2953 int cpu) 2954 { 2955 struct task_struct *task = READ_ONCE(ctx->task); 2956 2957 lockdep_assert_held(&ctx->mutex); 2958 2959 WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); 2960 2961 if (event->cpu != -1) 2962 WARN_ON_ONCE(event->cpu != cpu); 2963 2964 /* 2965 * Ensures that if we can observe event->ctx, both the event and ctx 2966 * will be 'complete'. See perf_iterate_sb_cpu(). 
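 *
 * (The observer side pairs this store-release with an acquire load of
 *  event->ctx, so a non-NULL ctx seen there implies the event itself is
 *  fully initialized.)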
2967 */
2968 smp_store_release(&event->ctx, ctx);
2969
2970 /*
2971 * perf_event_attr::disabled events will not run and can be initialized
2972 * without IPI. Except when this is the first event for the context, in
2973 * that case we need the magic of the IPI to set ctx->is_active.
2974 *
2975 * The IOC_ENABLE that is sure to follow the creation of a disabled
2976 * event will issue the IPI and reprogram the hardware.
2977 */
2978 if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
2979 ctx->nr_events && !is_cgroup_event(event)) {
2980 raw_spin_lock_irq(&ctx->lock);
2981 if (ctx->task == TASK_TOMBSTONE) {
2982 raw_spin_unlock_irq(&ctx->lock);
2983 return;
2984 }
2985 add_event_to_ctx(event, ctx);
2986 raw_spin_unlock_irq(&ctx->lock);
2987 return;
2988 }
2989
2990 if (!task) {
2991 cpu_function_call(cpu, __perf_install_in_context, event);
2992 return;
2993 }
2994
2995 /*
2996 * Should not happen; we validate the ctx is still alive before calling.
2997 */
2998 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2999 return;
3000
3001 /*
3002 * Installing events is tricky because we cannot rely on ctx->is_active
3003 * to be set in case this is the nr_events 0 -> 1 transition.
3004 *
3005 * Instead we use task_curr(), which tells us if the task is running.
3006 * However, since we use task_curr() outside of rq::lock, we can race
3007 * against the actual state. This means the result can be wrong.
3008 *
3009 * If we get a false positive, we retry; this is harmless.
3010 *
3011 * If we get a false negative, things are complicated. If we are after
3012 * perf_event_context_sched_in() ctx::lock will serialize us, and the
3013 * value must be correct. If we're before, it doesn't matter since
3014 * perf_event_context_sched_in() will program the counter.
3015 *
3016 * However, this hinges on the remote context switch having observed
3017 * our task->perf_event_ctxp[] store, such that it will in fact take
3018 * ctx::lock in perf_event_context_sched_in().
3019 *
3020 * We do this by task_function_call(); if the IPI fails to hit the task
3021 * we know any future context switch of task must see the
3022 * perf_event_ctxp[] store.
3023 */
3024
3025 /*
3026 * This smp_mb() orders the task->perf_event_ctxp[] store with the
3027 * task_cpu() load, such that if the IPI then does not find the task
3028 * running, a future context switch of that task must observe the
3029 * store.
3030 */
3031 smp_mb();
3032 again:
3033 if (!task_function_call(task, __perf_install_in_context, event))
3034 return;
3035
3036 raw_spin_lock_irq(&ctx->lock);
3037 task = ctx->task;
3038 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
3039 /*
3040 * Cannot happen because we already checked above (which also
3041 * cannot happen), and we hold ctx->mutex, which serializes us
3042 * against perf_event_exit_task_context().
3043 */
3044 raw_spin_unlock_irq(&ctx->lock);
3045 return;
3046 }
3047 /*
3048 * If the task is not running, ctx->lock will avoid it becoming so,
3049 * thus we can safely install the event.
3050 */ 3051 if (task_curr(task)) { 3052 raw_spin_unlock_irq(&ctx->lock); 3053 goto again; 3054 } 3055 add_event_to_ctx(event, ctx); 3056 raw_spin_unlock_irq(&ctx->lock); 3057 } 3058 3059 /* 3060 * Cross CPU call to enable a performance event 3061 */ 3062 static void __perf_event_enable(struct perf_event *event, 3063 struct perf_cpu_context *cpuctx, 3064 struct perf_event_context *ctx, 3065 void *info) 3066 { 3067 struct perf_event *leader = event->group_leader; 3068 struct perf_event_context *task_ctx; 3069 3070 if (event->state >= PERF_EVENT_STATE_INACTIVE || 3071 event->state <= PERF_EVENT_STATE_ERROR) 3072 return; 3073 3074 ctx_time_freeze(cpuctx, ctx); 3075 3076 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 3077 perf_cgroup_event_enable(event, ctx); 3078 3079 if (!ctx->is_active) 3080 return; 3081 3082 if (!event_filter_match(event)) 3083 return; 3084 3085 /* 3086 * If the event is in a group and isn't the group leader, 3087 * then don't put it on unless the group is on. 3088 */ 3089 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) 3090 return; 3091 3092 task_ctx = cpuctx->task_ctx; 3093 if (ctx->task) 3094 WARN_ON_ONCE(task_ctx != ctx); 3095 3096 ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); 3097 } 3098 3099 /* 3100 * Enable an event. 3101 * 3102 * If event->ctx is a cloned context, callers must make sure that 3103 * every task struct that event->ctx->task could possibly point to 3104 * remains valid. This condition is satisfied when called through 3105 * perf_event_for_each_child or perf_event_for_each as described 3106 * for perf_event_disable. 3107 */ 3108 static void _perf_event_enable(struct perf_event *event) 3109 { 3110 struct perf_event_context *ctx = event->ctx; 3111 3112 raw_spin_lock_irq(&ctx->lock); 3113 if (event->state >= PERF_EVENT_STATE_INACTIVE || 3114 event->state < PERF_EVENT_STATE_ERROR) { 3115 out: 3116 raw_spin_unlock_irq(&ctx->lock); 3117 return; 3118 } 3119 3120 /* 3121 * If the event is in error state, clear that first. 3122 * 3123 * That way, if we see the event in error state below, we know that it 3124 * has gone back into error state, as distinct from the task having 3125 * been scheduled away before the cross-call arrived. 3126 */ 3127 if (event->state == PERF_EVENT_STATE_ERROR) { 3128 /* 3129 * Detached SIBLING events cannot leave ERROR state. 
3130 */ 3131 if (event->event_caps & PERF_EV_CAP_SIBLING && 3132 event->group_leader == event) 3133 goto out; 3134 3135 event->state = PERF_EVENT_STATE_OFF; 3136 } 3137 raw_spin_unlock_irq(&ctx->lock); 3138 3139 event_function_call(event, __perf_event_enable, NULL); 3140 } 3141 3142 /* 3143 * See perf_event_disable(); 3144 */ 3145 void perf_event_enable(struct perf_event *event) 3146 { 3147 struct perf_event_context *ctx; 3148 3149 ctx = perf_event_ctx_lock(event); 3150 _perf_event_enable(event); 3151 perf_event_ctx_unlock(event, ctx); 3152 } 3153 EXPORT_SYMBOL_GPL(perf_event_enable); 3154 3155 struct stop_event_data { 3156 struct perf_event *event; 3157 unsigned int restart; 3158 }; 3159 3160 static int __perf_event_stop(void *info) 3161 { 3162 struct stop_event_data *sd = info; 3163 struct perf_event *event = sd->event; 3164 3165 /* if it's already INACTIVE, do nothing */ 3166 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 3167 return 0; 3168 3169 /* matches smp_wmb() in event_sched_in() */ 3170 smp_rmb(); 3171 3172 /* 3173 * There is a window with interrupts enabled before we get here, 3174 * so we need to check again lest we try to stop another CPU's event. 3175 */ 3176 if (READ_ONCE(event->oncpu) != smp_processor_id()) 3177 return -EAGAIN; 3178 3179 event->pmu->stop(event, PERF_EF_UPDATE); 3180 3181 /* 3182 * May race with the actual stop (through perf_pmu_output_stop()), 3183 * but it is only used for events with AUX ring buffer, and such 3184 * events will refuse to restart because of rb::aux_mmap_count==0, 3185 * see comments in perf_aux_output_begin(). 3186 * 3187 * Since this is happening on an event-local CPU, no trace is lost 3188 * while restarting. 3189 */ 3190 if (sd->restart) 3191 event->pmu->start(event, 0); 3192 3193 return 0; 3194 } 3195 3196 static int perf_event_stop(struct perf_event *event, int restart) 3197 { 3198 struct stop_event_data sd = { 3199 .event = event, 3200 .restart = restart, 3201 }; 3202 int ret = 0; 3203 3204 do { 3205 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) 3206 return 0; 3207 3208 /* matches smp_wmb() in event_sched_in() */ 3209 smp_rmb(); 3210 3211 /* 3212 * We only want to restart ACTIVE events, so if the event goes 3213 * inactive here (event->oncpu==-1), there's nothing more to do; 3214 * fall through with ret==-ENXIO. 3215 */ 3216 ret = cpu_function_call(READ_ONCE(event->oncpu), 3217 __perf_event_stop, &sd); 3218 } while (ret == -EAGAIN); 3219 3220 return ret; 3221 } 3222 3223 /* 3224 * In order to contain the amount of racy and tricky in the address filter 3225 * configuration management, it is a two part process: 3226 * 3227 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, 3228 * we update the addresses of corresponding vmas in 3229 * event::addr_filter_ranges array and bump the event::addr_filters_gen; 3230 * (p2) when an event is scheduled in (pmu::add), it calls 3231 * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() 3232 * if the generation has changed since the previous call. 3233 * 3234 * If (p1) happens while the event is active, we restart it to force (p2). 
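 *
 * As an illustrative example: when a traced task mmap()s a filtered object
 * at a new address, (2) below rewrites the affected entries in
 * event::addr_filter_ranges and bumps event::addr_filters_gen; the next
 * pmu::add() then pushes the new offsets to hardware through (p2).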
3235 * 3236 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on 3237 * pre-existing mappings, called once when new filters arrive via SET_FILTER 3238 * ioctl; 3239 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly 3240 * registered mapping, called for every new mmap(), with mm::mmap_lock down 3241 * for reading; 3242 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process 3243 * of exec. 3244 */ 3245 void perf_event_addr_filters_sync(struct perf_event *event) 3246 { 3247 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 3248 3249 if (!has_addr_filter(event)) 3250 return; 3251 3252 raw_spin_lock(&ifh->lock); 3253 if (event->addr_filters_gen != event->hw.addr_filters_gen) { 3254 event->pmu->addr_filters_sync(event); 3255 event->hw.addr_filters_gen = event->addr_filters_gen; 3256 } 3257 raw_spin_unlock(&ifh->lock); 3258 } 3259 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); 3260 3261 static int _perf_event_refresh(struct perf_event *event, int refresh) 3262 { 3263 /* 3264 * not supported on inherited events 3265 */ 3266 if (event->attr.inherit || !is_sampling_event(event)) 3267 return -EINVAL; 3268 3269 atomic_add(refresh, &event->event_limit); 3270 _perf_event_enable(event); 3271 3272 return 0; 3273 } 3274 3275 /* 3276 * See perf_event_disable() 3277 */ 3278 int perf_event_refresh(struct perf_event *event, int refresh) 3279 { 3280 struct perf_event_context *ctx; 3281 int ret; 3282 3283 ctx = perf_event_ctx_lock(event); 3284 ret = _perf_event_refresh(event, refresh); 3285 perf_event_ctx_unlock(event, ctx); 3286 3287 return ret; 3288 } 3289 EXPORT_SYMBOL_GPL(perf_event_refresh); 3290 3291 static int perf_event_modify_breakpoint(struct perf_event *bp, 3292 struct perf_event_attr *attr) 3293 { 3294 int err; 3295 3296 _perf_event_disable(bp); 3297 3298 err = modify_user_hw_breakpoint_check(bp, attr, true); 3299 3300 if (!bp->attr.disabled) 3301 _perf_event_enable(bp); 3302 3303 return err; 3304 } 3305 3306 /* 3307 * Copy event-type-independent attributes that may be modified. 3308 */ 3309 static void perf_event_modify_copy_attr(struct perf_event_attr *to, 3310 const struct perf_event_attr *from) 3311 { 3312 to->sig_data = from->sig_data; 3313 } 3314 3315 static int perf_event_modify_attr(struct perf_event *event, 3316 struct perf_event_attr *attr) 3317 { 3318 int (*func)(struct perf_event *, struct perf_event_attr *); 3319 struct perf_event *child; 3320 int err; 3321 3322 if (event->attr.type != attr->type) 3323 return -EINVAL; 3324 3325 switch (event->attr.type) { 3326 case PERF_TYPE_BREAKPOINT: 3327 func = perf_event_modify_breakpoint; 3328 break; 3329 default: 3330 /* Place holder for future additions. */ 3331 return -EOPNOTSUPP; 3332 } 3333 3334 WARN_ON_ONCE(event->ctx->parent_ctx); 3335 3336 mutex_lock(&event->child_mutex); 3337 /* 3338 * Event-type-independent attributes must be copied before event-type 3339 * modification, which will validate that final attributes match the 3340 * source attributes after all relevant attributes have been copied. 
3341 */
3342 perf_event_modify_copy_attr(&event->attr, attr);
3343 err = func(event, attr);
3344 if (err)
3345 goto out;
3346 list_for_each_entry(child, &event->child_list, child_list) {
3347 perf_event_modify_copy_attr(&child->attr, attr);
3348 err = func(child, attr);
3349 if (err)
3350 goto out;
3351 }
3352 out:
3353 mutex_unlock(&event->child_mutex);
3354 return err;
3355 }
3356
3357 static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
3358 enum event_type_t event_type)
3359 {
3360 struct perf_event_context *ctx = pmu_ctx->ctx;
3361 struct perf_event *event, *tmp;
3362 struct pmu *pmu = pmu_ctx->pmu;
3363
3364 if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
3365 struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
3366
3367 WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3368 cpc->task_epc = NULL;
3369 }
3370
3371 if (!(event_type & EVENT_ALL))
3372 return;
3373
3374 perf_pmu_disable(pmu);
3375 if (event_type & EVENT_PINNED) {
3376 list_for_each_entry_safe(event, tmp,
3377 &pmu_ctx->pinned_active,
3378 active_list)
3379 group_sched_out(event, ctx);
3380 }
3381
3382 if (event_type & EVENT_FLEXIBLE) {
3383 list_for_each_entry_safe(event, tmp,
3384 &pmu_ctx->flexible_active,
3385 active_list)
3386 group_sched_out(event, ctx);
3387 /*
3388 * Since we cleared EVENT_FLEXIBLE, also clear
3389 * rotate_necessary; it will be reset by
3390 * ctx_flexible_sched_in() when needed.
3391 */
3392 pmu_ctx->rotate_necessary = 0;
3393 }
3394 perf_pmu_enable(pmu);
3395 }
3396
3397 /*
3398 * Be very careful with the @pmu argument since this will change ctx state.
3399 * The @pmu argument works for ctx_resched(), because that is symmetric in
3400 * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
3401 *
3402 * However, if you were to be asymmetrical, you could end up with messed up
3403 * state, e.g. ctx->is_active cleared even though most EPCs would still actually
3404 * be active.
3405 */
3406 static void
3407 ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
3408 {
3409 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3410 struct perf_event_pmu_context *pmu_ctx;
3411 int is_active = ctx->is_active;
3412 bool cgroup = event_type & EVENT_CGROUP;
3413
3414 event_type &= ~EVENT_CGROUP;
3415
3416 lockdep_assert_held(&ctx->lock);
3417
3418 if (likely(!ctx->nr_events)) {
3419 /*
3420 * See __perf_remove_from_context().
3421 */
3422 WARN_ON_ONCE(ctx->is_active);
3423 if (ctx->task)
3424 WARN_ON_ONCE(cpuctx->task_ctx);
3425 return;
3426 }
3427
3428 /*
3429 * Always update time if it was set; not only when it changes.
3430 * Otherwise we can 'forget' to update time for any but the last
3431 * context we sched out. For example:
3432 *
3433 * ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3434 * ctx_sched_out(.event_type = EVENT_PINNED)
3435 *
3436 * would only update time for the pinned events.
3437 */
3438 __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
3439
3440 /*
3441 * CPU-release for the below ->is_active store,
3442 * see __load_acquire() in perf_event_time_now()
3443 */
3444 barrier();
3445 ctx->is_active &= ~event_type;
3446
3447 if (!(ctx->is_active & EVENT_ALL)) {
3448 /*
3449 * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
3450 * does not observe a hole. perf_ctx_unlock() will clean up.
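 *
 * In other words, EVENT_TIME stays visible for the remainder of the locked
 * region and __perf_ctx_unlock() drops it, so a concurrent
 * perf_event_time_now() never sees the context lose its time base
 * mid-update.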
3451 */ 3452 if (ctx->is_active & EVENT_FROZEN) 3453 ctx->is_active &= EVENT_TIME_FROZEN; 3454 else 3455 ctx->is_active = 0; 3456 } 3457 3458 if (ctx->task) { 3459 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 3460 if (!(ctx->is_active & EVENT_ALL)) 3461 cpuctx->task_ctx = NULL; 3462 } 3463 3464 is_active ^= ctx->is_active; /* changed bits */ 3465 3466 for_each_epc(pmu_ctx, ctx, pmu, cgroup) 3467 __pmu_ctx_sched_out(pmu_ctx, is_active); 3468 } 3469 3470 /* 3471 * Test whether two contexts are equivalent, i.e. whether they have both been 3472 * cloned from the same version of the same context. 3473 * 3474 * Equivalence is measured using a generation number in the context that is 3475 * incremented on each modification to it; see unclone_ctx(), list_add_event() 3476 * and list_del_event(). 3477 */ 3478 static int context_equiv(struct perf_event_context *ctx1, 3479 struct perf_event_context *ctx2) 3480 { 3481 lockdep_assert_held(&ctx1->lock); 3482 lockdep_assert_held(&ctx2->lock); 3483 3484 /* Pinning disables the swap optimization */ 3485 if (ctx1->pin_count || ctx2->pin_count) 3486 return 0; 3487 3488 /* If ctx1 is the parent of ctx2 */ 3489 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) 3490 return 1; 3491 3492 /* If ctx2 is the parent of ctx1 */ 3493 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) 3494 return 1; 3495 3496 /* 3497 * If ctx1 and ctx2 have the same parent; we flatten the parent 3498 * hierarchy, see perf_event_init_context(). 3499 */ 3500 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && 3501 ctx1->parent_gen == ctx2->parent_gen) 3502 return 1; 3503 3504 /* Unmatched */ 3505 return 0; 3506 } 3507 3508 static void __perf_event_sync_stat(struct perf_event *event, 3509 struct perf_event *next_event) 3510 { 3511 u64 value; 3512 3513 if (!event->attr.inherit_stat) 3514 return; 3515 3516 /* 3517 * Update the event value, we cannot use perf_event_read() 3518 * because we're in the middle of a context switch and have IRQs 3519 * disabled, which upsets smp_call_function_single(), however 3520 * we know the event must be on the current CPU, therefore we 3521 * don't need to use it. 3522 */ 3523 perf_pmu_read(event); 3524 3525 perf_event_update_time(event); 3526 3527 /* 3528 * In order to keep per-task stats reliable we need to flip the event 3529 * values when we flip the contexts. 3530 */ 3531 value = local64_read(&next_event->count); 3532 value = local64_xchg(&event->count, value); 3533 local64_set(&next_event->count, value); 3534 3535 swap(event->total_time_enabled, next_event->total_time_enabled); 3536 swap(event->total_time_running, next_event->total_time_running); 3537 3538 /* 3539 * Since we swizzled the values, update the user visible data too. 
3540 */ 3541 perf_event_update_userpage(event); 3542 perf_event_update_userpage(next_event); 3543 } 3544 3545 static void perf_event_sync_stat(struct perf_event_context *ctx, 3546 struct perf_event_context *next_ctx) 3547 { 3548 struct perf_event *event, *next_event; 3549 3550 if (!ctx->nr_stat) 3551 return; 3552 3553 update_context_time(ctx); 3554 3555 event = list_first_entry(&ctx->event_list, 3556 struct perf_event, event_entry); 3557 3558 next_event = list_first_entry(&next_ctx->event_list, 3559 struct perf_event, event_entry); 3560 3561 while (&event->event_entry != &ctx->event_list && 3562 &next_event->event_entry != &next_ctx->event_list) { 3563 3564 __perf_event_sync_stat(event, next_event); 3565 3566 event = list_next_entry(event, event_entry); 3567 next_event = list_next_entry(next_event, event_entry); 3568 } 3569 } 3570 3571 static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, 3572 struct task_struct *task, bool sched_in) 3573 { 3574 struct perf_event_pmu_context *pmu_ctx; 3575 struct perf_cpu_pmu_context *cpc; 3576 3577 list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { 3578 cpc = this_cpc(pmu_ctx->pmu); 3579 3580 if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) 3581 pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); 3582 } 3583 } 3584 3585 static void 3586 perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) 3587 { 3588 struct perf_event_context *ctx = task->perf_event_ctxp; 3589 struct perf_event_context *next_ctx; 3590 struct perf_event_context *parent, *next_parent; 3591 int do_switch = 1; 3592 3593 if (likely(!ctx)) 3594 return; 3595 3596 rcu_read_lock(); 3597 next_ctx = rcu_dereference(next->perf_event_ctxp); 3598 if (!next_ctx) 3599 goto unlock; 3600 3601 parent = rcu_dereference(ctx->parent_ctx); 3602 next_parent = rcu_dereference(next_ctx->parent_ctx); 3603 3604 /* If neither context have a parent context; they cannot be clones. */ 3605 if (!parent && !next_parent) 3606 goto unlock; 3607 3608 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 3609 /* 3610 * Looks like the two contexts are clones, so we might be 3611 * able to optimize the context switch. We lock both 3612 * contexts and check that they are clones under the 3613 * lock (including re-checking that neither has been 3614 * uncloned in the meantime). It doesn't matter which 3615 * order we take the locks because no other cpu could 3616 * be trying to lock both of these tasks. 3617 */ 3618 raw_spin_lock(&ctx->lock); 3619 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 3620 if (context_equiv(ctx, next_ctx)) { 3621 3622 perf_ctx_disable(ctx, false); 3623 3624 /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ 3625 if (local_read(&ctx->nr_no_switch_fast) || 3626 local_read(&next_ctx->nr_no_switch_fast)) { 3627 /* 3628 * Must not swap out ctx when there's pending 3629 * events that rely on the ctx->task relation. 3630 * 3631 * Likewise, when a context contains inherit + 3632 * SAMPLE_READ events they should be switched 3633 * out using the slow path so that they are 3634 * treated as if they were distinct contexts. 
3635 */ 3636 raw_spin_unlock(&next_ctx->lock); 3637 rcu_read_unlock(); 3638 goto inside_switch; 3639 } 3640 3641 WRITE_ONCE(ctx->task, next); 3642 WRITE_ONCE(next_ctx->task, task); 3643 3644 perf_ctx_sched_task_cb(ctx, task, false); 3645 3646 perf_ctx_enable(ctx, false); 3647 3648 /* 3649 * RCU_INIT_POINTER here is safe because we've not 3650 * modified the ctx and the above modification of 3651 * ctx->task is immaterial since this value is 3652 * always verified under ctx->lock which we're now 3653 * holding. 3654 */ 3655 RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); 3656 RCU_INIT_POINTER(next->perf_event_ctxp, ctx); 3657 3658 do_switch = 0; 3659 3660 perf_event_sync_stat(ctx, next_ctx); 3661 } 3662 raw_spin_unlock(&next_ctx->lock); 3663 raw_spin_unlock(&ctx->lock); 3664 } 3665 unlock: 3666 rcu_read_unlock(); 3667 3668 if (do_switch) { 3669 raw_spin_lock(&ctx->lock); 3670 perf_ctx_disable(ctx, false); 3671 3672 inside_switch: 3673 perf_ctx_sched_task_cb(ctx, task, false); 3674 task_ctx_sched_out(ctx, NULL, EVENT_ALL); 3675 3676 perf_ctx_enable(ctx, false); 3677 raw_spin_unlock(&ctx->lock); 3678 } 3679 } 3680 3681 static DEFINE_PER_CPU(struct list_head, sched_cb_list); 3682 static DEFINE_PER_CPU(int, perf_sched_cb_usages); 3683 3684 void perf_sched_cb_dec(struct pmu *pmu) 3685 { 3686 struct perf_cpu_pmu_context *cpc = this_cpc(pmu); 3687 3688 this_cpu_dec(perf_sched_cb_usages); 3689 barrier(); 3690 3691 if (!--cpc->sched_cb_usage) 3692 list_del(&cpc->sched_cb_entry); 3693 } 3694 3695 3696 void perf_sched_cb_inc(struct pmu *pmu) 3697 { 3698 struct perf_cpu_pmu_context *cpc = this_cpc(pmu); 3699 3700 if (!cpc->sched_cb_usage++) 3701 list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); 3702 3703 barrier(); 3704 this_cpu_inc(perf_sched_cb_usages); 3705 } 3706 3707 /* 3708 * This function provides the context switch callback to the lower code 3709 * layer. It is invoked ONLY when the context switch callback is enabled. 3710 * 3711 * This callback is relevant even to per-cpu events; for example multi event 3712 * PEBS requires this to provide PID/TID information. This requires we flush 3713 * all queued PEBS records before we context switch to a new task. 3714 */ 3715 static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, 3716 struct task_struct *task, bool sched_in) 3717 { 3718 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 3719 struct pmu *pmu; 3720 3721 pmu = cpc->epc.pmu; 3722 3723 /* software PMUs will not have sched_task */ 3724 if (WARN_ON_ONCE(!pmu->sched_task)) 3725 return; 3726 3727 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 3728 perf_pmu_disable(pmu); 3729 3730 pmu->sched_task(cpc->task_epc, task, sched_in); 3731 3732 perf_pmu_enable(pmu); 3733 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3734 } 3735 3736 static void perf_pmu_sched_task(struct task_struct *prev, 3737 struct task_struct *next, 3738 bool sched_in) 3739 { 3740 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 3741 struct perf_cpu_pmu_context *cpc; 3742 3743 /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ 3744 if (prev == next || cpuctx->task_ctx) 3745 return; 3746 3747 list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) 3748 __perf_pmu_sched_task(cpc, sched_in ? 
next : prev, sched_in); 3749 } 3750 3751 static void perf_event_switch(struct task_struct *task, 3752 struct task_struct *next_prev, bool sched_in); 3753 3754 /* 3755 * Called from scheduler to remove the events of the current task, 3756 * with interrupts disabled. 3757 * 3758 * We stop each event and update the event value in event->count. 3759 * 3760 * This does not protect us against NMI, but disable() 3761 * sets the disabled bit in the control field of event _before_ 3762 * accessing the event control register. If a NMI hits, then it will 3763 * not restart the event. 3764 */ 3765 void __perf_event_task_sched_out(struct task_struct *task, 3766 struct task_struct *next) 3767 { 3768 if (__this_cpu_read(perf_sched_cb_usages)) 3769 perf_pmu_sched_task(task, next, false); 3770 3771 if (atomic_read(&nr_switch_events)) 3772 perf_event_switch(task, next, false); 3773 3774 perf_event_context_sched_out(task, next); 3775 3776 /* 3777 * if cgroup events exist on this CPU, then we need 3778 * to check if we have to switch out PMU state. 3779 * cgroup event are system-wide mode only 3780 */ 3781 perf_cgroup_switch(next); 3782 } 3783 3784 static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args) 3785 { 3786 const struct perf_event *le = *(const struct perf_event **)l; 3787 const struct perf_event *re = *(const struct perf_event **)r; 3788 3789 return le->group_index < re->group_index; 3790 } 3791 3792 DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); 3793 3794 static const struct min_heap_callbacks perf_min_heap = { 3795 .less = perf_less_group_idx, 3796 .swp = NULL, 3797 }; 3798 3799 static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) 3800 { 3801 struct perf_event **itrs = heap->data; 3802 3803 if (event) { 3804 itrs[heap->nr] = event; 3805 heap->nr++; 3806 } 3807 } 3808 3809 static void __link_epc(struct perf_event_pmu_context *pmu_ctx) 3810 { 3811 struct perf_cpu_pmu_context *cpc; 3812 3813 if (!pmu_ctx->ctx->task) 3814 return; 3815 3816 cpc = this_cpc(pmu_ctx->pmu); 3817 WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); 3818 cpc->task_epc = pmu_ctx; 3819 } 3820 3821 static noinline int visit_groups_merge(struct perf_event_context *ctx, 3822 struct perf_event_groups *groups, int cpu, 3823 struct pmu *pmu, 3824 int (*func)(struct perf_event *, void *), 3825 void *data) 3826 { 3827 #ifdef CONFIG_CGROUP_PERF 3828 struct cgroup_subsys_state *css = NULL; 3829 #endif 3830 struct perf_cpu_context *cpuctx = NULL; 3831 /* Space for per CPU and/or any CPU event iterators. */ 3832 struct perf_event *itrs[2]; 3833 struct perf_event_min_heap event_heap; 3834 struct perf_event **evt; 3835 int ret; 3836 3837 if (pmu->filter && pmu->filter(pmu, cpu)) 3838 return 0; 3839 3840 if (!ctx->task) { 3841 cpuctx = this_cpu_ptr(&perf_cpu_context); 3842 event_heap = (struct perf_event_min_heap){ 3843 .data = cpuctx->heap, 3844 .nr = 0, 3845 .size = cpuctx->heap_size, 3846 }; 3847 3848 lockdep_assert_held(&cpuctx->ctx.lock); 3849 3850 #ifdef CONFIG_CGROUP_PERF 3851 if (cpuctx->cgrp) 3852 css = &cpuctx->cgrp->css; 3853 #endif 3854 } else { 3855 event_heap = (struct perf_event_min_heap){ 3856 .data = itrs, 3857 .nr = 0, 3858 .size = ARRAY_SIZE(itrs), 3859 }; 3860 /* Events not within a CPU context may be on any CPU. 
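 * (task contexts therefore merge two iterators: the cpu == -1 "any CPU"
 *  groups added just below and the current-CPU groups added after this
 *  branch; the min-heap, keyed on group_index, interleaves the lists
 *  roughly in the order the groups were created.)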
*/ 3861 __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); 3862 } 3863 evt = event_heap.data; 3864 3865 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); 3866 3867 #ifdef CONFIG_CGROUP_PERF 3868 for (; css; css = css->parent) 3869 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); 3870 #endif 3871 3872 if (event_heap.nr) { 3873 __link_epc((*evt)->pmu_ctx); 3874 perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); 3875 } 3876 3877 min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); 3878 3879 while (event_heap.nr) { 3880 ret = func(*evt, data); 3881 if (ret) 3882 return ret; 3883 3884 *evt = perf_event_groups_next(*evt, pmu); 3885 if (*evt) 3886 min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); 3887 else 3888 min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); 3889 } 3890 3891 return 0; 3892 } 3893 3894 /* 3895 * Because the userpage is strictly per-event (there is no concept of context, 3896 * so there cannot be a context indirection), every userpage must be updated 3897 * when context time starts :-( 3898 * 3899 * IOW, we must not miss EVENT_TIME edges. 3900 */ 3901 static inline bool event_update_userpage(struct perf_event *event) 3902 { 3903 if (likely(!atomic_read(&event->mmap_count))) 3904 return false; 3905 3906 perf_event_update_time(event); 3907 perf_event_update_userpage(event); 3908 3909 return true; 3910 } 3911 3912 static inline void group_update_userpage(struct perf_event *group_event) 3913 { 3914 struct perf_event *event; 3915 3916 if (!event_update_userpage(group_event)) 3917 return; 3918 3919 for_each_sibling_event(event, group_event) 3920 event_update_userpage(event); 3921 } 3922 3923 static int merge_sched_in(struct perf_event *event, void *data) 3924 { 3925 struct perf_event_context *ctx = event->ctx; 3926 int *can_add_hw = data; 3927 3928 if (event->state <= PERF_EVENT_STATE_OFF) 3929 return 0; 3930 3931 if (!event_filter_match(event)) 3932 return 0; 3933 3934 if (group_can_go_on(event, *can_add_hw)) { 3935 if (!group_sched_in(event, ctx)) 3936 list_add_tail(&event->active_list, get_event_list(event)); 3937 } 3938 3939 if (event->state == PERF_EVENT_STATE_INACTIVE) { 3940 *can_add_hw = 0; 3941 if (event->attr.pinned) { 3942 perf_cgroup_event_disable(event, ctx); 3943 perf_event_set_state(event, PERF_EVENT_STATE_ERROR); 3944 3945 if (*perf_event_fasync(event)) 3946 event->pending_kill = POLL_ERR; 3947 3948 perf_event_wakeup(event); 3949 } else { 3950 struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); 3951 3952 event->pmu_ctx->rotate_necessary = 1; 3953 perf_mux_hrtimer_restart(cpc); 3954 group_update_userpage(event); 3955 } 3956 } 3957 3958 return 0; 3959 } 3960 3961 static void pmu_groups_sched_in(struct perf_event_context *ctx, 3962 struct perf_event_groups *groups, 3963 struct pmu *pmu) 3964 { 3965 int can_add_hw = 1; 3966 visit_groups_merge(ctx, groups, smp_processor_id(), pmu, 3967 merge_sched_in, &can_add_hw); 3968 } 3969 3970 static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, 3971 enum event_type_t event_type) 3972 { 3973 struct perf_event_context *ctx = pmu_ctx->ctx; 3974 3975 if (event_type & EVENT_PINNED) 3976 pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); 3977 if (event_type & EVENT_FLEXIBLE) 3978 pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); 3979 } 3980 3981 static void 3982 ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) 3983 { 3984 struct perf_cpu_context 
*cpuctx = this_cpu_ptr(&perf_cpu_context); 3985 struct perf_event_pmu_context *pmu_ctx; 3986 int is_active = ctx->is_active; 3987 bool cgroup = event_type & EVENT_CGROUP; 3988 3989 event_type &= ~EVENT_CGROUP; 3990 3991 lockdep_assert_held(&ctx->lock); 3992 3993 if (likely(!ctx->nr_events)) 3994 return; 3995 3996 if (!(is_active & EVENT_TIME)) { 3997 /* start ctx time */ 3998 __update_context_time(ctx, false); 3999 perf_cgroup_set_timestamp(cpuctx); 4000 /* 4001 * CPU-release for the below ->is_active store, 4002 * see __load_acquire() in perf_event_time_now() 4003 */ 4004 barrier(); 4005 } 4006 4007 ctx->is_active |= (event_type | EVENT_TIME); 4008 if (ctx->task) { 4009 if (!(is_active & EVENT_ALL)) 4010 cpuctx->task_ctx = ctx; 4011 else 4012 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 4013 } 4014 4015 is_active ^= ctx->is_active; /* changed bits */ 4016 4017 /* 4018 * First go through the list and put on any pinned groups 4019 * in order to give them the best chance of going on. 4020 */ 4021 if (is_active & EVENT_PINNED) { 4022 for_each_epc(pmu_ctx, ctx, pmu, cgroup) 4023 __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); 4024 } 4025 4026 /* Then walk through the lower prio flexible groups */ 4027 if (is_active & EVENT_FLEXIBLE) { 4028 for_each_epc(pmu_ctx, ctx, pmu, cgroup) 4029 __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); 4030 } 4031 } 4032 4033 static void perf_event_context_sched_in(struct task_struct *task) 4034 { 4035 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 4036 struct perf_event_context *ctx; 4037 4038 rcu_read_lock(); 4039 ctx = rcu_dereference(task->perf_event_ctxp); 4040 if (!ctx) 4041 goto rcu_unlock; 4042 4043 if (cpuctx->task_ctx == ctx) { 4044 perf_ctx_lock(cpuctx, ctx); 4045 perf_ctx_disable(ctx, false); 4046 4047 perf_ctx_sched_task_cb(ctx, task, true); 4048 4049 perf_ctx_enable(ctx, false); 4050 perf_ctx_unlock(cpuctx, ctx); 4051 goto rcu_unlock; 4052 } 4053 4054 perf_ctx_lock(cpuctx, ctx); 4055 /* 4056 * We must check ctx->nr_events while holding ctx->lock, such 4057 * that we serialize against perf_install_in_context(). 4058 */ 4059 if (!ctx->nr_events) 4060 goto unlock; 4061 4062 perf_ctx_disable(ctx, false); 4063 /* 4064 * We want to keep the following priority order: 4065 * cpu pinned (that don't need to move), task pinned, 4066 * cpu flexible, task flexible. 4067 * 4068 * However, if task's ctx is not carrying any pinned 4069 * events, no need to flip the cpuctx's events around. 4070 */ 4071 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { 4072 perf_ctx_disable(&cpuctx->ctx, false); 4073 ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); 4074 } 4075 4076 perf_event_sched_in(cpuctx, ctx, NULL); 4077 4078 perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); 4079 4080 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) 4081 perf_ctx_enable(&cpuctx->ctx, false); 4082 4083 perf_ctx_enable(ctx, false); 4084 4085 unlock: 4086 perf_ctx_unlock(cpuctx, ctx); 4087 rcu_unlock: 4088 rcu_read_unlock(); 4089 } 4090 4091 /* 4092 * Called from scheduler to add the events of the current task 4093 * with interrupts disabled. 4094 * 4095 * We restore the event value and then enable it. 4096 * 4097 * This does not protect us against NMI, but enable() 4098 * sets the enabled bit in the control field of event _before_ 4099 * accessing the event control register. If a NMI hits, then it will 4100 * keep the event running. 
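 *
 * Besides scheduling in the task's events, this also emits context-switch
 * side-band records when anybody asked for them (nr_switch_events) and runs
 * the registered pmu::sched_task() callbacks.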
4101 */ 4102 void __perf_event_task_sched_in(struct task_struct *prev, 4103 struct task_struct *task) 4104 { 4105 perf_event_context_sched_in(task); 4106 4107 if (atomic_read(&nr_switch_events)) 4108 perf_event_switch(task, prev, true); 4109 4110 if (__this_cpu_read(perf_sched_cb_usages)) 4111 perf_pmu_sched_task(prev, task, true); 4112 } 4113 4114 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 4115 { 4116 u64 frequency = event->attr.sample_freq; 4117 u64 sec = NSEC_PER_SEC; 4118 u64 divisor, dividend; 4119 4120 int count_fls, nsec_fls, frequency_fls, sec_fls; 4121 4122 count_fls = fls64(count); 4123 nsec_fls = fls64(nsec); 4124 frequency_fls = fls64(frequency); 4125 sec_fls = 30; 4126 4127 /* 4128 * We got @count in @nsec, with a target of sample_freq HZ 4129 * the target period becomes: 4130 * 4131 * @count * 10^9 4132 * period = ------------------- 4133 * @nsec * sample_freq 4134 * 4135 */ 4136 4137 /* 4138 * Reduce accuracy by one bit such that @a and @b converge 4139 * to a similar magnitude. 4140 */ 4141 #define REDUCE_FLS(a, b) \ 4142 do { \ 4143 if (a##_fls > b##_fls) { \ 4144 a >>= 1; \ 4145 a##_fls--; \ 4146 } else { \ 4147 b >>= 1; \ 4148 b##_fls--; \ 4149 } \ 4150 } while (0) 4151 4152 /* 4153 * Reduce accuracy until either term fits in a u64, then proceed with 4154 * the other, so that finally we can do a u64/u64 division. 4155 */ 4156 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { 4157 REDUCE_FLS(nsec, frequency); 4158 REDUCE_FLS(sec, count); 4159 } 4160 4161 if (count_fls + sec_fls > 64) { 4162 divisor = nsec * frequency; 4163 4164 while (count_fls + sec_fls > 64) { 4165 REDUCE_FLS(count, sec); 4166 divisor >>= 1; 4167 } 4168 4169 dividend = count * sec; 4170 } else { 4171 dividend = count * sec; 4172 4173 while (nsec_fls + frequency_fls > 64) { 4174 REDUCE_FLS(nsec, frequency); 4175 dividend >>= 1; 4176 } 4177 4178 divisor = nsec * frequency; 4179 } 4180 4181 if (!divisor) 4182 return dividend; 4183 4184 return div64_u64(dividend, divisor); 4185 } 4186 4187 static DEFINE_PER_CPU(int, perf_throttled_count); 4188 static DEFINE_PER_CPU(u64, perf_throttled_seq); 4189 4190 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) 4191 { 4192 struct hw_perf_event *hwc = &event->hw; 4193 s64 period, sample_period; 4194 s64 delta; 4195 4196 period = perf_calculate_period(event, nsec, count); 4197 4198 delta = (s64)(period - hwc->sample_period); 4199 if (delta >= 0) 4200 delta += 7; 4201 else 4202 delta -= 7; 4203 delta /= 8; /* low pass filter */ 4204 4205 sample_period = hwc->sample_period + delta; 4206 4207 if (!sample_period) 4208 sample_period = 1; 4209 4210 hwc->sample_period = sample_period; 4211 4212 if (local64_read(&hwc->period_left) > 8*sample_period) { 4213 if (disable) 4214 event->pmu->stop(event, PERF_EF_UPDATE); 4215 4216 local64_set(&hwc->period_left, 0); 4217 4218 if (disable) 4219 event->pmu->start(event, PERF_EF_RELOAD); 4220 } 4221 } 4222 4223 static void perf_adjust_freq_unthr_events(struct list_head *event_list) 4224 { 4225 struct perf_event *event; 4226 struct hw_perf_event *hwc; 4227 u64 now, period = TICK_NSEC; 4228 s64 delta; 4229 4230 list_for_each_entry(event, event_list, active_list) { 4231 if (event->state != PERF_EVENT_STATE_ACTIVE) 4232 continue; 4233 4234 // XXX use visit thingy to avoid the -1,cpu match 4235 if (!event_filter_match(event)) 4236 continue; 4237 4238 hwc = &event->hw; 4239 4240 if (hwc->interrupts == MAX_INTERRUPTS) { 4241 hwc->interrupts = 0; 4242 
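			/*
			 * The event was throttled (hwc->interrupts hit MAX_INTERRUPTS);
			 * log the unthrottle and restart non-freq events right away.
			 * Freq events are restarted further below, once their period
			 * has been re-adjusted.
			 */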
perf_log_throttle(event, 1); 4243 if (!event->attr.freq || !event->attr.sample_freq) 4244 event->pmu->start(event, 0); 4245 } 4246 4247 if (!event->attr.freq || !event->attr.sample_freq) 4248 continue; 4249 4250 /* 4251 * stop the event and update event->count 4252 */ 4253 event->pmu->stop(event, PERF_EF_UPDATE); 4254 4255 now = local64_read(&event->count); 4256 delta = now - hwc->freq_count_stamp; 4257 hwc->freq_count_stamp = now; 4258 4259 /* 4260 * restart the event 4261 * reload only if value has changed 4262 * we have stopped the event so tell that 4263 * to perf_adjust_period() to avoid stopping it 4264 * twice. 4265 */ 4266 if (delta > 0) 4267 perf_adjust_period(event, period, delta, false); 4268 4269 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 4270 } 4271 } 4272 4273 /* 4274 * combine freq adjustment with unthrottling to avoid two passes over the 4275 * events. At the same time, make sure that having freq events does not change 4276 * the rate of unthrottling as that would introduce bias. 4277 */ 4278 static void 4279 perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) 4280 { 4281 struct perf_event_pmu_context *pmu_ctx; 4282 4283 /* 4284 * only need to iterate over all events if: 4285 * - the context has events in frequency mode (needs freq adjust), or 4286 * - there are events to unthrottle on this cpu 4287 */ 4288 if (!(ctx->nr_freq || unthrottle)) 4289 return; 4290 4291 raw_spin_lock(&ctx->lock); 4292 4293 list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { 4294 if (!(pmu_ctx->nr_freq || unthrottle)) 4295 continue; 4296 if (!perf_pmu_ctx_is_active(pmu_ctx)) 4297 continue; 4298 if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) 4299 continue; 4300 4301 perf_pmu_disable(pmu_ctx->pmu); 4302 perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); 4303 perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); 4304 perf_pmu_enable(pmu_ctx->pmu); 4305 } 4306 4307 raw_spin_unlock(&ctx->lock); 4308 } 4309 4310 /* 4311 * Move @event to the tail of the @ctx's eligible events. 4312 */ 4313 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) 4314 { 4315 /* 4316 * Rotate the first entry last of non-pinned groups. Rotation might be 4317 * disabled by the inheritance code.
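 * (The inheritance code sets ctx->rotate_disable while it copies the
 * parent's flexible groups into a child context, so that the ordering it
 * observes stays stable.)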
4318 */ 4319 if (ctx->rotate_disable) 4320 return; 4321 4322 perf_event_groups_delete(&ctx->flexible_groups, event); 4323 perf_event_groups_insert(&ctx->flexible_groups, event); 4324 } 4325 4326 /* pick an event from the flexible_groups to rotate */ 4327 static inline struct perf_event * 4328 ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) 4329 { 4330 struct perf_event *event; 4331 struct rb_node *node; 4332 struct rb_root *tree; 4333 struct __group_key key = { 4334 .pmu = pmu_ctx->pmu, 4335 }; 4336 4337 /* pick the first active flexible event */ 4338 event = list_first_entry_or_null(&pmu_ctx->flexible_active, 4339 struct perf_event, active_list); 4340 if (event) 4341 goto out; 4342 4343 /* if no active flexible event, pick the first event */ 4344 tree = &pmu_ctx->ctx->flexible_groups.tree; 4345 4346 if (!pmu_ctx->ctx->task) { 4347 key.cpu = smp_processor_id(); 4348 4349 node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); 4350 if (node) 4351 event = __node_2_pe(node); 4352 goto out; 4353 } 4354 4355 key.cpu = -1; 4356 node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); 4357 if (node) { 4358 event = __node_2_pe(node); 4359 goto out; 4360 } 4361 4362 key.cpu = smp_processor_id(); 4363 node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); 4364 if (node) 4365 event = __node_2_pe(node); 4366 4367 out: 4368 /* 4369 * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() 4370 * finds there are unschedulable events, it will set it again. 4371 */ 4372 pmu_ctx->rotate_necessary = 0; 4373 4374 return event; 4375 } 4376 4377 static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) 4378 { 4379 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 4380 struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; 4381 struct perf_event *cpu_event = NULL, *task_event = NULL; 4382 int cpu_rotate, task_rotate; 4383 struct pmu *pmu; 4384 4385 /* 4386 * Since we run this from IRQ context, nobody can install new 4387 * events, thus the event count values are stable. 4388 */ 4389 4390 cpu_epc = &cpc->epc; 4391 pmu = cpu_epc->pmu; 4392 task_epc = cpc->task_epc; 4393 4394 cpu_rotate = cpu_epc->rotate_necessary; 4395 task_rotate = task_epc ? task_epc->rotate_necessary : 0; 4396 4397 if (!(cpu_rotate || task_rotate)) 4398 return false; 4399 4400 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 4401 perf_pmu_disable(pmu); 4402 4403 if (task_rotate) 4404 task_event = ctx_event_to_rotate(task_epc); 4405 if (cpu_rotate) 4406 cpu_event = ctx_event_to_rotate(cpu_epc); 4407 4408 /* 4409 * As per the order given at ctx_resched() first 'pop' task flexible 4410 * and then, if needed CPU flexible. 
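 *
 * Concretely: the task's flexible events are scheduled out first, then the
 * CPU's flexible events are scheduled out, rotated and scheduled back in,
 * and finally the task's events are rotated and scheduled back in last.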
4411 */ 4412 if (task_event || (task_epc && cpu_event)) { 4413 update_context_time(task_epc->ctx); 4414 __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); 4415 } 4416 4417 if (cpu_event) { 4418 update_context_time(&cpuctx->ctx); 4419 __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); 4420 rotate_ctx(&cpuctx->ctx, cpu_event); 4421 __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); 4422 } 4423 4424 if (task_event) 4425 rotate_ctx(task_epc->ctx, task_event); 4426 4427 if (task_event || (task_epc && cpu_event)) 4428 __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); 4429 4430 perf_pmu_enable(pmu); 4431 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 4432 4433 return true; 4434 } 4435 4436 void perf_event_task_tick(void) 4437 { 4438 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 4439 struct perf_event_context *ctx; 4440 int throttled; 4441 4442 lockdep_assert_irqs_disabled(); 4443 4444 __this_cpu_inc(perf_throttled_seq); 4445 throttled = __this_cpu_xchg(perf_throttled_count, 0); 4446 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 4447 4448 perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); 4449 4450 rcu_read_lock(); 4451 ctx = rcu_dereference(current->perf_event_ctxp); 4452 if (ctx) 4453 perf_adjust_freq_unthr_context(ctx, !!throttled); 4454 rcu_read_unlock(); 4455 } 4456 4457 static int event_enable_on_exec(struct perf_event *event, 4458 struct perf_event_context *ctx) 4459 { 4460 if (!event->attr.enable_on_exec) 4461 return 0; 4462 4463 event->attr.enable_on_exec = 0; 4464 if (event->state >= PERF_EVENT_STATE_INACTIVE) 4465 return 0; 4466 4467 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 4468 4469 return 1; 4470 } 4471 4472 /* 4473 * Enable all of a task's events that have been marked enable-on-exec. 4474 * This expects task == current. 4475 */ 4476 static void perf_event_enable_on_exec(struct perf_event_context *ctx) 4477 { 4478 struct perf_event_context *clone_ctx = NULL; 4479 enum event_type_t event_type = 0; 4480 struct perf_cpu_context *cpuctx; 4481 struct perf_event *event; 4482 unsigned long flags; 4483 int enabled = 0; 4484 4485 local_irq_save(flags); 4486 if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) 4487 goto out; 4488 4489 if (!ctx->nr_events) 4490 goto out; 4491 4492 cpuctx = this_cpu_ptr(&perf_cpu_context); 4493 perf_ctx_lock(cpuctx, ctx); 4494 ctx_time_freeze(cpuctx, ctx); 4495 4496 list_for_each_entry(event, &ctx->event_list, event_entry) { 4497 enabled |= event_enable_on_exec(event, ctx); 4498 event_type |= get_event_type(event); 4499 } 4500 4501 /* 4502 * Unclone and reschedule this context if we enabled any event. 4503 */ 4504 if (enabled) { 4505 clone_ctx = unclone_ctx(ctx); 4506 ctx_resched(cpuctx, ctx, NULL, event_type); 4507 } 4508 perf_ctx_unlock(cpuctx, ctx); 4509 4510 out: 4511 local_irq_restore(flags); 4512 4513 if (clone_ctx) 4514 put_ctx(clone_ctx); 4515 } 4516 4517 static void perf_remove_from_owner(struct perf_event *event); 4518 static void perf_event_exit_event(struct perf_event *event, 4519 struct perf_event_context *ctx); 4520 4521 /* 4522 * Removes all events from the current task that have been marked 4523 * remove-on-exec, and feeds their values back to parent events. 
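 *
 * If any event was removed the context no longer mirrors its parent, so it
 * is uncloned afterwards.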
4524 */ 4525 static void perf_event_remove_on_exec(struct perf_event_context *ctx) 4526 { 4527 struct perf_event_context *clone_ctx = NULL; 4528 struct perf_event *event, *next; 4529 unsigned long flags; 4530 bool modified = false; 4531 4532 mutex_lock(&ctx->mutex); 4533 4534 if (WARN_ON_ONCE(ctx->task != current)) 4535 goto unlock; 4536 4537 list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { 4538 if (!event->attr.remove_on_exec) 4539 continue; 4540 4541 if (!is_kernel_event(event)) 4542 perf_remove_from_owner(event); 4543 4544 modified = true; 4545 4546 perf_event_exit_event(event, ctx); 4547 } 4548 4549 raw_spin_lock_irqsave(&ctx->lock, flags); 4550 if (modified) 4551 clone_ctx = unclone_ctx(ctx); 4552 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4553 4554 unlock: 4555 mutex_unlock(&ctx->mutex); 4556 4557 if (clone_ctx) 4558 put_ctx(clone_ctx); 4559 } 4560 4561 struct perf_read_data { 4562 struct perf_event *event; 4563 bool group; 4564 int ret; 4565 }; 4566 4567 static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); 4568 4569 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) 4570 { 4571 int local_cpu = smp_processor_id(); 4572 u16 local_pkg, event_pkg; 4573 4574 if ((unsigned)event_cpu >= nr_cpu_ids) 4575 return event_cpu; 4576 4577 if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { 4578 const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); 4579 4580 if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) 4581 return local_cpu; 4582 } 4583 4584 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { 4585 event_pkg = topology_physical_package_id(event_cpu); 4586 local_pkg = topology_physical_package_id(local_cpu); 4587 4588 if (event_pkg == local_pkg) 4589 return local_cpu; 4590 } 4591 4592 return event_cpu; 4593 } 4594 4595 /* 4596 * Cross CPU call to read the hardware event 4597 */ 4598 static void __perf_event_read(void *info) 4599 { 4600 struct perf_read_data *data = info; 4601 struct perf_event *sub, *event = data->event; 4602 struct perf_event_context *ctx = event->ctx; 4603 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 4604 struct pmu *pmu = event->pmu; 4605 4606 /* 4607 * If this is a task context, we need to check whether it is 4608 * the current task context of this cpu. If not it has been 4609 * scheduled out before the smp call arrived. In that case 4610 * event->count would have been updated to a recent sample 4611 * when the event was scheduled out. 
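 *
 * If the context is still current on this CPU, the counters are read under
 * ctx->lock; group reads are wrapped in a PERF_PMU_TXN_READ transaction so
 * the PMU can batch the reads.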
4612 */ 4613 if (ctx->task && cpuctx->task_ctx != ctx) 4614 return; 4615 4616 raw_spin_lock(&ctx->lock); 4617 ctx_time_update_event(ctx, event); 4618 4619 perf_event_update_time(event); 4620 if (data->group) 4621 perf_event_update_sibling_time(event); 4622 4623 if (event->state != PERF_EVENT_STATE_ACTIVE) 4624 goto unlock; 4625 4626 if (!data->group) { 4627 pmu->read(event); 4628 data->ret = 0; 4629 goto unlock; 4630 } 4631 4632 pmu->start_txn(pmu, PERF_PMU_TXN_READ); 4633 4634 pmu->read(event); 4635 4636 for_each_sibling_event(sub, event) 4637 perf_pmu_read(sub); 4638 4639 data->ret = pmu->commit_txn(pmu); 4640 4641 unlock: 4642 raw_spin_unlock(&ctx->lock); 4643 } 4644 4645 static inline u64 perf_event_count(struct perf_event *event, bool self) 4646 { 4647 if (self) 4648 return local64_read(&event->count); 4649 4650 return local64_read(&event->count) + atomic64_read(&event->child_count); 4651 } 4652 4653 static void calc_timer_values(struct perf_event *event, 4654 u64 *now, 4655 u64 *enabled, 4656 u64 *running) 4657 { 4658 u64 ctx_time; 4659 4660 *now = perf_clock(); 4661 ctx_time = perf_event_time_now(event, *now); 4662 __perf_update_times(event, ctx_time, enabled, running); 4663 } 4664 4665 /* 4666 * NMI-safe method to read a local event, that is an event that 4667 * is: 4668 * - either for the current task, or for this CPU 4669 * - does not have inherit set, because inherited task events 4670 * will not be local and we cannot read them atomically 4671 * - must not have a pmu::count method 4672 */ 4673 int perf_event_read_local(struct perf_event *event, u64 *value, 4674 u64 *enabled, u64 *running) 4675 { 4676 unsigned long flags; 4677 int event_oncpu; 4678 int event_cpu; 4679 int ret = 0; 4680 4681 /* 4682 * Disabling interrupts avoids all counter scheduling (context 4683 * switches, timer based rotation and IPIs). 4684 */ 4685 local_irq_save(flags); 4686 4687 /* 4688 * It must not be an event with inherit set; we cannot read 4689 * all child counters from atomic context. 4690 */ 4691 if (event->attr.inherit) { 4692 ret = -EOPNOTSUPP; 4693 goto out; 4694 } 4695 4696 /* If this is a per-task event, it must be for current */ 4697 if ((event->attach_state & PERF_ATTACH_TASK) && 4698 event->hw.target != current) { 4699 ret = -EINVAL; 4700 goto out; 4701 } 4702 4703 /* 4704 * Get the event CPU numbers, and adjust them to local if the event is 4705 * a per-package event that can be read locally. 4706 */ 4707 event_oncpu = __perf_event_read_cpu(event, event->oncpu); 4708 event_cpu = __perf_event_read_cpu(event, event->cpu); 4709 4710 /* If this is a per-CPU event, it must be for this CPU */ 4711 if (!(event->attach_state & PERF_ATTACH_TASK) && 4712 event_cpu != smp_processor_id()) { 4713 ret = -EINVAL; 4714 goto out; 4715 } 4716 4717 /* If this is a pinned event it must be running on this CPU */ 4718 if (event->attr.pinned && event_oncpu != smp_processor_id()) { 4719 ret = -EBUSY; 4720 goto out; 4721 } 4722 4723 /* 4724 * If the event is currently on this CPU, it's either a per-task event, 4725 * or local to this CPU. Furthermore it means it's ACTIVE (otherwise 4726 * oncpu == -1).
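 * If it is not on this CPU, event->count still holds the value saved the
 * last time the event stopped counting, which is the best we can do here.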
4727 */ 4728 if (event_oncpu == smp_processor_id()) 4729 event->pmu->read(event); 4730 4731 *value = local64_read(&event->count); 4732 if (enabled || running) { 4733 u64 __enabled, __running, __now; 4734 4735 calc_timer_values(event, &__now, &__enabled, &__running); 4736 if (enabled) 4737 *enabled = __enabled; 4738 if (running) 4739 *running = __running; 4740 } 4741 out: 4742 local_irq_restore(flags); 4743 4744 return ret; 4745 } 4746 4747 static int perf_event_read(struct perf_event *event, bool group) 4748 { 4749 enum perf_event_state state = READ_ONCE(event->state); 4750 int event_cpu, ret = 0; 4751 4752 /* 4753 * If event is enabled and currently active on a CPU, update the 4754 * value in the event structure: 4755 */ 4756 again: 4757 if (state == PERF_EVENT_STATE_ACTIVE) { 4758 struct perf_read_data data; 4759 4760 /* 4761 * Orders the ->state and ->oncpu loads such that if we see 4762 * ACTIVE we must also see the right ->oncpu. 4763 * 4764 * Matches the smp_wmb() from event_sched_in(). 4765 */ 4766 smp_rmb(); 4767 4768 event_cpu = READ_ONCE(event->oncpu); 4769 if ((unsigned)event_cpu >= nr_cpu_ids) 4770 return 0; 4771 4772 data = (struct perf_read_data){ 4773 .event = event, 4774 .group = group, 4775 .ret = 0, 4776 }; 4777 4778 preempt_disable(); 4779 event_cpu = __perf_event_read_cpu(event, event_cpu); 4780 4781 /* 4782 * Purposely ignore the smp_call_function_single() return 4783 * value. 4784 * 4785 * If event_cpu isn't a valid CPU it means the event got 4786 * scheduled out and that will have updated the event count. 4787 * 4788 * Therefore, either way, we'll have an up-to-date event count 4789 * after this. 4790 */ 4791 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); 4792 preempt_enable(); 4793 ret = data.ret; 4794 4795 } else if (state == PERF_EVENT_STATE_INACTIVE) { 4796 struct perf_event_context *ctx = event->ctx; 4797 unsigned long flags; 4798 4799 raw_spin_lock_irqsave(&ctx->lock, flags); 4800 state = event->state; 4801 if (state != PERF_EVENT_STATE_INACTIVE) { 4802 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4803 goto again; 4804 } 4805 4806 /* 4807 * May read while context is not active (e.g., thread is 4808 * blocked), in that case we cannot update context time 4809 */ 4810 ctx_time_update_event(ctx, event); 4811 4812 perf_event_update_time(event); 4813 if (group) 4814 perf_event_update_sibling_time(event); 4815 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4816 } 4817 4818 return ret; 4819 } 4820 4821 /* 4822 * Initialize the perf_event context in a task_struct: 4823 */ 4824 static void __perf_event_init_context(struct perf_event_context *ctx) 4825 { 4826 raw_spin_lock_init(&ctx->lock); 4827 mutex_init(&ctx->mutex); 4828 INIT_LIST_HEAD(&ctx->pmu_ctx_list); 4829 perf_event_groups_init(&ctx->pinned_groups); 4830 perf_event_groups_init(&ctx->flexible_groups); 4831 INIT_LIST_HEAD(&ctx->event_list); 4832 refcount_set(&ctx->refcount, 1); 4833 } 4834 4835 static void 4836 __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) 4837 { 4838 epc->pmu = pmu; 4839 INIT_LIST_HEAD(&epc->pmu_ctx_entry); 4840 INIT_LIST_HEAD(&epc->pinned_active); 4841 INIT_LIST_HEAD(&epc->flexible_active); 4842 atomic_set(&epc->refcount, 1); 4843 } 4844 4845 static struct perf_event_context * 4846 alloc_perf_context(struct task_struct *task) 4847 { 4848 struct perf_event_context *ctx; 4849 4850 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 4851 if (!ctx) 4852 return NULL; 4853 4854 __perf_event_init_context(ctx); 4855 if (task) 4856 
ctx->task = get_task_struct(task); 4857 4858 return ctx; 4859 } 4860 4861 static struct task_struct * 4862 find_lively_task_by_vpid(pid_t vpid) 4863 { 4864 struct task_struct *task; 4865 4866 rcu_read_lock(); 4867 if (!vpid) 4868 task = current; 4869 else 4870 task = find_task_by_vpid(vpid); 4871 if (task) 4872 get_task_struct(task); 4873 rcu_read_unlock(); 4874 4875 if (!task) 4876 return ERR_PTR(-ESRCH); 4877 4878 return task; 4879 } 4880 4881 /* 4882 * Returns a matching context with refcount and pincount. 4883 */ 4884 static struct perf_event_context * 4885 find_get_context(struct task_struct *task, struct perf_event *event) 4886 { 4887 struct perf_event_context *ctx, *clone_ctx = NULL; 4888 struct perf_cpu_context *cpuctx; 4889 unsigned long flags; 4890 int err; 4891 4892 if (!task) { 4893 /* Must be root to operate on a CPU event: */ 4894 err = perf_allow_cpu(); 4895 if (err) 4896 return ERR_PTR(err); 4897 4898 cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); 4899 ctx = &cpuctx->ctx; 4900 get_ctx(ctx); 4901 raw_spin_lock_irqsave(&ctx->lock, flags); 4902 ++ctx->pin_count; 4903 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4904 4905 return ctx; 4906 } 4907 4908 err = -EINVAL; 4909 retry: 4910 ctx = perf_lock_task_context(task, &flags); 4911 if (ctx) { 4912 clone_ctx = unclone_ctx(ctx); 4913 ++ctx->pin_count; 4914 4915 raw_spin_unlock_irqrestore(&ctx->lock, flags); 4916 4917 if (clone_ctx) 4918 put_ctx(clone_ctx); 4919 } else { 4920 ctx = alloc_perf_context(task); 4921 err = -ENOMEM; 4922 if (!ctx) 4923 goto errout; 4924 4925 err = 0; 4926 mutex_lock(&task->perf_event_mutex); 4927 /* 4928 * If it has already passed perf_event_exit_task(). 4929 * we must see PF_EXITING, it takes this mutex too. 4930 */ 4931 if (task->flags & PF_EXITING) 4932 err = -ESRCH; 4933 else if (task->perf_event_ctxp) 4934 err = -EAGAIN; 4935 else { 4936 get_ctx(ctx); 4937 ++ctx->pin_count; 4938 rcu_assign_pointer(task->perf_event_ctxp, ctx); 4939 } 4940 mutex_unlock(&task->perf_event_mutex); 4941 4942 if (unlikely(err)) { 4943 put_ctx(ctx); 4944 4945 if (err == -EAGAIN) 4946 goto retry; 4947 goto errout; 4948 } 4949 } 4950 4951 return ctx; 4952 4953 errout: 4954 return ERR_PTR(err); 4955 } 4956 4957 static struct perf_event_pmu_context * 4958 find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, 4959 struct perf_event *event) 4960 { 4961 struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; 4962 4963 if (!ctx->task) { 4964 /* 4965 * perf_pmu_migrate_context() / __perf_pmu_install_event() 4966 * relies on the fact that find_get_pmu_context() cannot fail 4967 * for CPU contexts. 4968 */ 4969 struct perf_cpu_pmu_context *cpc; 4970 4971 cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); 4972 epc = &cpc->epc; 4973 raw_spin_lock_irq(&ctx->lock); 4974 if (!epc->ctx) { 4975 /* 4976 * One extra reference for the pmu; see perf_pmu_free(). 4977 */ 4978 atomic_set(&epc->refcount, 2); 4979 epc->embedded = 1; 4980 list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); 4981 epc->ctx = ctx; 4982 } else { 4983 WARN_ON_ONCE(epc->ctx != ctx); 4984 atomic_inc(&epc->refcount); 4985 } 4986 raw_spin_unlock_irq(&ctx->lock); 4987 return epc; 4988 } 4989 4990 new = kzalloc(sizeof(*epc), GFP_KERNEL); 4991 if (!new) 4992 return ERR_PTR(-ENOMEM); 4993 4994 __perf_init_event_pmu_context(new, pmu); 4995 4996 /* 4997 * XXX 4998 * 4999 * lockdep_assert_held(&ctx->mutex); 5000 * 5001 * can't because perf_event_init_task() doesn't actually hold the 5002 * child_ctx->mutex. 
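 *
 * The ctx->lock taken below is what serializes modification of
 * ctx->pmu_ctx_list, so the walk and insertion are still safe.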
5003 */ 5004 5005 raw_spin_lock_irq(&ctx->lock); 5006 list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { 5007 if (epc->pmu == pmu) { 5008 WARN_ON_ONCE(epc->ctx != ctx); 5009 atomic_inc(&epc->refcount); 5010 goto found_epc; 5011 } 5012 /* Make sure the pmu_ctx_list is sorted by PMU type: */ 5013 if (!pos && epc->pmu->type > pmu->type) 5014 pos = epc; 5015 } 5016 5017 epc = new; 5018 new = NULL; 5019 5020 if (!pos) 5021 list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); 5022 else 5023 list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); 5024 5025 epc->ctx = ctx; 5026 5027 found_epc: 5028 raw_spin_unlock_irq(&ctx->lock); 5029 kfree(new); 5030 5031 return epc; 5032 } 5033 5034 static void get_pmu_ctx(struct perf_event_pmu_context *epc) 5035 { 5036 WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); 5037 } 5038 5039 static void free_cpc_rcu(struct rcu_head *head) 5040 { 5041 struct perf_cpu_pmu_context *cpc = 5042 container_of(head, typeof(*cpc), epc.rcu_head); 5043 5044 kfree(cpc); 5045 } 5046 5047 static void free_epc_rcu(struct rcu_head *head) 5048 { 5049 struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); 5050 5051 kfree(epc); 5052 } 5053 5054 static void put_pmu_ctx(struct perf_event_pmu_context *epc) 5055 { 5056 struct perf_event_context *ctx = epc->ctx; 5057 unsigned long flags; 5058 5059 /* 5060 * XXX 5061 * 5062 * lockdep_assert_held(&ctx->mutex); 5063 * 5064 * can't because of the call-site in _free_event()/put_event() 5065 * which isn't always called under ctx->mutex. 5066 */ 5067 if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) 5068 return; 5069 5070 WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); 5071 5072 list_del_init(&epc->pmu_ctx_entry); 5073 epc->ctx = NULL; 5074 5075 WARN_ON_ONCE(!list_empty(&epc->pinned_active)); 5076 WARN_ON_ONCE(!list_empty(&epc->flexible_active)); 5077 5078 raw_spin_unlock_irqrestore(&ctx->lock, flags); 5079 5080 if (epc->embedded) { 5081 call_rcu(&epc->rcu_head, free_cpc_rcu); 5082 return; 5083 } 5084 5085 call_rcu(&epc->rcu_head, free_epc_rcu); 5086 } 5087 5088 static void perf_event_free_filter(struct perf_event *event); 5089 5090 static void free_event_rcu(struct rcu_head *head) 5091 { 5092 struct perf_event *event = container_of(head, typeof(*event), rcu_head); 5093 5094 if (event->ns) 5095 put_pid_ns(event->ns); 5096 perf_event_free_filter(event); 5097 kmem_cache_free(perf_event_cache, event); 5098 } 5099 5100 static void ring_buffer_attach(struct perf_event *event, 5101 struct perf_buffer *rb); 5102 5103 static void detach_sb_event(struct perf_event *event) 5104 { 5105 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); 5106 5107 raw_spin_lock(&pel->lock); 5108 list_del_rcu(&event->sb_list); 5109 raw_spin_unlock(&pel->lock); 5110 } 5111 5112 static bool is_sb_event(struct perf_event *event) 5113 { 5114 struct perf_event_attr *attr = &event->attr; 5115 5116 if (event->parent) 5117 return false; 5118 5119 if (event->attach_state & PERF_ATTACH_TASK) 5120 return false; 5121 5122 if (attr->mmap || attr->mmap_data || attr->mmap2 || 5123 attr->comm || attr->comm_exec || 5124 attr->task || attr->ksymbol || 5125 attr->context_switch || attr->text_poke || 5126 attr->bpf_event) 5127 return true; 5128 return false; 5129 } 5130 5131 static void unaccount_pmu_sb_event(struct perf_event *event) 5132 { 5133 if (is_sb_event(event)) 5134 detach_sb_event(event); 5135 } 5136 5137 #ifdef CONFIG_NO_HZ_FULL 5138 static DEFINE_SPINLOCK(nr_freq_lock); 5139 #endif 5140 5141 static void 
unaccount_freq_event_nohz(void) 5142 { 5143 #ifdef CONFIG_NO_HZ_FULL 5144 spin_lock(&nr_freq_lock); 5145 if (atomic_dec_and_test(&nr_freq_events)) 5146 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); 5147 spin_unlock(&nr_freq_lock); 5148 #endif 5149 } 5150 5151 static void unaccount_freq_event(void) 5152 { 5153 if (tick_nohz_full_enabled()) 5154 unaccount_freq_event_nohz(); 5155 else 5156 atomic_dec(&nr_freq_events); 5157 } 5158 5159 5160 static struct perf_ctx_data * 5161 alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global) 5162 { 5163 struct perf_ctx_data *cd; 5164 5165 cd = kzalloc(sizeof(*cd), GFP_KERNEL); 5166 if (!cd) 5167 return NULL; 5168 5169 cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); 5170 if (!cd->data) { 5171 kfree(cd); 5172 return NULL; 5173 } 5174 5175 cd->global = global; 5176 cd->ctx_cache = ctx_cache; 5177 refcount_set(&cd->refcount, 1); 5178 5179 return cd; 5180 } 5181 5182 static void free_perf_ctx_data(struct perf_ctx_data *cd) 5183 { 5184 kmem_cache_free(cd->ctx_cache, cd->data); 5185 kfree(cd); 5186 } 5187 5188 static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head) 5189 { 5190 struct perf_ctx_data *cd; 5191 5192 cd = container_of(rcu_head, struct perf_ctx_data, rcu_head); 5193 free_perf_ctx_data(cd); 5194 } 5195 5196 static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) 5197 { 5198 call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu); 5199 } 5200 5201 static int 5202 attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, 5203 bool global) 5204 { 5205 struct perf_ctx_data *cd, *old = NULL; 5206 5207 cd = alloc_perf_ctx_data(ctx_cache, global); 5208 if (!cd) 5209 return -ENOMEM; 5210 5211 for (;;) { 5212 if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { 5213 if (old) 5214 perf_free_ctx_data_rcu(old); 5215 return 0; 5216 } 5217 5218 if (!old) { 5219 /* 5220 * After seeing a dead @old, we raced with 5221 * removal and lost, try again to install @cd. 5222 */ 5223 continue; 5224 } 5225 5226 if (refcount_inc_not_zero(&old->refcount)) { 5227 free_perf_ctx_data(cd); /* unused */ 5228 return 0; 5229 } 5230 5231 /* 5232 * @old is a dead object, refcount==0 is stable, try and 5233 * replace it with @cd. 
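 * A failed try_cmpxchg() above has updated @old to the current pointer, so
 * the next loop iteration simply retries; if the dead object is still
 * installed, @cd replaces it and the dead object is freed via RCU.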
5234 */ 5235 } 5236 return 0; 5237 } 5238 5239 static void __detach_global_ctx_data(void); 5240 DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem); 5241 static refcount_t global_ctx_data_ref; 5242 5243 static int 5244 attach_global_ctx_data(struct kmem_cache *ctx_cache) 5245 { 5246 struct task_struct *g, *p; 5247 struct perf_ctx_data *cd; 5248 int ret; 5249 5250 if (refcount_inc_not_zero(&global_ctx_data_ref)) 5251 return 0; 5252 5253 guard(percpu_write)(&global_ctx_data_rwsem); 5254 if (refcount_inc_not_zero(&global_ctx_data_ref)) 5255 return 0; 5256 again: 5257 /* Allocate everything */ 5258 scoped_guard (rcu) { 5259 for_each_process_thread(g, p) { 5260 cd = rcu_dereference(p->perf_ctx_data); 5261 if (cd && !cd->global) { 5262 cd->global = 1; 5263 if (!refcount_inc_not_zero(&cd->refcount)) 5264 cd = NULL; 5265 } 5266 if (!cd) { 5267 get_task_struct(p); 5268 goto alloc; 5269 } 5270 } 5271 } 5272 5273 refcount_set(&global_ctx_data_ref, 1); 5274 5275 return 0; 5276 alloc: 5277 ret = attach_task_ctx_data(p, ctx_cache, true); 5278 put_task_struct(p); 5279 if (ret) { 5280 __detach_global_ctx_data(); 5281 return ret; 5282 } 5283 goto again; 5284 } 5285 5286 static int 5287 attach_perf_ctx_data(struct perf_event *event) 5288 { 5289 struct task_struct *task = event->hw.target; 5290 struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache; 5291 int ret; 5292 5293 if (!ctx_cache) 5294 return -ENOMEM; 5295 5296 if (task) 5297 return attach_task_ctx_data(task, ctx_cache, false); 5298 5299 ret = attach_global_ctx_data(ctx_cache); 5300 if (ret) 5301 return ret; 5302 5303 event->attach_state |= PERF_ATTACH_GLOBAL_DATA; 5304 return 0; 5305 } 5306 5307 static void 5308 detach_task_ctx_data(struct task_struct *p) 5309 { 5310 struct perf_ctx_data *cd; 5311 5312 scoped_guard (rcu) { 5313 cd = rcu_dereference(p->perf_ctx_data); 5314 if (!cd || !refcount_dec_and_test(&cd->refcount)) 5315 return; 5316 } 5317 5318 /* 5319 * The old ctx_data may be lost because of the race. 5320 * Nothing is required to do for the case. 5321 * See attach_task_ctx_data(). 
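 * (If the cmpxchg below fails, a concurrent attach already replaced the
 * dead object and takes care of freeing it.)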
5322 */ 5323 if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL)) 5324 perf_free_ctx_data_rcu(cd); 5325 } 5326 5327 static void __detach_global_ctx_data(void) 5328 { 5329 struct task_struct *g, *p; 5330 struct perf_ctx_data *cd; 5331 5332 again: 5333 scoped_guard (rcu) { 5334 for_each_process_thread(g, p) { 5335 cd = rcu_dereference(p->perf_ctx_data); 5336 if (!cd || !cd->global) 5337 continue; 5338 cd->global = 0; 5339 get_task_struct(p); 5340 goto detach; 5341 } 5342 } 5343 return; 5344 detach: 5345 detach_task_ctx_data(p); 5346 put_task_struct(p); 5347 goto again; 5348 } 5349 5350 static void detach_global_ctx_data(void) 5351 { 5352 if (refcount_dec_not_one(&global_ctx_data_ref)) 5353 return; 5354 5355 guard(percpu_write)(&global_ctx_data_rwsem); 5356 if (!refcount_dec_and_test(&global_ctx_data_ref)) 5357 return; 5358 5359 /* remove everything */ 5360 __detach_global_ctx_data(); 5361 } 5362 5363 static void detach_perf_ctx_data(struct perf_event *event) 5364 { 5365 struct task_struct *task = event->hw.target; 5366 5367 event->attach_state &= ~PERF_ATTACH_TASK_DATA; 5368 5369 if (task) 5370 return detach_task_ctx_data(task); 5371 5372 if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) { 5373 detach_global_ctx_data(); 5374 event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA; 5375 } 5376 } 5377 5378 static void unaccount_event(struct perf_event *event) 5379 { 5380 bool dec = false; 5381 5382 if (event->parent) 5383 return; 5384 5385 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) 5386 dec = true; 5387 if (event->attr.mmap || event->attr.mmap_data) 5388 atomic_dec(&nr_mmap_events); 5389 if (event->attr.build_id) 5390 atomic_dec(&nr_build_id_events); 5391 if (event->attr.comm) 5392 atomic_dec(&nr_comm_events); 5393 if (event->attr.namespaces) 5394 atomic_dec(&nr_namespaces_events); 5395 if (event->attr.cgroup) 5396 atomic_dec(&nr_cgroup_events); 5397 if (event->attr.task) 5398 atomic_dec(&nr_task_events); 5399 if (event->attr.freq) 5400 unaccount_freq_event(); 5401 if (event->attr.context_switch) { 5402 dec = true; 5403 atomic_dec(&nr_switch_events); 5404 } 5405 if (is_cgroup_event(event)) 5406 dec = true; 5407 if (has_branch_stack(event)) 5408 dec = true; 5409 if (event->attr.ksymbol) 5410 atomic_dec(&nr_ksymbol_events); 5411 if (event->attr.bpf_event) 5412 atomic_dec(&nr_bpf_events); 5413 if (event->attr.text_poke) 5414 atomic_dec(&nr_text_poke_events); 5415 5416 if (dec) { 5417 if (!atomic_add_unless(&perf_sched_count, -1, 1)) 5418 schedule_delayed_work(&perf_sched_work, HZ); 5419 } 5420 5421 unaccount_pmu_sb_event(event); 5422 } 5423 5424 static void perf_sched_delayed(struct work_struct *work) 5425 { 5426 mutex_lock(&perf_sched_mutex); 5427 if (atomic_dec_and_test(&perf_sched_count)) 5428 static_branch_disable(&perf_sched_events); 5429 mutex_unlock(&perf_sched_mutex); 5430 } 5431 5432 /* 5433 * The following implement mutual exclusion of events on "exclusive" pmus 5434 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled 5435 * at a time, so we disallow creating events that might conflict, namely: 5436 * 5437 * 1) cpu-wide events in the presence of per-task events, 5438 * 2) per-task events in the presence of cpu-wide events, 5439 * 3) two matching events on the same perf_event_context. 5440 * 5441 * The former two cases are handled in the allocation path (perf_event_alloc(), 5442 * _free_event()), the latter -- before the first perf_install_in_context(). 
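 *
 * Cases 1) and 2) are tracked via pmu::exclusive_cnt in
 * exclusive_event_init(), case 3) is checked by
 * exclusive_event_installable() against the target context's event list.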
5443 */ 5444 static int exclusive_event_init(struct perf_event *event) 5445 { 5446 struct pmu *pmu = event->pmu; 5447 5448 if (!is_exclusive_pmu(pmu)) 5449 return 0; 5450 5451 /* 5452 * Prevent co-existence of per-task and cpu-wide events on the 5453 * same exclusive pmu. 5454 * 5455 * Negative pmu::exclusive_cnt means there are cpu-wide 5456 * events on this "exclusive" pmu, positive means there are 5457 * per-task events. 5458 * 5459 * Since this is called in perf_event_alloc() path, event::ctx 5460 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK 5461 * to mean "per-task event", because unlike other attach states it 5462 * never gets cleared. 5463 */ 5464 if (event->attach_state & PERF_ATTACH_TASK) { 5465 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) 5466 return -EBUSY; 5467 } else { 5468 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) 5469 return -EBUSY; 5470 } 5471 5472 event->attach_state |= PERF_ATTACH_EXCLUSIVE; 5473 5474 return 0; 5475 } 5476 5477 static void exclusive_event_destroy(struct perf_event *event) 5478 { 5479 struct pmu *pmu = event->pmu; 5480 5481 /* see comment in exclusive_event_init() */ 5482 if (event->attach_state & PERF_ATTACH_TASK) 5483 atomic_dec(&pmu->exclusive_cnt); 5484 else 5485 atomic_inc(&pmu->exclusive_cnt); 5486 5487 event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; 5488 } 5489 5490 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) 5491 { 5492 if ((e1->pmu == e2->pmu) && 5493 (e1->cpu == e2->cpu || 5494 e1->cpu == -1 || 5495 e2->cpu == -1)) 5496 return true; 5497 return false; 5498 } 5499 5500 static bool exclusive_event_installable(struct perf_event *event, 5501 struct perf_event_context *ctx) 5502 { 5503 struct perf_event *iter_event; 5504 struct pmu *pmu = event->pmu; 5505 5506 lockdep_assert_held(&ctx->mutex); 5507 5508 if (!is_exclusive_pmu(pmu)) 5509 return true; 5510 5511 list_for_each_entry(iter_event, &ctx->event_list, event_entry) { 5512 if (exclusive_event_match(iter_event, event)) 5513 return false; 5514 } 5515 5516 return true; 5517 } 5518 5519 static void perf_free_addr_filters(struct perf_event *event); 5520 5521 /* vs perf_event_alloc() error */ 5522 static void __free_event(struct perf_event *event) 5523 { 5524 if (event->attach_state & PERF_ATTACH_CALLCHAIN) 5525 put_callchain_buffers(); 5526 5527 kfree(event->addr_filter_ranges); 5528 5529 if (event->attach_state & PERF_ATTACH_EXCLUSIVE) 5530 exclusive_event_destroy(event); 5531 5532 if (is_cgroup_event(event)) 5533 perf_detach_cgroup(event); 5534 5535 if (event->attach_state & PERF_ATTACH_TASK_DATA) 5536 detach_perf_ctx_data(event); 5537 5538 if (event->destroy) 5539 event->destroy(event); 5540 5541 /* 5542 * Must be after ->destroy(), due to uprobe_perf_close() using 5543 * hw.target. 5544 */ 5545 if (event->hw.target) 5546 put_task_struct(event->hw.target); 5547 5548 if (event->pmu_ctx) { 5549 /* 5550 * put_pmu_ctx() needs an event->ctx reference, because of 5551 * epc->ctx. 5552 */ 5553 WARN_ON_ONCE(!event->ctx); 5554 WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); 5555 put_pmu_ctx(event->pmu_ctx); 5556 } 5557 5558 /* 5559 * perf_event_free_task() relies on put_ctx() being 'last', in 5560 * particular all task references must be cleaned up. 
5561 */ 5562 if (event->ctx) 5563 put_ctx(event->ctx); 5564 5565 if (event->pmu) 5566 module_put(event->pmu->module); 5567 5568 call_rcu(&event->rcu_head, free_event_rcu); 5569 } 5570 5571 DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) 5572 5573 /* vs perf_event_alloc() success */ 5574 static void _free_event(struct perf_event *event) 5575 { 5576 irq_work_sync(&event->pending_irq); 5577 irq_work_sync(&event->pending_disable_irq); 5578 5579 unaccount_event(event); 5580 5581 security_perf_event_free(event); 5582 5583 if (event->rb) { 5584 /* 5585 * Can happen when we close an event with re-directed output. 5586 * 5587 * Since we have a 0 refcount, perf_mmap_close() will skip 5588 * over us; possibly making our ring_buffer_put() the last. 5589 */ 5590 mutex_lock(&event->mmap_mutex); 5591 ring_buffer_attach(event, NULL); 5592 mutex_unlock(&event->mmap_mutex); 5593 } 5594 5595 perf_event_free_bpf_prog(event); 5596 perf_free_addr_filters(event); 5597 5598 __free_event(event); 5599 } 5600 5601 /* 5602 * Used to free events which have a known refcount of 1, such as in error paths 5603 * where the event isn't exposed yet and inherited events. 5604 */ 5605 static void free_event(struct perf_event *event) 5606 { 5607 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, 5608 "unexpected event refcount: %ld; ptr=%p\n", 5609 atomic_long_read(&event->refcount), event)) { 5610 /* leak to avoid use-after-free */ 5611 return; 5612 } 5613 5614 _free_event(event); 5615 } 5616 5617 /* 5618 * Remove user event from the owner task. 5619 */ 5620 static void perf_remove_from_owner(struct perf_event *event) 5621 { 5622 struct task_struct *owner; 5623 5624 rcu_read_lock(); 5625 /* 5626 * Matches the smp_store_release() in perf_event_exit_task(). If we 5627 * observe !owner it means the list deletion is complete and we can 5628 * indeed free this event, otherwise we need to serialize on 5629 * owner->perf_event_mutex. 5630 */ 5631 owner = READ_ONCE(event->owner); 5632 if (owner) { 5633 /* 5634 * Since delayed_put_task_struct() also drops the last 5635 * task reference we can safely take a new reference 5636 * while holding the rcu_read_lock(). 5637 */ 5638 get_task_struct(owner); 5639 } 5640 rcu_read_unlock(); 5641 5642 if (owner) { 5643 /* 5644 * If we're here through perf_event_exit_task() we're already 5645 * holding ctx->mutex which would be an inversion wrt. the 5646 * normal lock order. 5647 * 5648 * However we can safely take this lock because its the child 5649 * ctx->mutex. 5650 */ 5651 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); 5652 5653 /* 5654 * We have to re-check the event->owner field, if it is cleared 5655 * we raced with perf_event_exit_task(), acquiring the mutex 5656 * ensured they're done, and we can proceed with freeing the 5657 * event. 5658 */ 5659 if (event->owner) { 5660 list_del_init(&event->owner_entry); 5661 smp_store_release(&event->owner, NULL); 5662 } 5663 mutex_unlock(&owner->perf_event_mutex); 5664 put_task_struct(owner); 5665 } 5666 } 5667 5668 static void put_event(struct perf_event *event) 5669 { 5670 struct perf_event *parent; 5671 5672 if (!atomic_long_dec_and_test(&event->refcount)) 5673 return; 5674 5675 parent = event->parent; 5676 _free_event(event); 5677 5678 /* Matches the refcount bump in inherit_event() */ 5679 if (parent) 5680 put_event(parent); 5681 } 5682 5683 /* 5684 * Kill an event dead; while event:refcount will preserve the event 5685 * object, it will not preserve its functionality. 
Once the last 'user' 5686 * gives up the object, we'll destroy the thing. 5687 */ 5688 int perf_event_release_kernel(struct perf_event *event) 5689 { 5690 struct perf_event_context *ctx = event->ctx; 5691 struct perf_event *child, *tmp; 5692 LIST_HEAD(free_list); 5693 5694 /* 5695 * If we got here through err_alloc: free_event(event); we will not 5696 * have attached to a context yet. 5697 */ 5698 if (!ctx) { 5699 WARN_ON_ONCE(event->attach_state & 5700 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); 5701 goto no_ctx; 5702 } 5703 5704 if (!is_kernel_event(event)) 5705 perf_remove_from_owner(event); 5706 5707 ctx = perf_event_ctx_lock(event); 5708 WARN_ON_ONCE(ctx->parent_ctx); 5709 5710 /* 5711 * Mark this event as STATE_DEAD, there is no external reference to it 5712 * anymore. 5713 * 5714 * Anybody acquiring event->child_mutex after the below loop _must_ 5715 * also see this, most importantly inherit_event() which will avoid 5716 * placing more children on the list. 5717 * 5718 * Thus this guarantees that we will in fact observe and kill _ALL_ 5719 * child events. 5720 */ 5721 perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); 5722 5723 perf_event_ctx_unlock(event, ctx); 5724 5725 again: 5726 mutex_lock(&event->child_mutex); 5727 list_for_each_entry(child, &event->child_list, child_list) { 5728 void *var = NULL; 5729 5730 /* 5731 * Cannot change, child events are not migrated, see the 5732 * comment with perf_event_ctx_lock_nested(). 5733 */ 5734 ctx = READ_ONCE(child->ctx); 5735 /* 5736 * Since child_mutex nests inside ctx::mutex, we must jump 5737 * through hoops. We start by grabbing a reference on the ctx. 5738 * 5739 * Since the event cannot get freed while we hold the 5740 * child_mutex, the context must also exist and have a !0 5741 * reference count. 5742 */ 5743 get_ctx(ctx); 5744 5745 /* 5746 * Now that we have a ctx ref, we can drop child_mutex, and 5747 * acquire ctx::mutex without fear of it going away. Then we 5748 * can re-acquire child_mutex. 5749 */ 5750 mutex_unlock(&event->child_mutex); 5751 mutex_lock(&ctx->mutex); 5752 mutex_lock(&event->child_mutex); 5753 5754 /* 5755 * Now that we hold ctx::mutex and child_mutex, revalidate our 5756 * state, if child is still the first entry, it didn't get freed 5757 * and we can continue doing so. 5758 */ 5759 tmp = list_first_entry_or_null(&event->child_list, 5760 struct perf_event, child_list); 5761 if (tmp == child) { 5762 perf_remove_from_context(child, DETACH_GROUP); 5763 list_move(&child->child_list, &free_list); 5764 } else { 5765 var = &ctx->refcount; 5766 } 5767 5768 mutex_unlock(&event->child_mutex); 5769 mutex_unlock(&ctx->mutex); 5770 put_ctx(ctx); 5771 5772 if (var) { 5773 /* 5774 * If perf_event_free_task() has deleted all events from the 5775 * ctx while the child_mutex got released above, make sure to 5776 * notify about the preceding put_ctx(). 5777 */ 5778 smp_mb(); /* pairs with wait_var_event() */ 5779 wake_up_var(var); 5780 } 5781 goto again; 5782 } 5783 mutex_unlock(&event->child_mutex); 5784 5785 list_for_each_entry_safe(child, tmp, &free_list, child_list) { 5786 void *var = &child->ctx->refcount; 5787 5788 list_del(&child->child_list); 5789 /* Last reference unless ->pending_task work is pending */ 5790 put_event(child); 5791 5792 /* 5793 * Wake any perf_event_free_task() waiting for this event to be 5794 * freed. 
5795 */ 5796 smp_mb(); /* pairs with wait_var_event() */ 5797 wake_up_var(var); 5798 } 5799 5800 no_ctx: 5801 /* 5802 * Last reference unless ->pending_task work is pending on this event 5803 * or any of its children. 5804 */ 5805 put_event(event); 5806 return 0; 5807 } 5808 EXPORT_SYMBOL_GPL(perf_event_release_kernel); 5809 5810 /* 5811 * Called when the last reference to the file is gone. 5812 */ 5813 static int perf_release(struct inode *inode, struct file *file) 5814 { 5815 perf_event_release_kernel(file->private_data); 5816 return 0; 5817 } 5818 5819 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 5820 { 5821 struct perf_event *child; 5822 u64 total = 0; 5823 5824 *enabled = 0; 5825 *running = 0; 5826 5827 mutex_lock(&event->child_mutex); 5828 5829 (void)perf_event_read(event, false); 5830 total += perf_event_count(event, false); 5831 5832 *enabled += event->total_time_enabled + 5833 atomic64_read(&event->child_total_time_enabled); 5834 *running += event->total_time_running + 5835 atomic64_read(&event->child_total_time_running); 5836 5837 list_for_each_entry(child, &event->child_list, child_list) { 5838 (void)perf_event_read(child, false); 5839 total += perf_event_count(child, false); 5840 *enabled += child->total_time_enabled; 5841 *running += child->total_time_running; 5842 } 5843 mutex_unlock(&event->child_mutex); 5844 5845 return total; 5846 } 5847 5848 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 5849 { 5850 struct perf_event_context *ctx; 5851 u64 count; 5852 5853 ctx = perf_event_ctx_lock(event); 5854 count = __perf_event_read_value(event, enabled, running); 5855 perf_event_ctx_unlock(event, ctx); 5856 5857 return count; 5858 } 5859 EXPORT_SYMBOL_GPL(perf_event_read_value); 5860 5861 static int __perf_read_group_add(struct perf_event *leader, 5862 u64 read_format, u64 *values) 5863 { 5864 struct perf_event_context *ctx = leader->ctx; 5865 struct perf_event *sub, *parent; 5866 unsigned long flags; 5867 int n = 1; /* skip @nr */ 5868 int ret; 5869 5870 ret = perf_event_read(leader, true); 5871 if (ret) 5872 return ret; 5873 5874 raw_spin_lock_irqsave(&ctx->lock, flags); 5875 /* 5876 * Verify the grouping between the parent and child (inherited) 5877 * events is still intact. 5878 * 5879 * Specifically: 5880 * - leader->ctx->lock pins leader->sibling_list 5881 * - parent->child_mutex pins parent->child_list 5882 * - parent->ctx->mutex pins parent->sibling_list 5883 * 5884 * Because parent->ctx != leader->ctx (and child_list nests inside 5885 * ctx->mutex), group destruction is not atomic between children, also 5886 * see perf_event_release_kernel(). Additionally, parent can grow the 5887 * group. 5888 * 5889 * Therefore it is possible to have parent and child groups in a 5890 * different configuration, and summing over such a beast makes no sense 5891 * whatsoever. 5892 * 5893 * Reject this. 5894 */ 5895 parent = leader->parent; 5896 if (parent && 5897 (parent->group_generation != leader->group_generation || 5898 parent->nr_siblings != leader->nr_siblings)) { 5899 ret = -ECHILD; 5900 goto unlock; 5901 } 5902 5903 /* 5904 * Since we co-schedule groups, {enabled,running} times of siblings 5905 * will be identical to those of the leader, so we only publish one 5906 * set.
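 *
 * Note that the counts are accumulated (+=) into @values, so once
 * perf_read_group() has added the leader and every child, userspace sees
 * the inherited totals for the whole group.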
5907 */ 5908 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 5909 values[n++] += leader->total_time_enabled + 5910 atomic64_read(&leader->child_total_time_enabled); 5911 } 5912 5913 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 5914 values[n++] += leader->total_time_running + 5915 atomic64_read(&leader->child_total_time_running); 5916 } 5917 5918 /* 5919 * Write {count,id} tuples for every sibling. 5920 */ 5921 values[n++] += perf_event_count(leader, false); 5922 if (read_format & PERF_FORMAT_ID) 5923 values[n++] = primary_event_id(leader); 5924 if (read_format & PERF_FORMAT_LOST) 5925 values[n++] = atomic64_read(&leader->lost_samples); 5926 5927 for_each_sibling_event(sub, leader) { 5928 values[n++] += perf_event_count(sub, false); 5929 if (read_format & PERF_FORMAT_ID) 5930 values[n++] = primary_event_id(sub); 5931 if (read_format & PERF_FORMAT_LOST) 5932 values[n++] = atomic64_read(&sub->lost_samples); 5933 } 5934 5935 unlock: 5936 raw_spin_unlock_irqrestore(&ctx->lock, flags); 5937 return ret; 5938 } 5939 5940 static int perf_read_group(struct perf_event *event, 5941 u64 read_format, char __user *buf) 5942 { 5943 struct perf_event *leader = event->group_leader, *child; 5944 struct perf_event_context *ctx = leader->ctx; 5945 int ret; 5946 u64 *values; 5947 5948 lockdep_assert_held(&ctx->mutex); 5949 5950 values = kzalloc(event->read_size, GFP_KERNEL); 5951 if (!values) 5952 return -ENOMEM; 5953 5954 values[0] = 1 + leader->nr_siblings; 5955 5956 mutex_lock(&leader->child_mutex); 5957 5958 ret = __perf_read_group_add(leader, read_format, values); 5959 if (ret) 5960 goto unlock; 5961 5962 list_for_each_entry(child, &leader->child_list, child_list) { 5963 ret = __perf_read_group_add(child, read_format, values); 5964 if (ret) 5965 goto unlock; 5966 } 5967 5968 mutex_unlock(&leader->child_mutex); 5969 5970 ret = event->read_size; 5971 if (copy_to_user(buf, values, event->read_size)) 5972 ret = -EFAULT; 5973 goto out; 5974 5975 unlock: 5976 mutex_unlock(&leader->child_mutex); 5977 out: 5978 kfree(values); 5979 return ret; 5980 } 5981 5982 static int perf_read_one(struct perf_event *event, 5983 u64 read_format, char __user *buf) 5984 { 5985 u64 enabled, running; 5986 u64 values[5]; 5987 int n = 0; 5988 5989 values[n++] = __perf_event_read_value(event, &enabled, &running); 5990 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 5991 values[n++] = enabled; 5992 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 5993 values[n++] = running; 5994 if (read_format & PERF_FORMAT_ID) 5995 values[n++] = primary_event_id(event); 5996 if (read_format & PERF_FORMAT_LOST) 5997 values[n++] = atomic64_read(&event->lost_samples); 5998 5999 if (copy_to_user(buf, values, n * sizeof(u64))) 6000 return -EFAULT; 6001 6002 return n * sizeof(u64); 6003 } 6004 6005 static bool is_event_hup(struct perf_event *event) 6006 { 6007 bool no_children; 6008 6009 if (event->state > PERF_EVENT_STATE_EXIT) 6010 return false; 6011 6012 mutex_lock(&event->child_mutex); 6013 no_children = list_empty(&event->child_list); 6014 mutex_unlock(&event->child_mutex); 6015 return no_children; 6016 } 6017 6018 /* 6019 * Read the performance event - simple non blocking version for now 6020 */ 6021 static ssize_t 6022 __perf_read(struct perf_event *event, char __user *buf, size_t count) 6023 { 6024 u64 read_format = event->attr.read_format; 6025 int ret; 6026 6027 /* 6028 * Return end-of-file for a read on an event that is in 6029 * error state (i.e. 
because it was pinned but it couldn't be 6030 * scheduled on to the CPU at some point). 6031 */ 6032 if (event->state == PERF_EVENT_STATE_ERROR) 6033 return 0; 6034 6035 if (count < event->read_size) 6036 return -ENOSPC; 6037 6038 WARN_ON_ONCE(event->ctx->parent_ctx); 6039 if (read_format & PERF_FORMAT_GROUP) 6040 ret = perf_read_group(event, read_format, buf); 6041 else 6042 ret = perf_read_one(event, read_format, buf); 6043 6044 return ret; 6045 } 6046 6047 static ssize_t 6048 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 6049 { 6050 struct perf_event *event = file->private_data; 6051 struct perf_event_context *ctx; 6052 int ret; 6053 6054 ret = security_perf_event_read(event); 6055 if (ret) 6056 return ret; 6057 6058 ctx = perf_event_ctx_lock(event); 6059 ret = __perf_read(event, buf, count); 6060 perf_event_ctx_unlock(event, ctx); 6061 6062 return ret; 6063 } 6064 6065 static __poll_t perf_poll(struct file *file, poll_table *wait) 6066 { 6067 struct perf_event *event = file->private_data; 6068 struct perf_buffer *rb; 6069 __poll_t events = EPOLLHUP; 6070 6071 poll_wait(file, &event->waitq, wait); 6072 6073 if (is_event_hup(event)) 6074 return events; 6075 6076 if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && 6077 event->attr.pinned)) 6078 return EPOLLERR; 6079 6080 /* 6081 * Pin the event->rb by taking event->mmap_mutex; otherwise 6082 * perf_event_set_output() can swizzle our rb and make us miss wakeups. 6083 */ 6084 mutex_lock(&event->mmap_mutex); 6085 rb = event->rb; 6086 if (rb) 6087 events = atomic_xchg(&rb->poll, 0); 6088 mutex_unlock(&event->mmap_mutex); 6089 return events; 6090 } 6091 6092 static void _perf_event_reset(struct perf_event *event) 6093 { 6094 (void)perf_event_read(event, false); 6095 local64_set(&event->count, 0); 6096 perf_event_update_userpage(event); 6097 } 6098 6099 /* Assume it's not an event with inherit set. */ 6100 u64 perf_event_pause(struct perf_event *event, bool reset) 6101 { 6102 struct perf_event_context *ctx; 6103 u64 count; 6104 6105 ctx = perf_event_ctx_lock(event); 6106 WARN_ON_ONCE(event->attr.inherit); 6107 _perf_event_disable(event); 6108 count = local64_read(&event->count); 6109 if (reset) 6110 local64_set(&event->count, 0); 6111 perf_event_ctx_unlock(event, ctx); 6112 6113 return count; 6114 } 6115 EXPORT_SYMBOL_GPL(perf_event_pause); 6116 6117 /* 6118 * Holding the top-level event's child_mutex means that any 6119 * descendant process that has inherited this event will block 6120 * in perf_event_exit_event() if it goes to exit, thus satisfying the 6121 * task existence requirements of perf_event_enable/disable. 
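 *
 * This is what allows perf_event_for_each_child() below to apply @func to
 * every inherited child while holding only child_mutex.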
6122 */ 6123 static void perf_event_for_each_child(struct perf_event *event, 6124 void (*func)(struct perf_event *)) 6125 { 6126 struct perf_event *child; 6127 6128 WARN_ON_ONCE(event->ctx->parent_ctx); 6129 6130 mutex_lock(&event->child_mutex); 6131 func(event); 6132 list_for_each_entry(child, &event->child_list, child_list) 6133 func(child); 6134 mutex_unlock(&event->child_mutex); 6135 } 6136 6137 static void perf_event_for_each(struct perf_event *event, 6138 void (*func)(struct perf_event *)) 6139 { 6140 struct perf_event_context *ctx = event->ctx; 6141 struct perf_event *sibling; 6142 6143 lockdep_assert_held(&ctx->mutex); 6144 6145 event = event->group_leader; 6146 6147 perf_event_for_each_child(event, func); 6148 for_each_sibling_event(sibling, event) 6149 perf_event_for_each_child(sibling, func); 6150 } 6151 6152 static void __perf_event_period(struct perf_event *event, 6153 struct perf_cpu_context *cpuctx, 6154 struct perf_event_context *ctx, 6155 void *info) 6156 { 6157 u64 value = *((u64 *)info); 6158 bool active; 6159 6160 if (event->attr.freq) { 6161 event->attr.sample_freq = value; 6162 } else { 6163 event->attr.sample_period = value; 6164 event->hw.sample_period = value; 6165 } 6166 6167 active = (event->state == PERF_EVENT_STATE_ACTIVE); 6168 if (active) { 6169 perf_pmu_disable(event->pmu); 6170 /* 6171 * We could be throttled; unthrottle now to avoid the tick 6172 * trying to unthrottle while we already re-started the event. 6173 */ 6174 if (event->hw.interrupts == MAX_INTERRUPTS) { 6175 event->hw.interrupts = 0; 6176 perf_log_throttle(event, 1); 6177 } 6178 event->pmu->stop(event, PERF_EF_UPDATE); 6179 } 6180 6181 local64_set(&event->hw.period_left, 0); 6182 6183 if (active) { 6184 event->pmu->start(event, PERF_EF_RELOAD); 6185 perf_pmu_enable(event->pmu); 6186 } 6187 } 6188 6189 static int perf_event_check_period(struct perf_event *event, u64 value) 6190 { 6191 return event->pmu->check_period(event, value); 6192 } 6193 6194 static int _perf_event_period(struct perf_event *event, u64 value) 6195 { 6196 if (!is_sampling_event(event)) 6197 return -EINVAL; 6198 6199 if (!value) 6200 return -EINVAL; 6201 6202 if (event->attr.freq) { 6203 if (value > sysctl_perf_event_sample_rate) 6204 return -EINVAL; 6205 } else { 6206 if (perf_event_check_period(event, value)) 6207 return -EINVAL; 6208 if (value & (1ULL << 63)) 6209 return -EINVAL; 6210 } 6211 6212 event_function_call(event, __perf_event_period, &value); 6213 6214 return 0; 6215 } 6216 6217 int perf_event_period(struct perf_event *event, u64 value) 6218 { 6219 struct perf_event_context *ctx; 6220 int ret; 6221 6222 ctx = perf_event_ctx_lock(event); 6223 ret = _perf_event_period(event, value); 6224 perf_event_ctx_unlock(event, ctx); 6225 6226 return ret; 6227 } 6228 EXPORT_SYMBOL_GPL(perf_event_period); 6229 6230 static const struct file_operations perf_fops; 6231 6232 static inline bool is_perf_file(struct fd f) 6233 { 6234 return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; 6235 } 6236 6237 static int perf_event_set_output(struct perf_event *event, 6238 struct perf_event *output_event); 6239 static int perf_event_set_filter(struct perf_event *event, void __user *arg); 6240 static int perf_copy_attr(struct perf_event_attr __user *uattr, 6241 struct perf_event_attr *attr); 6242 6243 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 6244 { 6245 void (*func)(struct perf_event *); 6246 u32 flags = arg; 6247 6248 switch (cmd) { 6249 case PERF_EVENT_IOC_ENABLE: 6250 func = _perf_event_enable; 
6251 break; 6252 case PERF_EVENT_IOC_DISABLE: 6253 func = _perf_event_disable; 6254 break; 6255 case PERF_EVENT_IOC_RESET: 6256 func = _perf_event_reset; 6257 break; 6258 6259 case PERF_EVENT_IOC_REFRESH: 6260 return _perf_event_refresh(event, arg); 6261 6262 case PERF_EVENT_IOC_PERIOD: 6263 { 6264 u64 value; 6265 6266 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) 6267 return -EFAULT; 6268 6269 return _perf_event_period(event, value); 6270 } 6271 case PERF_EVENT_IOC_ID: 6272 { 6273 u64 id = primary_event_id(event); 6274 6275 if (copy_to_user((void __user *)arg, &id, sizeof(id))) 6276 return -EFAULT; 6277 return 0; 6278 } 6279 6280 case PERF_EVENT_IOC_SET_OUTPUT: 6281 { 6282 CLASS(fd, output)(arg); // arg == -1 => empty 6283 struct perf_event *output_event = NULL; 6284 if (arg != -1) { 6285 if (!is_perf_file(output)) 6286 return -EBADF; 6287 output_event = fd_file(output)->private_data; 6288 } 6289 return perf_event_set_output(event, output_event); 6290 } 6291 6292 case PERF_EVENT_IOC_SET_FILTER: 6293 return perf_event_set_filter(event, (void __user *)arg); 6294 6295 case PERF_EVENT_IOC_SET_BPF: 6296 { 6297 struct bpf_prog *prog; 6298 int err; 6299 6300 prog = bpf_prog_get(arg); 6301 if (IS_ERR(prog)) 6302 return PTR_ERR(prog); 6303 6304 err = perf_event_set_bpf_prog(event, prog, 0); 6305 if (err) { 6306 bpf_prog_put(prog); 6307 return err; 6308 } 6309 6310 return 0; 6311 } 6312 6313 case PERF_EVENT_IOC_PAUSE_OUTPUT: { 6314 struct perf_buffer *rb; 6315 6316 rcu_read_lock(); 6317 rb = rcu_dereference(event->rb); 6318 if (!rb || !rb->nr_pages) { 6319 rcu_read_unlock(); 6320 return -EINVAL; 6321 } 6322 rb_toggle_paused(rb, !!arg); 6323 rcu_read_unlock(); 6324 return 0; 6325 } 6326 6327 case PERF_EVENT_IOC_QUERY_BPF: 6328 return perf_event_query_prog_array(event, (void __user *)arg); 6329 6330 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { 6331 struct perf_event_attr new_attr; 6332 int err = perf_copy_attr((struct perf_event_attr __user *)arg, 6333 &new_attr); 6334 6335 if (err) 6336 return err; 6337 6338 return perf_event_modify_attr(event, &new_attr); 6339 } 6340 default: 6341 return -ENOTTY; 6342 } 6343 6344 if (flags & PERF_IOC_FLAG_GROUP) 6345 perf_event_for_each(event, func); 6346 else 6347 perf_event_for_each_child(event, func); 6348 6349 return 0; 6350 } 6351 6352 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 6353 { 6354 struct perf_event *event = file->private_data; 6355 struct perf_event_context *ctx; 6356 long ret; 6357 6358 /* Treat ioctl like writes as it is likely a mutating operation. 
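 *
 * Illustrative userspace usage of the ioctls dispatched in _perf_ioctl()
 * above (a sketch only; error handling omitted, and "perf_fd" is assumed
 * to be a file descriptor returned by perf_event_open()):
 *
 *	__u64 period = 100000;
 *
 *	ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0);
 *	ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &period);
 *	ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 *
 * PERF_EVENT_IOC_PERIOD passes a pointer to a u64 (see the
 * copy_from_user() above); the enable/disable/reset ioctls take the
 * PERF_IOC_FLAG_* bits directly in the argument.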
*/ 6359 ret = security_perf_event_write(event); 6360 if (ret) 6361 return ret; 6362 6363 ctx = perf_event_ctx_lock(event); 6364 ret = _perf_ioctl(event, cmd, arg); 6365 perf_event_ctx_unlock(event, ctx); 6366 6367 return ret; 6368 } 6369 6370 #ifdef CONFIG_COMPAT 6371 static long perf_compat_ioctl(struct file *file, unsigned int cmd, 6372 unsigned long arg) 6373 { 6374 switch (_IOC_NR(cmd)) { 6375 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): 6376 case _IOC_NR(PERF_EVENT_IOC_ID): 6377 case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): 6378 case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): 6379 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ 6380 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { 6381 cmd &= ~IOCSIZE_MASK; 6382 cmd |= sizeof(void *) << IOCSIZE_SHIFT; 6383 } 6384 break; 6385 } 6386 return perf_ioctl(file, cmd, arg); 6387 } 6388 #else 6389 # define perf_compat_ioctl NULL 6390 #endif 6391 6392 int perf_event_task_enable(void) 6393 { 6394 struct perf_event_context *ctx; 6395 struct perf_event *event; 6396 6397 mutex_lock(¤t->perf_event_mutex); 6398 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 6399 ctx = perf_event_ctx_lock(event); 6400 perf_event_for_each_child(event, _perf_event_enable); 6401 perf_event_ctx_unlock(event, ctx); 6402 } 6403 mutex_unlock(¤t->perf_event_mutex); 6404 6405 return 0; 6406 } 6407 6408 int perf_event_task_disable(void) 6409 { 6410 struct perf_event_context *ctx; 6411 struct perf_event *event; 6412 6413 mutex_lock(¤t->perf_event_mutex); 6414 list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { 6415 ctx = perf_event_ctx_lock(event); 6416 perf_event_for_each_child(event, _perf_event_disable); 6417 perf_event_ctx_unlock(event, ctx); 6418 } 6419 mutex_unlock(¤t->perf_event_mutex); 6420 6421 return 0; 6422 } 6423 6424 static int perf_event_index(struct perf_event *event) 6425 { 6426 if (event->hw.state & PERF_HES_STOPPED) 6427 return 0; 6428 6429 if (event->state != PERF_EVENT_STATE_ACTIVE) 6430 return 0; 6431 6432 return event->pmu->event_idx(event); 6433 } 6434 6435 static void perf_event_init_userpage(struct perf_event *event) 6436 { 6437 struct perf_event_mmap_page *userpg; 6438 struct perf_buffer *rb; 6439 6440 rcu_read_lock(); 6441 rb = rcu_dereference(event->rb); 6442 if (!rb) 6443 goto unlock; 6444 6445 userpg = rb->user_page; 6446 6447 /* Allow new userspace to detect that bit 0 is deprecated */ 6448 userpg->cap_bit0_is_deprecated = 1; 6449 userpg->size = offsetof(struct perf_event_mmap_page, __reserved); 6450 userpg->data_offset = PAGE_SIZE; 6451 userpg->data_size = perf_data_size(rb); 6452 6453 unlock: 6454 rcu_read_unlock(); 6455 } 6456 6457 void __weak arch_perf_update_userpage( 6458 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) 6459 { 6460 } 6461 6462 /* 6463 * Callers need to ensure there can be no nesting of this function, otherwise 6464 * the seqlock logic goes bad. We can not serialize this because the arch 6465 * code calls this from NMI context. 6466 */ 6467 void perf_event_update_userpage(struct perf_event *event) 6468 { 6469 struct perf_event_mmap_page *userpg; 6470 struct perf_buffer *rb; 6471 u64 enabled, running, now; 6472 6473 rcu_read_lock(); 6474 rb = rcu_dereference(event->rb); 6475 if (!rb) 6476 goto unlock; 6477 6478 /* 6479 * compute total_time_enabled, total_time_running 6480 * based on snapshot values taken when the event 6481 * was last scheduled in. 
6482 * 6483 * we cannot simply called update_context_time() 6484 * because of locking issue as we can be called in 6485 * NMI context 6486 */ 6487 calc_timer_values(event, &now, &enabled, &running); 6488 6489 userpg = rb->user_page; 6490 /* 6491 * Disable preemption to guarantee consistent time stamps are stored to 6492 * the user page. 6493 */ 6494 preempt_disable(); 6495 ++userpg->lock; 6496 barrier(); 6497 userpg->index = perf_event_index(event); 6498 userpg->offset = perf_event_count(event, false); 6499 if (userpg->index) 6500 userpg->offset -= local64_read(&event->hw.prev_count); 6501 6502 userpg->time_enabled = enabled + 6503 atomic64_read(&event->child_total_time_enabled); 6504 6505 userpg->time_running = running + 6506 atomic64_read(&event->child_total_time_running); 6507 6508 arch_perf_update_userpage(event, userpg, now); 6509 6510 barrier(); 6511 ++userpg->lock; 6512 preempt_enable(); 6513 unlock: 6514 rcu_read_unlock(); 6515 } 6516 EXPORT_SYMBOL_GPL(perf_event_update_userpage); 6517 6518 static void ring_buffer_attach(struct perf_event *event, 6519 struct perf_buffer *rb) 6520 { 6521 struct perf_buffer *old_rb = NULL; 6522 unsigned long flags; 6523 6524 WARN_ON_ONCE(event->parent); 6525 6526 if (event->rb) { 6527 /* 6528 * Should be impossible, we set this when removing 6529 * event->rb_entry and wait/clear when adding event->rb_entry. 6530 */ 6531 WARN_ON_ONCE(event->rcu_pending); 6532 6533 old_rb = event->rb; 6534 spin_lock_irqsave(&old_rb->event_lock, flags); 6535 list_del_rcu(&event->rb_entry); 6536 spin_unlock_irqrestore(&old_rb->event_lock, flags); 6537 6538 event->rcu_batches = get_state_synchronize_rcu(); 6539 event->rcu_pending = 1; 6540 } 6541 6542 if (rb) { 6543 if (event->rcu_pending) { 6544 cond_synchronize_rcu(event->rcu_batches); 6545 event->rcu_pending = 0; 6546 } 6547 6548 spin_lock_irqsave(&rb->event_lock, flags); 6549 list_add_rcu(&event->rb_entry, &rb->event_list); 6550 spin_unlock_irqrestore(&rb->event_lock, flags); 6551 } 6552 6553 /* 6554 * Avoid racing with perf_mmap_close(AUX): stop the event 6555 * before swizzling the event::rb pointer; if it's getting 6556 * unmapped, its aux_mmap_count will be 0 and it won't 6557 * restart. See the comment in __perf_pmu_output_stop(). 6558 * 6559 * Data will inevitably be lost when set_output is done in 6560 * mid-air, but then again, whoever does it like this is 6561 * not in for the data anyway. 6562 */ 6563 if (has_aux(event)) 6564 perf_event_stop(event, 0); 6565 6566 rcu_assign_pointer(event->rb, rb); 6567 6568 if (old_rb) { 6569 ring_buffer_put(old_rb); 6570 /* 6571 * Since we detached before setting the new rb, so that we 6572 * could attach the new rb, we could have missed a wakeup. 6573 * Provide it now. 
6574 */ 6575 wake_up_all(&event->waitq); 6576 } 6577 } 6578 6579 static void ring_buffer_wakeup(struct perf_event *event) 6580 { 6581 struct perf_buffer *rb; 6582 6583 if (event->parent) 6584 event = event->parent; 6585 6586 rcu_read_lock(); 6587 rb = rcu_dereference(event->rb); 6588 if (rb) { 6589 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 6590 wake_up_all(&event->waitq); 6591 } 6592 rcu_read_unlock(); 6593 } 6594 6595 struct perf_buffer *ring_buffer_get(struct perf_event *event) 6596 { 6597 struct perf_buffer *rb; 6598 6599 if (event->parent) 6600 event = event->parent; 6601 6602 rcu_read_lock(); 6603 rb = rcu_dereference(event->rb); 6604 if (rb) { 6605 if (!refcount_inc_not_zero(&rb->refcount)) 6606 rb = NULL; 6607 } 6608 rcu_read_unlock(); 6609 6610 return rb; 6611 } 6612 6613 void ring_buffer_put(struct perf_buffer *rb) 6614 { 6615 if (!refcount_dec_and_test(&rb->refcount)) 6616 return; 6617 6618 WARN_ON_ONCE(!list_empty(&rb->event_list)); 6619 6620 call_rcu(&rb->rcu_head, rb_free_rcu); 6621 } 6622 6623 static void perf_mmap_open(struct vm_area_struct *vma) 6624 { 6625 struct perf_event *event = vma->vm_file->private_data; 6626 6627 atomic_inc(&event->mmap_count); 6628 atomic_inc(&event->rb->mmap_count); 6629 6630 if (vma->vm_pgoff) 6631 atomic_inc(&event->rb->aux_mmap_count); 6632 6633 if (event->pmu->event_mapped) 6634 event->pmu->event_mapped(event, vma->vm_mm); 6635 } 6636 6637 static void perf_pmu_output_stop(struct perf_event *event); 6638 6639 /* 6640 * A buffer can be mmap()ed multiple times; either directly through the same 6641 * event, or through other events by use of perf_event_set_output(). 6642 * 6643 * In order to undo the VM accounting done by perf_mmap() we need to destroy 6644 * the buffer here, where we still have a VM context. This means we need 6645 * to detach all events redirecting to us. 6646 */ 6647 static void perf_mmap_close(struct vm_area_struct *vma) 6648 { 6649 struct perf_event *event = vma->vm_file->private_data; 6650 struct perf_buffer *rb = ring_buffer_get(event); 6651 struct user_struct *mmap_user = rb->mmap_user; 6652 int mmap_locked = rb->mmap_locked; 6653 unsigned long size = perf_data_size(rb); 6654 bool detach_rest = false; 6655 6656 if (event->pmu->event_unmapped) 6657 event->pmu->event_unmapped(event, vma->vm_mm); 6658 6659 /* 6660 * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex 6661 * to avoid complications. 6662 */ 6663 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && 6664 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { 6665 /* 6666 * Stop all AUX events that are writing to this buffer, 6667 * so that we can free its AUX pages and corresponding PMU 6668 * data. Note that after rb::aux_mmap_count dropped to zero, 6669 * they won't start any more (see perf_aux_output_begin()). 
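 *
 * The locked_vm/pinned_vm subtractions below undo the charging done by
 * perf_mmap() when the AUX area was mapped: most AUX pages were charged
 * to the user's locked_vm, and the remainder (rb->aux_mmap_locked) to
 * the mm's pinned_vm.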
6670 */ 6671 perf_pmu_output_stop(event); 6672 6673 /* now it's safe to free the pages */ 6674 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); 6675 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); 6676 6677 /* this has to be the last one */ 6678 rb_free_aux(rb); 6679 WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); 6680 6681 mutex_unlock(&rb->aux_mutex); 6682 } 6683 6684 if (atomic_dec_and_test(&rb->mmap_count)) 6685 detach_rest = true; 6686 6687 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 6688 goto out_put; 6689 6690 ring_buffer_attach(event, NULL); 6691 mutex_unlock(&event->mmap_mutex); 6692 6693 /* If there's still other mmap()s of this buffer, we're done. */ 6694 if (!detach_rest) 6695 goto out_put; 6696 6697 /* 6698 * No other mmap()s, detach from all other events that might redirect 6699 * into the now unreachable buffer. Somewhat complicated by the 6700 * fact that rb::event_lock otherwise nests inside mmap_mutex. 6701 */ 6702 again: 6703 rcu_read_lock(); 6704 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { 6705 if (!atomic_long_inc_not_zero(&event->refcount)) { 6706 /* 6707 * This event is en-route to free_event() which will 6708 * detach it and remove it from the list. 6709 */ 6710 continue; 6711 } 6712 rcu_read_unlock(); 6713 6714 mutex_lock(&event->mmap_mutex); 6715 /* 6716 * Check we didn't race with perf_event_set_output() which can 6717 * swizzle the rb from under us while we were waiting to 6718 * acquire mmap_mutex. 6719 * 6720 * If we find a different rb; ignore this event, a next 6721 * iteration will no longer find it on the list. We have to 6722 * still restart the iteration to make sure we're not now 6723 * iterating the wrong list. 6724 */ 6725 if (event->rb == rb) 6726 ring_buffer_attach(event, NULL); 6727 6728 mutex_unlock(&event->mmap_mutex); 6729 put_event(event); 6730 6731 /* 6732 * Restart the iteration; either we're on the wrong list or 6733 * destroyed its integrity by doing a deletion. 6734 */ 6735 goto again; 6736 } 6737 rcu_read_unlock(); 6738 6739 /* 6740 * It could be there's still a few 0-ref events on the list; they'll 6741 * get cleaned up by free_event() -- they'll also still have their 6742 * ref on the rb and will free it whenever they are done with it. 6743 * 6744 * Aside from that, this buffer is 'fully' detached and unmapped, 6745 * undo the VM accounting. 6746 */ 6747 6748 atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, 6749 &mmap_user->locked_vm); 6750 atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); 6751 free_uid(mmap_user); 6752 6753 out_put: 6754 ring_buffer_put(rb); /* could be last */ 6755 } 6756 6757 static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) 6758 { 6759 /* The first page is the user control page, others are read-only. */ 6760 return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; 6761 } 6762 6763 static const struct vm_operations_struct perf_mmap_vmops = { 6764 .open = perf_mmap_open, 6765 .close = perf_mmap_close, /* non mergeable */ 6766 .pfn_mkwrite = perf_mmap_pfn_mkwrite, 6767 }; 6768 6769 static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) 6770 { 6771 unsigned long nr_pages = vma_pages(vma); 6772 int err = 0; 6773 unsigned long pagenum; 6774 6775 /* 6776 * We map this as a VM_PFNMAP VMA. 6777 * 6778 * This is not ideal as this is designed broadly for mappings of PFNs 6779 * referencing memory-mapped I/O ranges or non-system RAM i.e. for which 6780 * !pfn_valid(pfn). 
6781 * 6782 * We are mapping kernel-allocated memory (memory we manage ourselves) 6783 * which would more ideally be mapped using vm_insert_page() or a 6784 * similar mechanism, that is as a VM_MIXEDMAP mapping. 6785 * 6786 * However this won't work here, because: 6787 * 6788 * 1. It uses vma->vm_page_prot, but this field has not been completely 6789 * setup at the point of the f_op->mmp() hook, so we are unable to 6790 * indicate that this should be mapped CoW in order that the 6791 * mkwrite() hook can be invoked to make the first page R/W and the 6792 * rest R/O as desired. 6793 * 6794 * 2. Anything other than a VM_PFNMAP of valid PFNs will result in 6795 * vm_normal_page() returning a struct page * pointer, which means 6796 * vm_ops->page_mkwrite() will be invoked rather than 6797 * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping 6798 * to work around retry logic in the fault handler, however this 6799 * field is no longer allowed to be used within struct page. 6800 * 6801 * 3. Having a struct page * made available in the fault logic also 6802 * means that the page gets put on the rmap and becomes 6803 * inappropriately accessible and subject to map and ref counting. 6804 * 6805 * Ideally we would have a mechanism that could explicitly express our 6806 * desires, but this is not currently the case, so we instead use 6807 * VM_PFNMAP. 6808 * 6809 * We manage the lifetime of these mappings with internal refcounts (see 6810 * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of 6811 * this mapping is maintained correctly. 6812 */ 6813 for (pagenum = 0; pagenum < nr_pages; pagenum++) { 6814 unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; 6815 struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); 6816 6817 if (page == NULL) { 6818 err = -EINVAL; 6819 break; 6820 } 6821 6822 /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ 6823 err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, 6824 vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); 6825 if (err) 6826 break; 6827 } 6828 6829 #ifdef CONFIG_MMU 6830 /* Clear any partial mappings on error. */ 6831 if (err) 6832 zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); 6833 #endif 6834 6835 return err; 6836 } 6837 6838 static int perf_mmap(struct file *file, struct vm_area_struct *vma) 6839 { 6840 struct perf_event *event = file->private_data; 6841 unsigned long user_locked, user_lock_limit; 6842 struct user_struct *user = current_user(); 6843 struct mutex *aux_mutex = NULL; 6844 struct perf_buffer *rb = NULL; 6845 unsigned long locked, lock_limit; 6846 unsigned long vma_size; 6847 unsigned long nr_pages; 6848 long user_extra = 0, extra = 0; 6849 int ret, flags = 0; 6850 6851 /* 6852 * Don't allow mmap() of inherited per-task counters. This would 6853 * create a performance issue due to all children writing to the 6854 * same rb. 
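 *
 * For reference, a typical (hypothetical) userspace request handled by
 * this function maps the data area roughly as:
 *
 *	len  = (1 + 8) * page_size;	(user page plus 2^3 data pages)
 *	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    perf_fd, 0);
 *
 * i.e. one page for struct perf_event_mmap_page followed by a
 * power-of-two number of data pages at file offset 0. The AUX area, if
 * used, is mapped in a second call at the offset userspace published in
 * user_page->aux_offset.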
6855 */ 6856 if (event->cpu == -1 && event->attr.inherit) 6857 return -EINVAL; 6858 6859 if (!(vma->vm_flags & VM_SHARED)) 6860 return -EINVAL; 6861 6862 ret = security_perf_event_read(event); 6863 if (ret) 6864 return ret; 6865 6866 vma_size = vma->vm_end - vma->vm_start; 6867 nr_pages = vma_size / PAGE_SIZE; 6868 6869 if (nr_pages > INT_MAX) 6870 return -ENOMEM; 6871 6872 if (vma_size != PAGE_SIZE * nr_pages) 6873 return -EINVAL; 6874 6875 user_extra = nr_pages; 6876 6877 mutex_lock(&event->mmap_mutex); 6878 ret = -EINVAL; 6879 6880 if (vma->vm_pgoff == 0) { 6881 nr_pages -= 1; 6882 6883 /* 6884 * If we have rb pages ensure they're a power-of-two number, so we 6885 * can do bitmasks instead of modulo. 6886 */ 6887 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 6888 goto unlock; 6889 6890 WARN_ON_ONCE(event->ctx->parent_ctx); 6891 6892 if (event->rb) { 6893 if (data_page_nr(event->rb) != nr_pages) 6894 goto unlock; 6895 6896 if (atomic_inc_not_zero(&event->rb->mmap_count)) { 6897 /* 6898 * Success -- managed to mmap() the same buffer 6899 * multiple times. 6900 */ 6901 ret = 0; 6902 /* We need the rb to map pages. */ 6903 rb = event->rb; 6904 goto unlock; 6905 } 6906 6907 /* 6908 * Raced against perf_mmap_close()'s 6909 * atomic_dec_and_mutex_lock() remove the 6910 * event and continue as if !event->rb 6911 */ 6912 ring_buffer_attach(event, NULL); 6913 } 6914 6915 } else { 6916 /* 6917 * AUX area mapping: if rb->aux_nr_pages != 0, it's already 6918 * mapped, all subsequent mappings should have the same size 6919 * and offset. Must be above the normal perf buffer. 6920 */ 6921 u64 aux_offset, aux_size; 6922 6923 rb = event->rb; 6924 if (!rb) 6925 goto aux_unlock; 6926 6927 aux_mutex = &rb->aux_mutex; 6928 mutex_lock(aux_mutex); 6929 6930 aux_offset = READ_ONCE(rb->user_page->aux_offset); 6931 aux_size = READ_ONCE(rb->user_page->aux_size); 6932 6933 if (aux_offset < perf_data_size(rb) + PAGE_SIZE) 6934 goto aux_unlock; 6935 6936 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) 6937 goto aux_unlock; 6938 6939 /* already mapped with a different offset */ 6940 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) 6941 goto aux_unlock; 6942 6943 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) 6944 goto aux_unlock; 6945 6946 /* already mapped with a different size */ 6947 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) 6948 goto aux_unlock; 6949 6950 if (!is_power_of_2(nr_pages)) 6951 goto aux_unlock; 6952 6953 if (!atomic_inc_not_zero(&rb->mmap_count)) 6954 goto aux_unlock; 6955 6956 if (rb_has_aux(rb)) { 6957 atomic_inc(&rb->aux_mmap_count); 6958 ret = 0; 6959 goto unlock; 6960 } 6961 6962 atomic_set(&rb->aux_mmap_count, 1); 6963 } 6964 6965 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 6966 6967 /* 6968 * Increase the limit linearly with more CPUs: 6969 */ 6970 user_lock_limit *= num_online_cpus(); 6971 6972 user_locked = atomic_long_read(&user->locked_vm); 6973 6974 /* 6975 * sysctl_perf_event_mlock may have changed, so that 6976 * user->locked_vm > user_lock_limit 6977 */ 6978 if (user_locked > user_lock_limit) 6979 user_locked = user_lock_limit; 6980 user_locked += user_extra; 6981 6982 if (user_locked > user_lock_limit) { 6983 /* 6984 * charge locked_vm until it hits user_lock_limit; 6985 * charge the rest from pinned_vm 6986 */ 6987 extra = user_locked - user_lock_limit; 6988 user_extra -= extra; 6989 } 6990 6991 lock_limit = rlimit(RLIMIT_MEMLOCK); 6992 lock_limit >>= PAGE_SHIFT; 6993 locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; 6994 6995 if 
((locked > lock_limit) && perf_is_paranoid() && 6996 !capable(CAP_IPC_LOCK)) { 6997 ret = -EPERM; 6998 goto unlock; 6999 } 7000 7001 WARN_ON(!rb && event->rb); 7002 7003 if (vma->vm_flags & VM_WRITE) 7004 flags |= RING_BUFFER_WRITABLE; 7005 7006 if (!rb) { 7007 rb = rb_alloc(nr_pages, 7008 event->attr.watermark ? event->attr.wakeup_watermark : 0, 7009 event->cpu, flags); 7010 7011 if (!rb) { 7012 ret = -ENOMEM; 7013 goto unlock; 7014 } 7015 7016 atomic_set(&rb->mmap_count, 1); 7017 rb->mmap_user = get_current_user(); 7018 rb->mmap_locked = extra; 7019 7020 ring_buffer_attach(event, rb); 7021 7022 perf_event_update_time(event); 7023 perf_event_init_userpage(event); 7024 perf_event_update_userpage(event); 7025 } else { 7026 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, 7027 event->attr.aux_watermark, flags); 7028 if (!ret) 7029 rb->aux_mmap_locked = extra; 7030 } 7031 7032 ret = 0; 7033 7034 unlock: 7035 if (!ret) { 7036 atomic_long_add(user_extra, &user->locked_vm); 7037 atomic64_add(extra, &vma->vm_mm->pinned_vm); 7038 7039 atomic_inc(&event->mmap_count); 7040 } else if (rb) { 7041 atomic_dec(&rb->mmap_count); 7042 } 7043 aux_unlock: 7044 if (aux_mutex) 7045 mutex_unlock(aux_mutex); 7046 mutex_unlock(&event->mmap_mutex); 7047 7048 /* 7049 * Since pinned accounting is per vm we cannot allow fork() to copy our 7050 * vma. 7051 */ 7052 vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); 7053 vma->vm_ops = &perf_mmap_vmops; 7054 7055 if (!ret) 7056 ret = map_range(rb, vma); 7057 7058 if (!ret && event->pmu->event_mapped) 7059 event->pmu->event_mapped(event, vma->vm_mm); 7060 7061 return ret; 7062 } 7063 7064 static int perf_fasync(int fd, struct file *filp, int on) 7065 { 7066 struct inode *inode = file_inode(filp); 7067 struct perf_event *event = filp->private_data; 7068 int retval; 7069 7070 inode_lock(inode); 7071 retval = fasync_helper(fd, filp, on, &event->fasync); 7072 inode_unlock(inode); 7073 7074 if (retval < 0) 7075 return retval; 7076 7077 return 0; 7078 } 7079 7080 static const struct file_operations perf_fops = { 7081 .release = perf_release, 7082 .read = perf_read, 7083 .poll = perf_poll, 7084 .unlocked_ioctl = perf_ioctl, 7085 .compat_ioctl = perf_compat_ioctl, 7086 .mmap = perf_mmap, 7087 .fasync = perf_fasync, 7088 }; 7089 7090 /* 7091 * Perf event wakeup 7092 * 7093 * If there's data, ensure we set the poll() state and publish everything 7094 * to user-space before waking everybody up. 7095 */ 7096 7097 void perf_event_wakeup(struct perf_event *event) 7098 { 7099 ring_buffer_wakeup(event); 7100 7101 if (event->pending_kill) { 7102 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); 7103 event->pending_kill = 0; 7104 } 7105 } 7106 7107 static void perf_sigtrap(struct perf_event *event) 7108 { 7109 /* 7110 * We'd expect this to only occur if the irq_work is delayed and either 7111 * ctx->task or current has changed in the meantime. This can be the 7112 * case on architectures that do not implement arch_irq_work_raise(). 7113 */ 7114 if (WARN_ON_ONCE(event->ctx->task != current)) 7115 return; 7116 7117 /* 7118 * Both perf_pending_task() and perf_pending_irq() can race with the 7119 * task exiting. 7120 */ 7121 if (current->flags & PF_EXITING) 7122 return; 7123 7124 send_sig_perf((void __user *)event->pending_addr, 7125 event->orig_type, event->attr.sig_data); 7126 } 7127 7128 /* 7129 * Deliver the pending work in-event-context or follow the context. 
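 * In other words: if the event is still running on some other CPU,
 * __perf_pending_disable() forwards the disable to that CPU via
 * irq_work_queue_on(); if we already run on the event's CPU, it is
 * disabled locally.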
7130 */ 7131 static void __perf_pending_disable(struct perf_event *event) 7132 { 7133 int cpu = READ_ONCE(event->oncpu); 7134 7135 /* 7136 * If the event isn't running; we done. event_sched_out() will have 7137 * taken care of things. 7138 */ 7139 if (cpu < 0) 7140 return; 7141 7142 /* 7143 * Yay, we hit home and are in the context of the event. 7144 */ 7145 if (cpu == smp_processor_id()) { 7146 if (event->pending_disable) { 7147 event->pending_disable = 0; 7148 perf_event_disable_local(event); 7149 } 7150 return; 7151 } 7152 7153 /* 7154 * CPU-A CPU-B 7155 * 7156 * perf_event_disable_inatomic() 7157 * @pending_disable = CPU-A; 7158 * irq_work_queue(); 7159 * 7160 * sched-out 7161 * @pending_disable = -1; 7162 * 7163 * sched-in 7164 * perf_event_disable_inatomic() 7165 * @pending_disable = CPU-B; 7166 * irq_work_queue(); // FAILS 7167 * 7168 * irq_work_run() 7169 * perf_pending_disable() 7170 * 7171 * But the event runs on CPU-B and wants disabling there. 7172 */ 7173 irq_work_queue_on(&event->pending_disable_irq, cpu); 7174 } 7175 7176 static void perf_pending_disable(struct irq_work *entry) 7177 { 7178 struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); 7179 int rctx; 7180 7181 /* 7182 * If we 'fail' here, that's OK, it means recursion is already disabled 7183 * and we won't recurse 'further'. 7184 */ 7185 rctx = perf_swevent_get_recursion_context(); 7186 __perf_pending_disable(event); 7187 if (rctx >= 0) 7188 perf_swevent_put_recursion_context(rctx); 7189 } 7190 7191 static void perf_pending_irq(struct irq_work *entry) 7192 { 7193 struct perf_event *event = container_of(entry, struct perf_event, pending_irq); 7194 int rctx; 7195 7196 /* 7197 * If we 'fail' here, that's OK, it means recursion is already disabled 7198 * and we won't recurse 'further'. 7199 */ 7200 rctx = perf_swevent_get_recursion_context(); 7201 7202 /* 7203 * The wakeup isn't bound to the context of the event -- it can happen 7204 * irrespective of where the event is. 7205 */ 7206 if (event->pending_wakeup) { 7207 event->pending_wakeup = 0; 7208 perf_event_wakeup(event); 7209 } 7210 7211 if (rctx >= 0) 7212 perf_swevent_put_recursion_context(rctx); 7213 } 7214 7215 static void perf_pending_task(struct callback_head *head) 7216 { 7217 struct perf_event *event = container_of(head, struct perf_event, pending_task); 7218 int rctx; 7219 7220 /* 7221 * If we 'fail' here, that's OK, it means recursion is already disabled 7222 * and we won't recurse 'further'. 
7223 */ 7224 rctx = perf_swevent_get_recursion_context(); 7225 7226 if (event->pending_work) { 7227 event->pending_work = 0; 7228 perf_sigtrap(event); 7229 local_dec(&event->ctx->nr_no_switch_fast); 7230 } 7231 put_event(event); 7232 7233 if (rctx >= 0) 7234 perf_swevent_put_recursion_context(rctx); 7235 } 7236 7237 #ifdef CONFIG_GUEST_PERF_EVENTS 7238 struct perf_guest_info_callbacks __rcu *perf_guest_cbs; 7239 7240 DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); 7241 DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); 7242 DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); 7243 7244 void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 7245 { 7246 if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) 7247 return; 7248 7249 rcu_assign_pointer(perf_guest_cbs, cbs); 7250 static_call_update(__perf_guest_state, cbs->state); 7251 static_call_update(__perf_guest_get_ip, cbs->get_ip); 7252 7253 /* Implementing ->handle_intel_pt_intr is optional. */ 7254 if (cbs->handle_intel_pt_intr) 7255 static_call_update(__perf_guest_handle_intel_pt_intr, 7256 cbs->handle_intel_pt_intr); 7257 } 7258 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); 7259 7260 void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 7261 { 7262 if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) 7263 return; 7264 7265 rcu_assign_pointer(perf_guest_cbs, NULL); 7266 static_call_update(__perf_guest_state, (void *)&__static_call_return0); 7267 static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); 7268 static_call_update(__perf_guest_handle_intel_pt_intr, 7269 (void *)&__static_call_return0); 7270 synchronize_rcu(); 7271 } 7272 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 7273 #endif 7274 7275 static bool should_sample_guest(struct perf_event *event) 7276 { 7277 return !event->attr.exclude_guest && perf_guest_state(); 7278 } 7279 7280 unsigned long perf_misc_flags(struct perf_event *event, 7281 struct pt_regs *regs) 7282 { 7283 if (should_sample_guest(event)) 7284 return perf_arch_guest_misc_flags(regs); 7285 7286 return perf_arch_misc_flags(regs); 7287 } 7288 7289 unsigned long perf_instruction_pointer(struct perf_event *event, 7290 struct pt_regs *regs) 7291 { 7292 if (should_sample_guest(event)) 7293 return perf_guest_get_ip(); 7294 7295 return perf_arch_instruction_pointer(regs); 7296 } 7297 7298 static void 7299 perf_output_sample_regs(struct perf_output_handle *handle, 7300 struct pt_regs *regs, u64 mask) 7301 { 7302 int bit; 7303 DECLARE_BITMAP(_mask, 64); 7304 7305 bitmap_from_u64(_mask, mask); 7306 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { 7307 u64 val; 7308 7309 val = perf_reg_value(regs, bit); 7310 perf_output_put(handle, val); 7311 } 7312 } 7313 7314 static void perf_sample_regs_user(struct perf_regs *regs_user, 7315 struct pt_regs *regs) 7316 { 7317 if (user_mode(regs)) { 7318 regs_user->abi = perf_reg_abi(current); 7319 regs_user->regs = regs; 7320 } else if (!(current->flags & PF_KTHREAD)) { 7321 perf_get_regs_user(regs_user, regs); 7322 } else { 7323 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 7324 regs_user->regs = NULL; 7325 } 7326 } 7327 7328 static void perf_sample_regs_intr(struct perf_regs *regs_intr, 7329 struct pt_regs *regs) 7330 { 7331 regs_intr->regs = regs; 7332 regs_intr->abi = perf_reg_abi(current); 7333 } 7334 7335 7336 /* 7337 * Get remaining task size from user stack pointer. 
7338 * 7339 * It'd be better to take stack vma map and limit this more 7340 * precisely, but there's no way to get it safely under interrupt, 7341 * so using TASK_SIZE as limit. 7342 */ 7343 static u64 perf_ustack_task_size(struct pt_regs *regs) 7344 { 7345 unsigned long addr = perf_user_stack_pointer(regs); 7346 7347 if (!addr || addr >= TASK_SIZE) 7348 return 0; 7349 7350 return TASK_SIZE - addr; 7351 } 7352 7353 static u16 7354 perf_sample_ustack_size(u16 stack_size, u16 header_size, 7355 struct pt_regs *regs) 7356 { 7357 u64 task_size; 7358 7359 /* No regs, no stack pointer, no dump. */ 7360 if (!regs) 7361 return 0; 7362 7363 /* 7364 * Check if we fit in with the requested stack size into the: 7365 * - TASK_SIZE 7366 * If we don't, we limit the size to the TASK_SIZE. 7367 * 7368 * - remaining sample size 7369 * If we don't, we customize the stack size to 7370 * fit in to the remaining sample size. 7371 */ 7372 7373 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); 7374 stack_size = min(stack_size, (u16) task_size); 7375 7376 /* Current header size plus static size and dynamic size. */ 7377 header_size += 2 * sizeof(u64); 7378 7379 /* Do we fit in with the current stack dump size? */ 7380 if ((u16) (header_size + stack_size) < header_size) { 7381 /* 7382 * If we overflow the maximum size for the sample, 7383 * we customize the stack dump size to fit in. 7384 */ 7385 stack_size = USHRT_MAX - header_size - sizeof(u64); 7386 stack_size = round_up(stack_size, sizeof(u64)); 7387 } 7388 7389 return stack_size; 7390 } 7391 7392 static void 7393 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, 7394 struct pt_regs *regs) 7395 { 7396 /* Case of a kernel thread, nothing to dump */ 7397 if (!regs) { 7398 u64 size = 0; 7399 perf_output_put(handle, size); 7400 } else { 7401 unsigned long sp; 7402 unsigned int rem; 7403 u64 dyn_size; 7404 7405 /* 7406 * We dump: 7407 * static size 7408 * - the size requested by user or the best one we can fit 7409 * in to the sample max size 7410 * data 7411 * - user stack dump data 7412 * dynamic size 7413 * - the actual dumped size 7414 */ 7415 7416 /* Static size. */ 7417 perf_output_put(handle, dump_size); 7418 7419 /* Data. */ 7420 sp = perf_user_stack_pointer(regs); 7421 rem = __output_copy_user(handle, (void *) sp, dump_size); 7422 dyn_size = dump_size - rem; 7423 7424 perf_output_skip(handle, rem); 7425 7426 /* Dynamic size. */ 7427 perf_output_put(handle, dyn_size); 7428 } 7429 } 7430 7431 static unsigned long perf_prepare_sample_aux(struct perf_event *event, 7432 struct perf_sample_data *data, 7433 size_t size) 7434 { 7435 struct perf_event *sampler = event->aux_event; 7436 struct perf_buffer *rb; 7437 7438 data->aux_size = 0; 7439 7440 if (!sampler) 7441 goto out; 7442 7443 if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) 7444 goto out; 7445 7446 if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) 7447 goto out; 7448 7449 rb = ring_buffer_get(sampler); 7450 if (!rb) 7451 goto out; 7452 7453 /* 7454 * If this is an NMI hit inside sampling code, don't take 7455 * the sample. See also perf_aux_sample_output(). 
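 * (rb->aux_in_sampling is set around the ->snapshot_aux() call in
 * perf_pmu_snapshot_aux() below, so a PMI landing in that window sees it
 * here and records an aux_size of 0 instead of nesting another AUX
 * snapshot.)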
7456 */ 7457 if (READ_ONCE(rb->aux_in_sampling)) { 7458 data->aux_size = 0; 7459 } else { 7460 size = min_t(size_t, size, perf_aux_size(rb)); 7461 data->aux_size = ALIGN(size, sizeof(u64)); 7462 } 7463 ring_buffer_put(rb); 7464 7465 out: 7466 return data->aux_size; 7467 } 7468 7469 static long perf_pmu_snapshot_aux(struct perf_buffer *rb, 7470 struct perf_event *event, 7471 struct perf_output_handle *handle, 7472 unsigned long size) 7473 { 7474 unsigned long flags; 7475 long ret; 7476 7477 /* 7478 * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler 7479 * paths. If we start calling them in NMI context, they may race with 7480 * the IRQ ones, that is, for example, re-starting an event that's just 7481 * been stopped, which is why we're using a separate callback that 7482 * doesn't change the event state. 7483 * 7484 * IRQs need to be disabled to prevent IPIs from racing with us. 7485 */ 7486 local_irq_save(flags); 7487 /* 7488 * Guard against NMI hits inside the critical section; 7489 * see also perf_prepare_sample_aux(). 7490 */ 7491 WRITE_ONCE(rb->aux_in_sampling, 1); 7492 barrier(); 7493 7494 ret = event->pmu->snapshot_aux(event, handle, size); 7495 7496 barrier(); 7497 WRITE_ONCE(rb->aux_in_sampling, 0); 7498 local_irq_restore(flags); 7499 7500 return ret; 7501 } 7502 7503 static void perf_aux_sample_output(struct perf_event *event, 7504 struct perf_output_handle *handle, 7505 struct perf_sample_data *data) 7506 { 7507 struct perf_event *sampler = event->aux_event; 7508 struct perf_buffer *rb; 7509 unsigned long pad; 7510 long size; 7511 7512 if (WARN_ON_ONCE(!sampler || !data->aux_size)) 7513 return; 7514 7515 rb = ring_buffer_get(sampler); 7516 if (!rb) 7517 return; 7518 7519 size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); 7520 7521 /* 7522 * An error here means that perf_output_copy() failed (returned a 7523 * non-zero surplus that it didn't copy), which in its current 7524 * enlightened implementation is not possible. If that changes, we'd 7525 * like to know. 7526 */ 7527 if (WARN_ON_ONCE(size < 0)) 7528 goto out_put; 7529 7530 /* 7531 * The pad comes from ALIGN()ing data->aux_size up to u64 in 7532 * perf_prepare_sample_aux(), so should not be more than that. 7533 */ 7534 pad = data->aux_size - size; 7535 if (WARN_ON_ONCE(pad >= sizeof(u64))) 7536 pad = 8; 7537 7538 if (pad) { 7539 u64 zero = 0; 7540 perf_output_copy(handle, &zero, pad); 7541 } 7542 7543 out_put: 7544 ring_buffer_put(rb); 7545 } 7546 7547 /* 7548 * A set of common sample data types saved even for non-sample records 7549 * when event->attr.sample_id_all is set. 
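 *
 * When sample_id_all is set, every non-sample record ends with a trailer
 * whose layout (matching __perf_event__output_id_sample() below, and as
 * documented for struct sample_id in the uapi header) is roughly:
 *
 *	{ u32 pid, tid;  }	if PERF_SAMPLE_TID
 *	{ u64 time;      }	if PERF_SAMPLE_TIME
 *	{ u64 id;        }	if PERF_SAMPLE_ID
 *	{ u64 stream_id; }	if PERF_SAMPLE_STREAM_ID
 *	{ u32 cpu, res;  }	if PERF_SAMPLE_CPU
 *	{ u64 id;        }	if PERF_SAMPLE_IDENTIFIER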
7550 */ 7551 #define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ 7552 PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ 7553 PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) 7554 7555 static void __perf_event_header__init_id(struct perf_sample_data *data, 7556 struct perf_event *event, 7557 u64 sample_type) 7558 { 7559 data->type = event->attr.sample_type; 7560 data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; 7561 7562 if (sample_type & PERF_SAMPLE_TID) { 7563 /* namespace issues */ 7564 data->tid_entry.pid = perf_event_pid(event, current); 7565 data->tid_entry.tid = perf_event_tid(event, current); 7566 } 7567 7568 if (sample_type & PERF_SAMPLE_TIME) 7569 data->time = perf_event_clock(event); 7570 7571 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) 7572 data->id = primary_event_id(event); 7573 7574 if (sample_type & PERF_SAMPLE_STREAM_ID) 7575 data->stream_id = event->id; 7576 7577 if (sample_type & PERF_SAMPLE_CPU) { 7578 data->cpu_entry.cpu = raw_smp_processor_id(); 7579 data->cpu_entry.reserved = 0; 7580 } 7581 } 7582 7583 void perf_event_header__init_id(struct perf_event_header *header, 7584 struct perf_sample_data *data, 7585 struct perf_event *event) 7586 { 7587 if (event->attr.sample_id_all) { 7588 header->size += event->id_header_size; 7589 __perf_event_header__init_id(data, event, event->attr.sample_type); 7590 } 7591 } 7592 7593 static void __perf_event__output_id_sample(struct perf_output_handle *handle, 7594 struct perf_sample_data *data) 7595 { 7596 u64 sample_type = data->type; 7597 7598 if (sample_type & PERF_SAMPLE_TID) 7599 perf_output_put(handle, data->tid_entry); 7600 7601 if (sample_type & PERF_SAMPLE_TIME) 7602 perf_output_put(handle, data->time); 7603 7604 if (sample_type & PERF_SAMPLE_ID) 7605 perf_output_put(handle, data->id); 7606 7607 if (sample_type & PERF_SAMPLE_STREAM_ID) 7608 perf_output_put(handle, data->stream_id); 7609 7610 if (sample_type & PERF_SAMPLE_CPU) 7611 perf_output_put(handle, data->cpu_entry); 7612 7613 if (sample_type & PERF_SAMPLE_IDENTIFIER) 7614 perf_output_put(handle, data->id); 7615 } 7616 7617 void perf_event__output_id_sample(struct perf_event *event, 7618 struct perf_output_handle *handle, 7619 struct perf_sample_data *sample) 7620 { 7621 if (event->attr.sample_id_all) 7622 __perf_event__output_id_sample(handle, sample); 7623 } 7624 7625 static void perf_output_read_one(struct perf_output_handle *handle, 7626 struct perf_event *event, 7627 u64 enabled, u64 running) 7628 { 7629 u64 read_format = event->attr.read_format; 7630 u64 values[5]; 7631 int n = 0; 7632 7633 values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); 7634 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 7635 values[n++] = enabled + 7636 atomic64_read(&event->child_total_time_enabled); 7637 } 7638 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 7639 values[n++] = running + 7640 atomic64_read(&event->child_total_time_running); 7641 } 7642 if (read_format & PERF_FORMAT_ID) 7643 values[n++] = primary_event_id(event); 7644 if (read_format & PERF_FORMAT_LOST) 7645 values[n++] = atomic64_read(&event->lost_samples); 7646 7647 __output_copy(handle, values, n * sizeof(u64)); 7648 } 7649 7650 static void perf_output_read_group(struct perf_output_handle *handle, 7651 struct perf_event *event, 7652 u64 enabled, u64 running) 7653 { 7654 struct perf_event *leader = event->group_leader, *sub; 7655 u64 read_format = event->attr.read_format; 7656 unsigned long flags; 7657 u64 values[6]; 7658 int n = 0; 7659 bool self = 
has_inherit_and_sample_read(&event->attr); 7660 7661 /* 7662 * Disabling interrupts avoids all counter scheduling 7663 * (context switches, timer based rotation and IPIs). 7664 */ 7665 local_irq_save(flags); 7666 7667 values[n++] = 1 + leader->nr_siblings; 7668 7669 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 7670 values[n++] = enabled; 7671 7672 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 7673 values[n++] = running; 7674 7675 if ((leader != event) && !handle->skip_read) 7676 perf_pmu_read(leader); 7677 7678 values[n++] = perf_event_count(leader, self); 7679 if (read_format & PERF_FORMAT_ID) 7680 values[n++] = primary_event_id(leader); 7681 if (read_format & PERF_FORMAT_LOST) 7682 values[n++] = atomic64_read(&leader->lost_samples); 7683 7684 __output_copy(handle, values, n * sizeof(u64)); 7685 7686 for_each_sibling_event(sub, leader) { 7687 n = 0; 7688 7689 if ((sub != event) && !handle->skip_read) 7690 perf_pmu_read(sub); 7691 7692 values[n++] = perf_event_count(sub, self); 7693 if (read_format & PERF_FORMAT_ID) 7694 values[n++] = primary_event_id(sub); 7695 if (read_format & PERF_FORMAT_LOST) 7696 values[n++] = atomic64_read(&sub->lost_samples); 7697 7698 __output_copy(handle, values, n * sizeof(u64)); 7699 } 7700 7701 local_irq_restore(flags); 7702 } 7703 7704 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ 7705 PERF_FORMAT_TOTAL_TIME_RUNNING) 7706 7707 /* 7708 * XXX PERF_SAMPLE_READ vs inherited events seems difficult. 7709 * 7710 * The problem is that its both hard and excessively expensive to iterate the 7711 * child list, not to mention that its impossible to IPI the children running 7712 * on another CPU, from interrupt/NMI context. 7713 * 7714 * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread 7715 * counts rather than attempting to accumulate some value across all children on 7716 * all cores. 7717 */ 7718 static void perf_output_read(struct perf_output_handle *handle, 7719 struct perf_event *event) 7720 { 7721 u64 enabled = 0, running = 0, now; 7722 u64 read_format = event->attr.read_format; 7723 7724 /* 7725 * compute total_time_enabled, total_time_running 7726 * based on snapshot values taken when the event 7727 * was last scheduled in. 
7728 * 7729 * we cannot simply called update_context_time() 7730 * because of locking issue as we are called in 7731 * NMI context 7732 */ 7733 if (read_format & PERF_FORMAT_TOTAL_TIMES) 7734 calc_timer_values(event, &now, &enabled, &running); 7735 7736 if (event->attr.read_format & PERF_FORMAT_GROUP) 7737 perf_output_read_group(handle, event, enabled, running); 7738 else 7739 perf_output_read_one(handle, event, enabled, running); 7740 } 7741 7742 void perf_output_sample(struct perf_output_handle *handle, 7743 struct perf_event_header *header, 7744 struct perf_sample_data *data, 7745 struct perf_event *event) 7746 { 7747 u64 sample_type = data->type; 7748 7749 if (data->sample_flags & PERF_SAMPLE_READ) 7750 handle->skip_read = 1; 7751 7752 perf_output_put(handle, *header); 7753 7754 if (sample_type & PERF_SAMPLE_IDENTIFIER) 7755 perf_output_put(handle, data->id); 7756 7757 if (sample_type & PERF_SAMPLE_IP) 7758 perf_output_put(handle, data->ip); 7759 7760 if (sample_type & PERF_SAMPLE_TID) 7761 perf_output_put(handle, data->tid_entry); 7762 7763 if (sample_type & PERF_SAMPLE_TIME) 7764 perf_output_put(handle, data->time); 7765 7766 if (sample_type & PERF_SAMPLE_ADDR) 7767 perf_output_put(handle, data->addr); 7768 7769 if (sample_type & PERF_SAMPLE_ID) 7770 perf_output_put(handle, data->id); 7771 7772 if (sample_type & PERF_SAMPLE_STREAM_ID) 7773 perf_output_put(handle, data->stream_id); 7774 7775 if (sample_type & PERF_SAMPLE_CPU) 7776 perf_output_put(handle, data->cpu_entry); 7777 7778 if (sample_type & PERF_SAMPLE_PERIOD) 7779 perf_output_put(handle, data->period); 7780 7781 if (sample_type & PERF_SAMPLE_READ) 7782 perf_output_read(handle, event); 7783 7784 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 7785 int size = 1; 7786 7787 size += data->callchain->nr; 7788 size *= sizeof(u64); 7789 __output_copy(handle, data->callchain, size); 7790 } 7791 7792 if (sample_type & PERF_SAMPLE_RAW) { 7793 struct perf_raw_record *raw = data->raw; 7794 7795 if (raw) { 7796 struct perf_raw_frag *frag = &raw->frag; 7797 7798 perf_output_put(handle, raw->size); 7799 do { 7800 if (frag->copy) { 7801 __output_custom(handle, frag->copy, 7802 frag->data, frag->size); 7803 } else { 7804 __output_copy(handle, frag->data, 7805 frag->size); 7806 } 7807 if (perf_raw_frag_last(frag)) 7808 break; 7809 frag = frag->next; 7810 } while (1); 7811 if (frag->pad) 7812 __output_skip(handle, NULL, frag->pad); 7813 } else { 7814 struct { 7815 u32 size; 7816 u32 data; 7817 } raw = { 7818 .size = sizeof(u32), 7819 .data = 0, 7820 }; 7821 perf_output_put(handle, raw); 7822 } 7823 } 7824 7825 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 7826 if (data->br_stack) { 7827 size_t size; 7828 7829 size = data->br_stack->nr 7830 * sizeof(struct perf_branch_entry); 7831 7832 perf_output_put(handle, data->br_stack->nr); 7833 if (branch_sample_hw_index(event)) 7834 perf_output_put(handle, data->br_stack->hw_idx); 7835 perf_output_copy(handle, data->br_stack->entries, size); 7836 /* 7837 * Add the extension space which is appended 7838 * right after the struct perf_branch_stack. 
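 *
 * For reference, the resulting PERF_SAMPLE_BRANCH_STACK payload is
 * roughly:
 *
 *	{ u64 nr;
 *	  { u64 hw_idx; }		if branch_sample_hw_index()
 *	  { u64 from, to, flags; }	lbr[nr];
 *	  { u64 counters; }		cntr[nr];  the extension below
 *	}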
7839 */ 7840 if (data->br_stack_cntr) { 7841 size = data->br_stack->nr * sizeof(u64); 7842 perf_output_copy(handle, data->br_stack_cntr, size); 7843 } 7844 } else { 7845 /* 7846 * we always store at least the value of nr 7847 */ 7848 u64 nr = 0; 7849 perf_output_put(handle, nr); 7850 } 7851 } 7852 7853 if (sample_type & PERF_SAMPLE_REGS_USER) { 7854 u64 abi = data->regs_user.abi; 7855 7856 /* 7857 * If there are no regs to dump, notice it through 7858 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 7859 */ 7860 perf_output_put(handle, abi); 7861 7862 if (abi) { 7863 u64 mask = event->attr.sample_regs_user; 7864 perf_output_sample_regs(handle, 7865 data->regs_user.regs, 7866 mask); 7867 } 7868 } 7869 7870 if (sample_type & PERF_SAMPLE_STACK_USER) { 7871 perf_output_sample_ustack(handle, 7872 data->stack_user_size, 7873 data->regs_user.regs); 7874 } 7875 7876 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) 7877 perf_output_put(handle, data->weight.full); 7878 7879 if (sample_type & PERF_SAMPLE_DATA_SRC) 7880 perf_output_put(handle, data->data_src.val); 7881 7882 if (sample_type & PERF_SAMPLE_TRANSACTION) 7883 perf_output_put(handle, data->txn); 7884 7885 if (sample_type & PERF_SAMPLE_REGS_INTR) { 7886 u64 abi = data->regs_intr.abi; 7887 /* 7888 * If there are no regs to dump, notice it through 7889 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 7890 */ 7891 perf_output_put(handle, abi); 7892 7893 if (abi) { 7894 u64 mask = event->attr.sample_regs_intr; 7895 7896 perf_output_sample_regs(handle, 7897 data->regs_intr.regs, 7898 mask); 7899 } 7900 } 7901 7902 if (sample_type & PERF_SAMPLE_PHYS_ADDR) 7903 perf_output_put(handle, data->phys_addr); 7904 7905 if (sample_type & PERF_SAMPLE_CGROUP) 7906 perf_output_put(handle, data->cgroup); 7907 7908 if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) 7909 perf_output_put(handle, data->data_page_size); 7910 7911 if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) 7912 perf_output_put(handle, data->code_page_size); 7913 7914 if (sample_type & PERF_SAMPLE_AUX) { 7915 perf_output_put(handle, data->aux_size); 7916 7917 if (data->aux_size) 7918 perf_aux_sample_output(event, handle, data); 7919 } 7920 7921 if (!event->attr.watermark) { 7922 int wakeup_events = event->attr.wakeup_events; 7923 7924 if (wakeup_events) { 7925 struct perf_buffer *rb = handle->rb; 7926 int events = local_inc_return(&rb->events); 7927 7928 if (events >= wakeup_events) { 7929 local_sub(wakeup_events, &rb->events); 7930 local_inc(&rb->wakeup); 7931 } 7932 } 7933 } 7934 } 7935 7936 static u64 perf_virt_to_phys(u64 virt) 7937 { 7938 u64 phys_addr = 0; 7939 7940 if (!virt) 7941 return 0; 7942 7943 if (virt >= TASK_SIZE) { 7944 /* If it's vmalloc()d memory, leave phys_addr as 0 */ 7945 if (virt_addr_valid((void *)(uintptr_t)virt) && 7946 !(virt >= VMALLOC_START && virt < VMALLOC_END)) 7947 phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); 7948 } else { 7949 /* 7950 * Walking the pages tables for user address. 7951 * Interrupts are disabled, so it prevents any tear down 7952 * of the page tables. 7953 * Try IRQ-safe get_user_page_fast_only first. 7954 * If failed, leave phys_addr as 0. 7955 */ 7956 if (current->mm != NULL) { 7957 struct page *p; 7958 7959 pagefault_disable(); 7960 if (get_user_page_fast_only(virt, 0, &p)) { 7961 phys_addr = page_to_phys(p) + virt % PAGE_SIZE; 7962 put_page(p); 7963 } 7964 pagefault_enable(); 7965 } 7966 } 7967 7968 return phys_addr; 7969 } 7970 7971 /* 7972 * Return the pagetable size of a given virtual address. 
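 *
 * For example (architecture dependent): on x86-64 an address backed by a
 * 2MiB transparent huge page resolves at the PMD level, so the walk below
 * returns pmd_leaf_size(), i.e. 2MiB; a regular mapping walks down to the
 * PTE and reports the base page size.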
7973 */ 7974 static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) 7975 { 7976 u64 size = 0; 7977 7978 #ifdef CONFIG_HAVE_GUP_FAST 7979 pgd_t *pgdp, pgd; 7980 p4d_t *p4dp, p4d; 7981 pud_t *pudp, pud; 7982 pmd_t *pmdp, pmd; 7983 pte_t *ptep, pte; 7984 7985 pgdp = pgd_offset(mm, addr); 7986 pgd = READ_ONCE(*pgdp); 7987 if (pgd_none(pgd)) 7988 return 0; 7989 7990 if (pgd_leaf(pgd)) 7991 return pgd_leaf_size(pgd); 7992 7993 p4dp = p4d_offset_lockless(pgdp, pgd, addr); 7994 p4d = READ_ONCE(*p4dp); 7995 if (!p4d_present(p4d)) 7996 return 0; 7997 7998 if (p4d_leaf(p4d)) 7999 return p4d_leaf_size(p4d); 8000 8001 pudp = pud_offset_lockless(p4dp, p4d, addr); 8002 pud = READ_ONCE(*pudp); 8003 if (!pud_present(pud)) 8004 return 0; 8005 8006 if (pud_leaf(pud)) 8007 return pud_leaf_size(pud); 8008 8009 pmdp = pmd_offset_lockless(pudp, pud, addr); 8010 again: 8011 pmd = pmdp_get_lockless(pmdp); 8012 if (!pmd_present(pmd)) 8013 return 0; 8014 8015 if (pmd_leaf(pmd)) 8016 return pmd_leaf_size(pmd); 8017 8018 ptep = pte_offset_map(&pmd, addr); 8019 if (!ptep) 8020 goto again; 8021 8022 pte = ptep_get_lockless(ptep); 8023 if (pte_present(pte)) 8024 size = __pte_leaf_size(pmd, pte); 8025 pte_unmap(ptep); 8026 #endif /* CONFIG_HAVE_GUP_FAST */ 8027 8028 return size; 8029 } 8030 8031 static u64 perf_get_page_size(unsigned long addr) 8032 { 8033 struct mm_struct *mm; 8034 unsigned long flags; 8035 u64 size; 8036 8037 if (!addr) 8038 return 0; 8039 8040 /* 8041 * Software page-table walkers must disable IRQs, 8042 * which prevents any tear down of the page tables. 8043 */ 8044 local_irq_save(flags); 8045 8046 mm = current->mm; 8047 if (!mm) { 8048 /* 8049 * For kernel threads and the like, use init_mm so that 8050 * we can find kernel memory. 8051 */ 8052 mm = &init_mm; 8053 } 8054 8055 size = perf_get_pgtable_size(mm, addr); 8056 8057 local_irq_restore(flags); 8058 8059 return size; 8060 } 8061 8062 static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; 8063 8064 struct perf_callchain_entry * 8065 perf_callchain(struct perf_event *event, struct pt_regs *regs) 8066 { 8067 bool kernel = !event->attr.exclude_callchain_kernel; 8068 bool user = !event->attr.exclude_callchain_user; 8069 /* Disallow cross-task user callchains. */ 8070 bool crosstask = event->ctx->task && event->ctx->task != current; 8071 const u32 max_stack = event->attr.sample_max_stack; 8072 struct perf_callchain_entry *callchain; 8073 8074 if (!kernel && !user) 8075 return &__empty_callchain; 8076 8077 callchain = get_perf_callchain(regs, 0, kernel, user, 8078 max_stack, crosstask, true); 8079 return callchain ?: &__empty_callchain; 8080 } 8081 8082 static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) 8083 { 8084 return d * !!(flags & s); 8085 } 8086 8087 void perf_prepare_sample(struct perf_sample_data *data, 8088 struct perf_event *event, 8089 struct pt_regs *regs) 8090 { 8091 u64 sample_type = event->attr.sample_type; 8092 u64 filtered_sample_type; 8093 8094 /* 8095 * Add the sample flags that are dependent to others. And clear the 8096 * sample flags that have already been done by the PMU driver. 
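 *
 * Worked example: if userspace asked only for PERF_SAMPLE_PHYS_ADDR,
 * PERF_SAMPLE_ADDR is OR'd into filtered_sample_type so that data->addr
 * gets initialized below (it is needed for the physical address lookup
 * but is not emitted on its own), while any bits the PMU driver already
 * filled in via data->sample_flags are masked out so we don't overwrite
 * them.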
8097 */ 8098 filtered_sample_type = sample_type; 8099 filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, 8100 PERF_SAMPLE_IP); 8101 filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | 8102 PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); 8103 filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, 8104 PERF_SAMPLE_REGS_USER); 8105 filtered_sample_type &= ~data->sample_flags; 8106 8107 if (filtered_sample_type == 0) { 8108 /* Make sure it has the correct data->type for output */ 8109 data->type = event->attr.sample_type; 8110 return; 8111 } 8112 8113 __perf_event_header__init_id(data, event, filtered_sample_type); 8114 8115 if (filtered_sample_type & PERF_SAMPLE_IP) { 8116 data->ip = perf_instruction_pointer(event, regs); 8117 data->sample_flags |= PERF_SAMPLE_IP; 8118 } 8119 8120 if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) 8121 perf_sample_save_callchain(data, event, regs); 8122 8123 if (filtered_sample_type & PERF_SAMPLE_RAW) { 8124 data->raw = NULL; 8125 data->dyn_size += sizeof(u64); 8126 data->sample_flags |= PERF_SAMPLE_RAW; 8127 } 8128 8129 if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { 8130 data->br_stack = NULL; 8131 data->dyn_size += sizeof(u64); 8132 data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; 8133 } 8134 8135 if (filtered_sample_type & PERF_SAMPLE_REGS_USER) 8136 perf_sample_regs_user(&data->regs_user, regs); 8137 8138 /* 8139 * It cannot use the filtered_sample_type here as REGS_USER can be set 8140 * by STACK_USER (using __cond_set() above) and we don't want to update 8141 * the dyn_size if it's not requested by users. 8142 */ 8143 if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { 8144 /* regs dump ABI info */ 8145 int size = sizeof(u64); 8146 8147 if (data->regs_user.regs) { 8148 u64 mask = event->attr.sample_regs_user; 8149 size += hweight64(mask) * sizeof(u64); 8150 } 8151 8152 data->dyn_size += size; 8153 data->sample_flags |= PERF_SAMPLE_REGS_USER; 8154 } 8155 8156 if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { 8157 /* 8158 * Either we need PERF_SAMPLE_STACK_USER bit to be always 8159 * processed as the last one or have additional check added 8160 * in case new sample type is added, because we could eat 8161 * up the rest of the sample size. 8162 */ 8163 u16 stack_size = event->attr.sample_stack_user; 8164 u16 header_size = perf_sample_data_size(data, event); 8165 u16 size = sizeof(u64); 8166 8167 stack_size = perf_sample_ustack_size(stack_size, header_size, 8168 data->regs_user.regs); 8169 8170 /* 8171 * If there is something to dump, add space for the dump 8172 * itself and for the field that tells the dynamic size, 8173 * which is how many have been actually dumped. 
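 *
 * When there is something to dump, the resulting record layout is
 * roughly:
 *
 *	{ u64 size; char data[size]; u64 dyn_size; }
 *
 * with dyn_size filled in by perf_output_sample_ustack() as the number
 * of bytes actually copied.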
8174 */ 8175 if (stack_size) 8176 size += sizeof(u64) + stack_size; 8177 8178 data->stack_user_size = stack_size; 8179 data->dyn_size += size; 8180 data->sample_flags |= PERF_SAMPLE_STACK_USER; 8181 } 8182 8183 if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { 8184 data->weight.full = 0; 8185 data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 8186 } 8187 8188 if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { 8189 data->data_src.val = PERF_MEM_NA; 8190 data->sample_flags |= PERF_SAMPLE_DATA_SRC; 8191 } 8192 8193 if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { 8194 data->txn = 0; 8195 data->sample_flags |= PERF_SAMPLE_TRANSACTION; 8196 } 8197 8198 if (filtered_sample_type & PERF_SAMPLE_ADDR) { 8199 data->addr = 0; 8200 data->sample_flags |= PERF_SAMPLE_ADDR; 8201 } 8202 8203 if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { 8204 /* regs dump ABI info */ 8205 int size = sizeof(u64); 8206 8207 perf_sample_regs_intr(&data->regs_intr, regs); 8208 8209 if (data->regs_intr.regs) { 8210 u64 mask = event->attr.sample_regs_intr; 8211 8212 size += hweight64(mask) * sizeof(u64); 8213 } 8214 8215 data->dyn_size += size; 8216 data->sample_flags |= PERF_SAMPLE_REGS_INTR; 8217 } 8218 8219 if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { 8220 data->phys_addr = perf_virt_to_phys(data->addr); 8221 data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; 8222 } 8223 8224 #ifdef CONFIG_CGROUP_PERF 8225 if (filtered_sample_type & PERF_SAMPLE_CGROUP) { 8226 struct cgroup *cgrp; 8227 8228 /* protected by RCU */ 8229 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; 8230 data->cgroup = cgroup_id(cgrp); 8231 data->sample_flags |= PERF_SAMPLE_CGROUP; 8232 } 8233 #endif 8234 8235 /* 8236 * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user didn't 8237 * request PERF_SAMPLE_ADDR, the kernel implicitly retrieves data->addr, 8238 * but the value is not dumped to userspace. 8239 */ 8240 if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { 8241 data->data_page_size = perf_get_page_size(data->addr); 8242 data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; 8243 } 8244 8245 if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { 8246 data->code_page_size = perf_get_page_size(data->ip); 8247 data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; 8248 } 8249 8250 if (filtered_sample_type & PERF_SAMPLE_AUX) { 8251 u64 size; 8252 u16 header_size = perf_sample_data_size(data, event); 8253 8254 header_size += sizeof(u64); /* size */ 8255 8256 /* 8257 * Given the 16-bit nature of header::size, an AUX sample can 8258 * easily overflow it, given all the preceding sample bits. 8259 * Make sure this doesn't happen by using up to U16_MAX bytes 8260 * per sample in total (rounded down to an 8 byte boundary).
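 * E.g. with header_size == 0x150 and attr.aux_sample_size == 1MB, the
 * AUX snapshot below is clamped to rounddown(0xffff - 0x150, 8) ==
 * 0xfea8 bytes before being handed to perf_prepare_sample_aux().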
8261 */ 8262 size = min_t(size_t, U16_MAX - header_size, 8263 event->attr.aux_sample_size); 8264 size = rounddown(size, 8); 8265 size = perf_prepare_sample_aux(event, data, size); 8266 8267 WARN_ON_ONCE(size + header_size > U16_MAX); 8268 data->dyn_size += size + sizeof(u64); /* size above */ 8269 data->sample_flags |= PERF_SAMPLE_AUX; 8270 } 8271 } 8272 8273 void perf_prepare_header(struct perf_event_header *header, 8274 struct perf_sample_data *data, 8275 struct perf_event *event, 8276 struct pt_regs *regs) 8277 { 8278 header->type = PERF_RECORD_SAMPLE; 8279 header->size = perf_sample_data_size(data, event); 8280 header->misc = perf_misc_flags(event, regs); 8281 8282 /* 8283 * If you're adding more sample types here, you likely need to do 8284 * something about the overflowing header::size, like repurposing the 8285 * lowest 3 bits of size, which should always be zero at the moment. 8286 * This raises a more important question: do we really need 512k sized 8287 * samples, and why? Good argumentation is in order for whatever you 8288 * do here next. 8289 */ 8290 WARN_ON_ONCE(header->size & 7); 8291 } 8292 8293 static void __perf_event_aux_pause(struct perf_event *event, bool pause) 8294 { 8295 if (pause) { 8296 if (!event->hw.aux_paused) { 8297 event->hw.aux_paused = 1; 8298 event->pmu->stop(event, PERF_EF_PAUSE); 8299 } 8300 } else { 8301 if (event->hw.aux_paused) { 8302 event->hw.aux_paused = 0; 8303 event->pmu->start(event, PERF_EF_RESUME); 8304 } 8305 } 8306 } 8307 8308 static void perf_event_aux_pause(struct perf_event *event, bool pause) 8309 { 8310 struct perf_buffer *rb; 8311 8312 if (WARN_ON_ONCE(!event)) 8313 return; 8314 8315 rb = ring_buffer_get(event); 8316 if (!rb) 8317 return; 8318 8319 scoped_guard (irqsave) { 8320 /* 8321 * Guard against self-recursion here. Another event could trip 8322 * over this same path from NMI context.
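 * With IRQs off (scoped_guard(irqsave) above), the only way back into
 * this path on the local CPU is an NMI-context overflow; that attempt
 * sees the flag via READ_ONCE() below and bails out instead of
 * recursing into pmu->stop()/pmu->start().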
8323 */ 8324 if (READ_ONCE(rb->aux_in_pause_resume)) 8325 break; 8326 8327 WRITE_ONCE(rb->aux_in_pause_resume, 1); 8328 barrier(); 8329 __perf_event_aux_pause(event, pause); 8330 barrier(); 8331 WRITE_ONCE(rb->aux_in_pause_resume, 0); 8332 } 8333 ring_buffer_put(rb); 8334 } 8335 8336 static __always_inline int 8337 __perf_event_output(struct perf_event *event, 8338 struct perf_sample_data *data, 8339 struct pt_regs *regs, 8340 int (*output_begin)(struct perf_output_handle *, 8341 struct perf_sample_data *, 8342 struct perf_event *, 8343 unsigned int)) 8344 { 8345 struct perf_output_handle handle; 8346 struct perf_event_header header; 8347 int err; 8348 8349 /* protect the callchain buffers */ 8350 rcu_read_lock(); 8351 8352 perf_prepare_sample(data, event, regs); 8353 perf_prepare_header(&header, data, event, regs); 8354 8355 err = output_begin(&handle, data, event, header.size); 8356 if (err) 8357 goto exit; 8358 8359 perf_output_sample(&handle, &header, data, event); 8360 8361 perf_output_end(&handle); 8362 8363 exit: 8364 rcu_read_unlock(); 8365 return err; 8366 } 8367 8368 void 8369 perf_event_output_forward(struct perf_event *event, 8370 struct perf_sample_data *data, 8371 struct pt_regs *regs) 8372 { 8373 __perf_event_output(event, data, regs, perf_output_begin_forward); 8374 } 8375 8376 void 8377 perf_event_output_backward(struct perf_event *event, 8378 struct perf_sample_data *data, 8379 struct pt_regs *regs) 8380 { 8381 __perf_event_output(event, data, regs, perf_output_begin_backward); 8382 } 8383 8384 int 8385 perf_event_output(struct perf_event *event, 8386 struct perf_sample_data *data, 8387 struct pt_regs *regs) 8388 { 8389 return __perf_event_output(event, data, regs, perf_output_begin); 8390 } 8391 8392 /* 8393 * read event_id 8394 */ 8395 8396 struct perf_read_event { 8397 struct perf_event_header header; 8398 8399 u32 pid; 8400 u32 tid; 8401 }; 8402 8403 static void 8404 perf_event_read_event(struct perf_event *event, 8405 struct task_struct *task) 8406 { 8407 struct perf_output_handle handle; 8408 struct perf_sample_data sample; 8409 struct perf_read_event read_event = { 8410 .header = { 8411 .type = PERF_RECORD_READ, 8412 .misc = 0, 8413 .size = sizeof(read_event) + event->read_size, 8414 }, 8415 .pid = perf_event_pid(event, task), 8416 .tid = perf_event_tid(event, task), 8417 }; 8418 int ret; 8419 8420 perf_event_header__init_id(&read_event.header, &sample, event); 8421 ret = perf_output_begin(&handle, &sample, event, read_event.header.size); 8422 if (ret) 8423 return; 8424 8425 perf_output_put(&handle, read_event); 8426 perf_output_read(&handle, event); 8427 perf_event__output_id_sample(event, &handle, &sample); 8428 8429 perf_output_end(&handle); 8430 } 8431 8432 typedef void (perf_iterate_f)(struct perf_event *event, void *data); 8433 8434 static void 8435 perf_iterate_ctx(struct perf_event_context *ctx, 8436 perf_iterate_f output, 8437 void *data, bool all) 8438 { 8439 struct perf_event *event; 8440 8441 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 8442 if (!all) { 8443 if (event->state < PERF_EVENT_STATE_INACTIVE) 8444 continue; 8445 if (!event_filter_match(event)) 8446 continue; 8447 } 8448 8449 output(event, data); 8450 } 8451 } 8452 8453 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) 8454 { 8455 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); 8456 struct perf_event *event; 8457 8458 list_for_each_entry_rcu(event, &pel->list, sb_list) { 8459 /* 8460 * Skip events that are not fully formed yet; ensure that 8461 * 
if we observe event->ctx, both event and ctx will be 8462 * complete enough. See perf_install_in_context(). 8463 */ 8464 if (!smp_load_acquire(&event->ctx)) 8465 continue; 8466 8467 if (event->state < PERF_EVENT_STATE_INACTIVE) 8468 continue; 8469 if (!event_filter_match(event)) 8470 continue; 8471 output(event, data); 8472 } 8473 } 8474 8475 /* 8476 * Iterate all events that need to receive side-band events. 8477 * 8478 * For new callers; ensure that account_pmu_sb_event() includes 8479 * your event, otherwise it might not get delivered. 8480 */ 8481 static void 8482 perf_iterate_sb(perf_iterate_f output, void *data, 8483 struct perf_event_context *task_ctx) 8484 { 8485 struct perf_event_context *ctx; 8486 8487 rcu_read_lock(); 8488 preempt_disable(); 8489 8490 /* 8491 * If we have task_ctx != NULL we only notify the task context itself. 8492 * The task_ctx is set only for EXIT events before releasing task 8493 * context. 8494 */ 8495 if (task_ctx) { 8496 perf_iterate_ctx(task_ctx, output, data, false); 8497 goto done; 8498 } 8499 8500 perf_iterate_sb_cpu(output, data); 8501 8502 ctx = rcu_dereference(current->perf_event_ctxp); 8503 if (ctx) 8504 perf_iterate_ctx(ctx, output, data, false); 8505 done: 8506 preempt_enable(); 8507 rcu_read_unlock(); 8508 } 8509 8510 /* 8511 * Clear all file-based filters at exec, they'll have to be 8512 * re-instated when/if these objects are mmapped again. 8513 */ 8514 static void perf_event_addr_filters_exec(struct perf_event *event, void *data) 8515 { 8516 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 8517 struct perf_addr_filter *filter; 8518 unsigned int restart = 0, count = 0; 8519 unsigned long flags; 8520 8521 if (!has_addr_filter(event)) 8522 return; 8523 8524 raw_spin_lock_irqsave(&ifh->lock, flags); 8525 list_for_each_entry(filter, &ifh->list, entry) { 8526 if (filter->path.dentry) { 8527 event->addr_filter_ranges[count].start = 0; 8528 event->addr_filter_ranges[count].size = 0; 8529 restart++; 8530 } 8531 8532 count++; 8533 } 8534 8535 if (restart) 8536 event->addr_filters_gen++; 8537 raw_spin_unlock_irqrestore(&ifh->lock, flags); 8538 8539 if (restart) 8540 perf_event_stop(event, 1); 8541 } 8542 8543 void perf_event_exec(void) 8544 { 8545 struct perf_event_context *ctx; 8546 8547 ctx = perf_pin_task_context(current); 8548 if (!ctx) 8549 return; 8550 8551 perf_event_enable_on_exec(ctx); 8552 perf_event_remove_on_exec(ctx); 8553 scoped_guard(rcu) 8554 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); 8555 8556 perf_unpin_context(ctx); 8557 put_ctx(ctx); 8558 } 8559 8560 struct remote_output { 8561 struct perf_buffer *rb; 8562 int err; 8563 }; 8564 8565 static void __perf_event_output_stop(struct perf_event *event, void *data) 8566 { 8567 struct perf_event *parent = event->parent; 8568 struct remote_output *ro = data; 8569 struct perf_buffer *rb = ro->rb; 8570 struct stop_event_data sd = { 8571 .event = event, 8572 }; 8573 8574 if (!has_aux(event)) 8575 return; 8576 8577 if (!parent) 8578 parent = event; 8579 8580 /* 8581 * In case of inheritance, it will be the parent that links to the 8582 * ring-buffer, but it will be the child that's actually using it. 8583 * 8584 * We are using event::rb to determine if the event should be stopped, 8585 * however this may race with ring_buffer_attach() (through set_output), 8586 * which will make us skip the event that actually needs to be stopped. 8587 * So ring_buffer_attach() has to stop an aux event before re-assigning 8588 * its rb pointer. 
8589 */ 8590 if (rcu_dereference(parent->rb) == rb) 8591 ro->err = __perf_event_stop(&sd); 8592 } 8593 8594 static int __perf_pmu_output_stop(void *info) 8595 { 8596 struct perf_event *event = info; 8597 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 8598 struct remote_output ro = { 8599 .rb = event->rb, 8600 }; 8601 8602 rcu_read_lock(); 8603 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); 8604 if (cpuctx->task_ctx) 8605 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, 8606 &ro, false); 8607 rcu_read_unlock(); 8608 8609 return ro.err; 8610 } 8611 8612 static void perf_pmu_output_stop(struct perf_event *event) 8613 { 8614 struct perf_event *iter; 8615 int err, cpu; 8616 8617 restart: 8618 rcu_read_lock(); 8619 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { 8620 /* 8621 * For per-CPU events, we need to make sure that neither they 8622 * nor their children are running; for cpu==-1 events it's 8623 * sufficient to stop the event itself if it's active, since 8624 * it can't have children. 8625 */ 8626 cpu = iter->cpu; 8627 if (cpu == -1) 8628 cpu = READ_ONCE(iter->oncpu); 8629 8630 if (cpu == -1) 8631 continue; 8632 8633 err = cpu_function_call(cpu, __perf_pmu_output_stop, event); 8634 if (err == -EAGAIN) { 8635 rcu_read_unlock(); 8636 goto restart; 8637 } 8638 } 8639 rcu_read_unlock(); 8640 } 8641 8642 /* 8643 * task tracking -- fork/exit 8644 * 8645 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task 8646 */ 8647 8648 struct perf_task_event { 8649 struct task_struct *task; 8650 struct perf_event_context *task_ctx; 8651 8652 struct { 8653 struct perf_event_header header; 8654 8655 u32 pid; 8656 u32 ppid; 8657 u32 tid; 8658 u32 ptid; 8659 u64 time; 8660 } event_id; 8661 }; 8662 8663 static int perf_event_task_match(struct perf_event *event) 8664 { 8665 return event->attr.comm || event->attr.mmap || 8666 event->attr.mmap2 || event->attr.mmap_data || 8667 event->attr.task; 8668 } 8669 8670 static void perf_event_task_output(struct perf_event *event, 8671 void *data) 8672 { 8673 struct perf_task_event *task_event = data; 8674 struct perf_output_handle handle; 8675 struct perf_sample_data sample; 8676 struct task_struct *task = task_event->task; 8677 int ret, size = task_event->event_id.header.size; 8678 8679 if (!perf_event_task_match(event)) 8680 return; 8681 8682 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 8683 8684 ret = perf_output_begin(&handle, &sample, event, 8685 task_event->event_id.header.size); 8686 if (ret) 8687 goto out; 8688 8689 task_event->event_id.pid = perf_event_pid(event, task); 8690 task_event->event_id.tid = perf_event_tid(event, task); 8691 8692 if (task_event->event_id.header.type == PERF_RECORD_EXIT) { 8693 task_event->event_id.ppid = perf_event_pid(event, 8694 task->real_parent); 8695 task_event->event_id.ptid = perf_event_pid(event, 8696 task->real_parent); 8697 } else { /* PERF_RECORD_FORK */ 8698 task_event->event_id.ppid = perf_event_pid(event, current); 8699 task_event->event_id.ptid = perf_event_tid(event, current); 8700 } 8701 8702 task_event->event_id.time = perf_event_clock(event); 8703 8704 perf_output_put(&handle, task_event->event_id); 8705 8706 perf_event__output_id_sample(event, &handle, &sample); 8707 8708 perf_output_end(&handle); 8709 out: 8710 task_event->event_id.header.size = size; 8711 } 8712 8713 static void perf_event_task(struct task_struct *task, 8714 struct perf_event_context *task_ctx, 8715 int new) 8716 { 8717 struct 
perf_task_event task_event; 8718 8719 if (!atomic_read(&nr_comm_events) && 8720 !atomic_read(&nr_mmap_events) && 8721 !atomic_read(&nr_task_events)) 8722 return; 8723 8724 task_event = (struct perf_task_event){ 8725 .task = task, 8726 .task_ctx = task_ctx, 8727 .event_id = { 8728 .header = { 8729 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, 8730 .misc = 0, 8731 .size = sizeof(task_event.event_id), 8732 }, 8733 /* .pid */ 8734 /* .ppid */ 8735 /* .tid */ 8736 /* .ptid */ 8737 /* .time */ 8738 }, 8739 }; 8740 8741 perf_iterate_sb(perf_event_task_output, 8742 &task_event, 8743 task_ctx); 8744 } 8745 8746 /* 8747 * Allocate data for a new task when profiling system-wide 8748 * events which require PMU specific data 8749 */ 8750 static void 8751 perf_event_alloc_task_data(struct task_struct *child, 8752 struct task_struct *parent) 8753 { 8754 struct kmem_cache *ctx_cache = NULL; 8755 struct perf_ctx_data *cd; 8756 8757 if (!refcount_read(&global_ctx_data_ref)) 8758 return; 8759 8760 scoped_guard (rcu) { 8761 cd = rcu_dereference(parent->perf_ctx_data); 8762 if (cd) 8763 ctx_cache = cd->ctx_cache; 8764 } 8765 8766 if (!ctx_cache) 8767 return; 8768 8769 guard(percpu_read)(&global_ctx_data_rwsem); 8770 scoped_guard (rcu) { 8771 cd = rcu_dereference(child->perf_ctx_data); 8772 if (!cd) { 8773 /* 8774 * A system-wide event may be unaccount, 8775 * when attaching the perf_ctx_data. 8776 */ 8777 if (!refcount_read(&global_ctx_data_ref)) 8778 return; 8779 goto attach; 8780 } 8781 8782 if (!cd->global) { 8783 cd->global = 1; 8784 refcount_inc(&cd->refcount); 8785 } 8786 } 8787 8788 return; 8789 attach: 8790 attach_task_ctx_data(child, ctx_cache, true); 8791 } 8792 8793 void perf_event_fork(struct task_struct *task) 8794 { 8795 perf_event_task(task, NULL, 1); 8796 perf_event_namespaces(task); 8797 perf_event_alloc_task_data(task, current); 8798 } 8799 8800 /* 8801 * comm tracking 8802 */ 8803 8804 struct perf_comm_event { 8805 struct task_struct *task; 8806 char *comm; 8807 int comm_size; 8808 8809 struct { 8810 struct perf_event_header header; 8811 8812 u32 pid; 8813 u32 tid; 8814 } event_id; 8815 }; 8816 8817 static int perf_event_comm_match(struct perf_event *event) 8818 { 8819 return event->attr.comm; 8820 } 8821 8822 static void perf_event_comm_output(struct perf_event *event, 8823 void *data) 8824 { 8825 struct perf_comm_event *comm_event = data; 8826 struct perf_output_handle handle; 8827 struct perf_sample_data sample; 8828 int size = comm_event->event_id.header.size; 8829 int ret; 8830 8831 if (!perf_event_comm_match(event)) 8832 return; 8833 8834 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 8835 ret = perf_output_begin(&handle, &sample, event, 8836 comm_event->event_id.header.size); 8837 8838 if (ret) 8839 goto out; 8840 8841 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 8842 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 8843 8844 perf_output_put(&handle, comm_event->event_id); 8845 __output_copy(&handle, comm_event->comm, 8846 comm_event->comm_size); 8847 8848 perf_event__output_id_sample(event, &handle, &sample); 8849 8850 perf_output_end(&handle); 8851 out: 8852 comm_event->event_id.header.size = size; 8853 } 8854 8855 static void perf_event_comm_event(struct perf_comm_event *comm_event) 8856 { 8857 char comm[TASK_COMM_LEN]; 8858 unsigned int size; 8859 8860 memset(comm, 0, sizeof(comm)); 8861 strscpy(comm, comm_event->task->comm); 8862 size = ALIGN(strlen(comm)+1, sizeof(u64)); 8863 8864 comm_event->comm = comm; 
8865 comm_event->comm_size = size; 8866 8867 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 8868 8869 perf_iterate_sb(perf_event_comm_output, 8870 comm_event, 8871 NULL); 8872 } 8873 8874 void perf_event_comm(struct task_struct *task, bool exec) 8875 { 8876 struct perf_comm_event comm_event; 8877 8878 if (!atomic_read(&nr_comm_events)) 8879 return; 8880 8881 comm_event = (struct perf_comm_event){ 8882 .task = task, 8883 /* .comm */ 8884 /* .comm_size */ 8885 .event_id = { 8886 .header = { 8887 .type = PERF_RECORD_COMM, 8888 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, 8889 /* .size */ 8890 }, 8891 /* .pid */ 8892 /* .tid */ 8893 }, 8894 }; 8895 8896 perf_event_comm_event(&comm_event); 8897 } 8898 8899 /* 8900 * namespaces tracking 8901 */ 8902 8903 struct perf_namespaces_event { 8904 struct task_struct *task; 8905 8906 struct { 8907 struct perf_event_header header; 8908 8909 u32 pid; 8910 u32 tid; 8911 u64 nr_namespaces; 8912 struct perf_ns_link_info link_info[NR_NAMESPACES]; 8913 } event_id; 8914 }; 8915 8916 static int perf_event_namespaces_match(struct perf_event *event) 8917 { 8918 return event->attr.namespaces; 8919 } 8920 8921 static void perf_event_namespaces_output(struct perf_event *event, 8922 void *data) 8923 { 8924 struct perf_namespaces_event *namespaces_event = data; 8925 struct perf_output_handle handle; 8926 struct perf_sample_data sample; 8927 u16 header_size = namespaces_event->event_id.header.size; 8928 int ret; 8929 8930 if (!perf_event_namespaces_match(event)) 8931 return; 8932 8933 perf_event_header__init_id(&namespaces_event->event_id.header, 8934 &sample, event); 8935 ret = perf_output_begin(&handle, &sample, event, 8936 namespaces_event->event_id.header.size); 8937 if (ret) 8938 goto out; 8939 8940 namespaces_event->event_id.pid = perf_event_pid(event, 8941 namespaces_event->task); 8942 namespaces_event->event_id.tid = perf_event_tid(event, 8943 namespaces_event->task); 8944 8945 perf_output_put(&handle, namespaces_event->event_id); 8946 8947 perf_event__output_id_sample(event, &handle, &sample); 8948 8949 perf_output_end(&handle); 8950 out: 8951 namespaces_event->event_id.header.size = header_size; 8952 } 8953 8954 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, 8955 struct task_struct *task, 8956 const struct proc_ns_operations *ns_ops) 8957 { 8958 struct path ns_path; 8959 struct inode *ns_inode; 8960 int error; 8961 8962 error = ns_get_path(&ns_path, task, ns_ops); 8963 if (!error) { 8964 ns_inode = ns_path.dentry->d_inode; 8965 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); 8966 ns_link_info->ino = ns_inode->i_ino; 8967 path_put(&ns_path); 8968 } 8969 } 8970 8971 void perf_event_namespaces(struct task_struct *task) 8972 { 8973 struct perf_namespaces_event namespaces_event; 8974 struct perf_ns_link_info *ns_link_info; 8975 8976 if (!atomic_read(&nr_namespaces_events)) 8977 return; 8978 8979 namespaces_event = (struct perf_namespaces_event){ 8980 .task = task, 8981 .event_id = { 8982 .header = { 8983 .type = PERF_RECORD_NAMESPACES, 8984 .misc = 0, 8985 .size = sizeof(namespaces_event.event_id), 8986 }, 8987 /* .pid */ 8988 /* .tid */ 8989 .nr_namespaces = NR_NAMESPACES, 8990 /* .link_info[NR_NAMESPACES] */ 8991 }, 8992 }; 8993 8994 ns_link_info = namespaces_event.event_id.link_info; 8995 8996 perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], 8997 task, &mntns_operations); 8998 8999 #ifdef CONFIG_USER_NS 9000 perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], 9001 task, &userns_operations); 
9002 #endif 9003 #ifdef CONFIG_NET_NS 9004 perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], 9005 task, &netns_operations); 9006 #endif 9007 #ifdef CONFIG_UTS_NS 9008 perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], 9009 task, &utsns_operations); 9010 #endif 9011 #ifdef CONFIG_IPC_NS 9012 perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], 9013 task, &ipcns_operations); 9014 #endif 9015 #ifdef CONFIG_PID_NS 9016 perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], 9017 task, &pidns_operations); 9018 #endif 9019 #ifdef CONFIG_CGROUPS 9020 perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], 9021 task, &cgroupns_operations); 9022 #endif 9023 9024 perf_iterate_sb(perf_event_namespaces_output, 9025 &namespaces_event, 9026 NULL); 9027 } 9028 9029 /* 9030 * cgroup tracking 9031 */ 9032 #ifdef CONFIG_CGROUP_PERF 9033 9034 struct perf_cgroup_event { 9035 char *path; 9036 int path_size; 9037 struct { 9038 struct perf_event_header header; 9039 u64 id; 9040 char path[]; 9041 } event_id; 9042 }; 9043 9044 static int perf_event_cgroup_match(struct perf_event *event) 9045 { 9046 return event->attr.cgroup; 9047 } 9048 9049 static void perf_event_cgroup_output(struct perf_event *event, void *data) 9050 { 9051 struct perf_cgroup_event *cgroup_event = data; 9052 struct perf_output_handle handle; 9053 struct perf_sample_data sample; 9054 u16 header_size = cgroup_event->event_id.header.size; 9055 int ret; 9056 9057 if (!perf_event_cgroup_match(event)) 9058 return; 9059 9060 perf_event_header__init_id(&cgroup_event->event_id.header, 9061 &sample, event); 9062 ret = perf_output_begin(&handle, &sample, event, 9063 cgroup_event->event_id.header.size); 9064 if (ret) 9065 goto out; 9066 9067 perf_output_put(&handle, cgroup_event->event_id); 9068 __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); 9069 9070 perf_event__output_id_sample(event, &handle, &sample); 9071 9072 perf_output_end(&handle); 9073 out: 9074 cgroup_event->event_id.header.size = header_size; 9075 } 9076 9077 static void perf_event_cgroup(struct cgroup *cgrp) 9078 { 9079 struct perf_cgroup_event cgroup_event; 9080 char path_enomem[16] = "//enomem"; 9081 char *pathname; 9082 size_t size; 9083 9084 if (!atomic_read(&nr_cgroup_events)) 9085 return; 9086 9087 cgroup_event = (struct perf_cgroup_event){ 9088 .event_id = { 9089 .header = { 9090 .type = PERF_RECORD_CGROUP, 9091 .misc = 0, 9092 .size = sizeof(cgroup_event.event_id), 9093 }, 9094 .id = cgroup_id(cgrp), 9095 }, 9096 }; 9097 9098 pathname = kmalloc(PATH_MAX, GFP_KERNEL); 9099 if (pathname == NULL) { 9100 cgroup_event.path = path_enomem; 9101 } else { 9102 /* just to be sure to have enough space for alignment */ 9103 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); 9104 cgroup_event.path = pathname; 9105 } 9106 9107 /* 9108 * Since our buffer works in 8 byte units we need to align our string 9109 * size to a multiple of 8. However, we must guarantee the tail end is 9110 * zero'd out to avoid leaking random bits to userspace. 
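 * E.g. the root cgroup's path "/" is 2 bytes including the NUL, so six
 * more '\0' bytes are appended below and path_size ends up being 8.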
9111 */ 9112 size = strlen(cgroup_event.path) + 1; 9113 while (!IS_ALIGNED(size, sizeof(u64))) 9114 cgroup_event.path[size++] = '\0'; 9115 9116 cgroup_event.event_id.header.size += size; 9117 cgroup_event.path_size = size; 9118 9119 perf_iterate_sb(perf_event_cgroup_output, 9120 &cgroup_event, 9121 NULL); 9122 9123 kfree(pathname); 9124 } 9125 9126 #endif 9127 9128 /* 9129 * mmap tracking 9130 */ 9131 9132 struct perf_mmap_event { 9133 struct vm_area_struct *vma; 9134 9135 const char *file_name; 9136 int file_size; 9137 int maj, min; 9138 u64 ino; 9139 u64 ino_generation; 9140 u32 prot, flags; 9141 u8 build_id[BUILD_ID_SIZE_MAX]; 9142 u32 build_id_size; 9143 9144 struct { 9145 struct perf_event_header header; 9146 9147 u32 pid; 9148 u32 tid; 9149 u64 start; 9150 u64 len; 9151 u64 pgoff; 9152 } event_id; 9153 }; 9154 9155 static int perf_event_mmap_match(struct perf_event *event, 9156 void *data) 9157 { 9158 struct perf_mmap_event *mmap_event = data; 9159 struct vm_area_struct *vma = mmap_event->vma; 9160 int executable = vma->vm_flags & VM_EXEC; 9161 9162 return (!executable && event->attr.mmap_data) || 9163 (executable && (event->attr.mmap || event->attr.mmap2)); 9164 } 9165 9166 static void perf_event_mmap_output(struct perf_event *event, 9167 void *data) 9168 { 9169 struct perf_mmap_event *mmap_event = data; 9170 struct perf_output_handle handle; 9171 struct perf_sample_data sample; 9172 int size = mmap_event->event_id.header.size; 9173 u32 type = mmap_event->event_id.header.type; 9174 bool use_build_id; 9175 int ret; 9176 9177 if (!perf_event_mmap_match(event, data)) 9178 return; 9179 9180 if (event->attr.mmap2) { 9181 mmap_event->event_id.header.type = PERF_RECORD_MMAP2; 9182 mmap_event->event_id.header.size += sizeof(mmap_event->maj); 9183 mmap_event->event_id.header.size += sizeof(mmap_event->min); 9184 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 9185 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 9186 mmap_event->event_id.header.size += sizeof(mmap_event->prot); 9187 mmap_event->event_id.header.size += sizeof(mmap_event->flags); 9188 } 9189 9190 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 9191 ret = perf_output_begin(&handle, &sample, event, 9192 mmap_event->event_id.header.size); 9193 if (ret) 9194 goto out; 9195 9196 mmap_event->event_id.pid = perf_event_pid(event, current); 9197 mmap_event->event_id.tid = perf_event_tid(event, current); 9198 9199 use_build_id = event->attr.build_id && mmap_event->build_id_size; 9200 9201 if (event->attr.mmap2 && use_build_id) 9202 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; 9203 9204 perf_output_put(&handle, mmap_event->event_id); 9205 9206 if (event->attr.mmap2) { 9207 if (use_build_id) { 9208 u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; 9209 9210 __output_copy(&handle, size, 4); 9211 __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); 9212 } else { 9213 perf_output_put(&handle, mmap_event->maj); 9214 perf_output_put(&handle, mmap_event->min); 9215 perf_output_put(&handle, mmap_event->ino); 9216 perf_output_put(&handle, mmap_event->ino_generation); 9217 } 9218 perf_output_put(&handle, mmap_event->prot); 9219 perf_output_put(&handle, mmap_event->flags); 9220 } 9221 9222 __output_copy(&handle, mmap_event->file_name, 9223 mmap_event->file_size); 9224 9225 perf_event__output_id_sample(event, &handle, &sample); 9226 9227 perf_output_end(&handle); 9228 out: 9229 mmap_event->event_id.header.size = size; 9230 
mmap_event->event_id.header.type = type; 9231 } 9232 9233 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 9234 { 9235 struct vm_area_struct *vma = mmap_event->vma; 9236 struct file *file = vma->vm_file; 9237 int maj = 0, min = 0; 9238 u64 ino = 0, gen = 0; 9239 u32 prot = 0, flags = 0; 9240 unsigned int size; 9241 char tmp[16]; 9242 char *buf = NULL; 9243 char *name = NULL; 9244 9245 if (vma->vm_flags & VM_READ) 9246 prot |= PROT_READ; 9247 if (vma->vm_flags & VM_WRITE) 9248 prot |= PROT_WRITE; 9249 if (vma->vm_flags & VM_EXEC) 9250 prot |= PROT_EXEC; 9251 9252 if (vma->vm_flags & VM_MAYSHARE) 9253 flags = MAP_SHARED; 9254 else 9255 flags = MAP_PRIVATE; 9256 9257 if (vma->vm_flags & VM_LOCKED) 9258 flags |= MAP_LOCKED; 9259 if (is_vm_hugetlb_page(vma)) 9260 flags |= MAP_HUGETLB; 9261 9262 if (file) { 9263 struct inode *inode; 9264 dev_t dev; 9265 9266 buf = kmalloc(PATH_MAX, GFP_KERNEL); 9267 if (!buf) { 9268 name = "//enomem"; 9269 goto cpy_name; 9270 } 9271 /* 9272 * d_path() works from the end of the rb backwards, so we 9273 * need to add enough zero bytes after the string to handle 9274 * the 64bit alignment we do later. 9275 */ 9276 name = file_path(file, buf, PATH_MAX - sizeof(u64)); 9277 if (IS_ERR(name)) { 9278 name = "//toolong"; 9279 goto cpy_name; 9280 } 9281 inode = file_inode(vma->vm_file); 9282 dev = inode->i_sb->s_dev; 9283 ino = inode->i_ino; 9284 gen = inode->i_generation; 9285 maj = MAJOR(dev); 9286 min = MINOR(dev); 9287 9288 goto got_name; 9289 } else { 9290 if (vma->vm_ops && vma->vm_ops->name) 9291 name = (char *) vma->vm_ops->name(vma); 9292 if (!name) 9293 name = (char *)arch_vma_name(vma); 9294 if (!name) { 9295 if (vma_is_initial_heap(vma)) 9296 name = "[heap]"; 9297 else if (vma_is_initial_stack(vma)) 9298 name = "[stack]"; 9299 else 9300 name = "//anon"; 9301 } 9302 } 9303 9304 cpy_name: 9305 strscpy(tmp, name); 9306 name = tmp; 9307 got_name: 9308 /* 9309 * Since our buffer works in 8 byte units we need to align our string 9310 * size to a multiple of 8. However, we must guarantee the tail end is 9311 * zero'd out to avoid leaking random bits to userspace. 9312 */ 9313 size = strlen(name)+1; 9314 while (!IS_ALIGNED(size, sizeof(u64))) 9315 name[size++] = '\0'; 9316 9317 mmap_event->file_name = name; 9318 mmap_event->file_size = size; 9319 mmap_event->maj = maj; 9320 mmap_event->min = min; 9321 mmap_event->ino = ino; 9322 mmap_event->ino_generation = gen; 9323 mmap_event->prot = prot; 9324 mmap_event->flags = flags; 9325 9326 if (!(vma->vm_flags & VM_EXEC)) 9327 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 9328 9329 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 9330 9331 if (atomic_read(&nr_build_id_events)) 9332 build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); 9333 9334 perf_iterate_sb(perf_event_mmap_output, 9335 mmap_event, 9336 NULL); 9337 9338 kfree(buf); 9339 } 9340 9341 /* 9342 * Check whether inode and address range match filter criteria. 
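 * That is, a filter matches when it names the same inode as the mapped
 * file and the two file ranges overlap; e.g. a filter at file offset
 * 0x1000 with size 0x100 matches a VMA mapping file offsets
 * [0xf00, 0x2000).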
9343 */ 9344 static bool perf_addr_filter_match(struct perf_addr_filter *filter, 9345 struct file *file, unsigned long offset, 9346 unsigned long size) 9347 { 9348 /* d_inode(NULL) won't be equal to any mapped user-space file */ 9349 if (!filter->path.dentry) 9350 return false; 9351 9352 if (d_inode(filter->path.dentry) != file_inode(file)) 9353 return false; 9354 9355 if (filter->offset > offset + size) 9356 return false; 9357 9358 if (filter->offset + filter->size < offset) 9359 return false; 9360 9361 return true; 9362 } 9363 9364 static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, 9365 struct vm_area_struct *vma, 9366 struct perf_addr_filter_range *fr) 9367 { 9368 unsigned long vma_size = vma->vm_end - vma->vm_start; 9369 unsigned long off = vma->vm_pgoff << PAGE_SHIFT; 9370 struct file *file = vma->vm_file; 9371 9372 if (!perf_addr_filter_match(filter, file, off, vma_size)) 9373 return false; 9374 9375 if (filter->offset < off) { 9376 fr->start = vma->vm_start; 9377 fr->size = min(vma_size, filter->size - (off - filter->offset)); 9378 } else { 9379 fr->start = vma->vm_start + filter->offset - off; 9380 fr->size = min(vma->vm_end - fr->start, filter->size); 9381 } 9382 9383 return true; 9384 } 9385 9386 static void __perf_addr_filters_adjust(struct perf_event *event, void *data) 9387 { 9388 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 9389 struct vm_area_struct *vma = data; 9390 struct perf_addr_filter *filter; 9391 unsigned int restart = 0, count = 0; 9392 unsigned long flags; 9393 9394 if (!has_addr_filter(event)) 9395 return; 9396 9397 if (!vma->vm_file) 9398 return; 9399 9400 raw_spin_lock_irqsave(&ifh->lock, flags); 9401 list_for_each_entry(filter, &ifh->list, entry) { 9402 if (perf_addr_filter_vma_adjust(filter, vma, 9403 &event->addr_filter_ranges[count])) 9404 restart++; 9405 9406 count++; 9407 } 9408 9409 if (restart) 9410 event->addr_filters_gen++; 9411 raw_spin_unlock_irqrestore(&ifh->lock, flags); 9412 9413 if (restart) 9414 perf_event_stop(event, 1); 9415 } 9416 9417 /* 9418 * Adjust all task's events' filters to the new vma 9419 */ 9420 static void perf_addr_filters_adjust(struct vm_area_struct *vma) 9421 { 9422 struct perf_event_context *ctx; 9423 9424 /* 9425 * Data tracing isn't supported yet and as such there is no need 9426 * to keep track of anything that isn't related to executable code: 9427 */ 9428 if (!(vma->vm_flags & VM_EXEC)) 9429 return; 9430 9431 rcu_read_lock(); 9432 ctx = rcu_dereference(current->perf_event_ctxp); 9433 if (ctx) 9434 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); 9435 rcu_read_unlock(); 9436 } 9437 9438 void perf_event_mmap(struct vm_area_struct *vma) 9439 { 9440 struct perf_mmap_event mmap_event; 9441 9442 if (!atomic_read(&nr_mmap_events)) 9443 return; 9444 9445 mmap_event = (struct perf_mmap_event){ 9446 .vma = vma, 9447 /* .file_name */ 9448 /* .file_size */ 9449 .event_id = { 9450 .header = { 9451 .type = PERF_RECORD_MMAP, 9452 .misc = PERF_RECORD_MISC_USER, 9453 /* .size */ 9454 }, 9455 /* .pid */ 9456 /* .tid */ 9457 .start = vma->vm_start, 9458 .len = vma->vm_end - vma->vm_start, 9459 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 9460 }, 9461 /* .maj (attr_mmap2 only) */ 9462 /* .min (attr_mmap2 only) */ 9463 /* .ino (attr_mmap2 only) */ 9464 /* .ino_generation (attr_mmap2 only) */ 9465 /* .prot (attr_mmap2 only) */ 9466 /* .flags (attr_mmap2 only) */ 9467 }; 9468 9469 perf_addr_filters_adjust(vma); 9470 perf_event_mmap_event(&mmap_event); 9471 } 9472 9473 void 
perf_event_aux_event(struct perf_event *event, unsigned long head, 9474 unsigned long size, u64 flags) 9475 { 9476 struct perf_output_handle handle; 9477 struct perf_sample_data sample; 9478 struct perf_aux_event { 9479 struct perf_event_header header; 9480 u64 offset; 9481 u64 size; 9482 u64 flags; 9483 } rec = { 9484 .header = { 9485 .type = PERF_RECORD_AUX, 9486 .misc = 0, 9487 .size = sizeof(rec), 9488 }, 9489 .offset = head, 9490 .size = size, 9491 .flags = flags, 9492 }; 9493 int ret; 9494 9495 perf_event_header__init_id(&rec.header, &sample, event); 9496 ret = perf_output_begin(&handle, &sample, event, rec.header.size); 9497 9498 if (ret) 9499 return; 9500 9501 perf_output_put(&handle, rec); 9502 perf_event__output_id_sample(event, &handle, &sample); 9503 9504 perf_output_end(&handle); 9505 } 9506 9507 /* 9508 * Lost/dropped samples logging 9509 */ 9510 void perf_log_lost_samples(struct perf_event *event, u64 lost) 9511 { 9512 struct perf_output_handle handle; 9513 struct perf_sample_data sample; 9514 int ret; 9515 9516 struct { 9517 struct perf_event_header header; 9518 u64 lost; 9519 } lost_samples_event = { 9520 .header = { 9521 .type = PERF_RECORD_LOST_SAMPLES, 9522 .misc = 0, 9523 .size = sizeof(lost_samples_event), 9524 }, 9525 .lost = lost, 9526 }; 9527 9528 perf_event_header__init_id(&lost_samples_event.header, &sample, event); 9529 9530 ret = perf_output_begin(&handle, &sample, event, 9531 lost_samples_event.header.size); 9532 if (ret) 9533 return; 9534 9535 perf_output_put(&handle, lost_samples_event); 9536 perf_event__output_id_sample(event, &handle, &sample); 9537 perf_output_end(&handle); 9538 } 9539 9540 /* 9541 * context_switch tracking 9542 */ 9543 9544 struct perf_switch_event { 9545 struct task_struct *task; 9546 struct task_struct *next_prev; 9547 9548 struct { 9549 struct perf_event_header header; 9550 u32 next_prev_pid; 9551 u32 next_prev_tid; 9552 } event_id; 9553 }; 9554 9555 static int perf_event_switch_match(struct perf_event *event) 9556 { 9557 return event->attr.context_switch; 9558 } 9559 9560 static void perf_event_switch_output(struct perf_event *event, void *data) 9561 { 9562 struct perf_switch_event *se = data; 9563 struct perf_output_handle handle; 9564 struct perf_sample_data sample; 9565 int ret; 9566 9567 if (!perf_event_switch_match(event)) 9568 return; 9569 9570 /* Only CPU-wide events are allowed to see next/prev pid/tid */ 9571 if (event->ctx->task) { 9572 se->event_id.header.type = PERF_RECORD_SWITCH; 9573 se->event_id.header.size = sizeof(se->event_id.header); 9574 } else { 9575 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; 9576 se->event_id.header.size = sizeof(se->event_id); 9577 se->event_id.next_prev_pid = 9578 perf_event_pid(event, se->next_prev); 9579 se->event_id.next_prev_tid = 9580 perf_event_tid(event, se->next_prev); 9581 } 9582 9583 perf_event_header__init_id(&se->event_id.header, &sample, event); 9584 9585 ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); 9586 if (ret) 9587 return; 9588 9589 if (event->ctx->task) 9590 perf_output_put(&handle, se->event_id.header); 9591 else 9592 perf_output_put(&handle, se->event_id); 9593 9594 perf_event__output_id_sample(event, &handle, &sample); 9595 9596 perf_output_end(&handle); 9597 } 9598 9599 static void perf_event_switch(struct task_struct *task, 9600 struct task_struct *next_prev, bool sched_in) 9601 { 9602 struct perf_switch_event switch_event; 9603 9604 /* N.B. 
caller checks nr_switch_events != 0 */ 9605 9606 switch_event = (struct perf_switch_event){ 9607 .task = task, 9608 .next_prev = next_prev, 9609 .event_id = { 9610 .header = { 9611 /* .type */ 9612 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, 9613 /* .size */ 9614 }, 9615 /* .next_prev_pid */ 9616 /* .next_prev_tid */ 9617 }, 9618 }; 9619 9620 if (!sched_in && task_is_runnable(task)) { 9621 switch_event.event_id.header.misc |= 9622 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; 9623 } 9624 9625 perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); 9626 } 9627 9628 /* 9629 * IRQ throttle logging 9630 */ 9631 9632 static void perf_log_throttle(struct perf_event *event, int enable) 9633 { 9634 struct perf_output_handle handle; 9635 struct perf_sample_data sample; 9636 int ret; 9637 9638 struct { 9639 struct perf_event_header header; 9640 u64 time; 9641 u64 id; 9642 u64 stream_id; 9643 } throttle_event = { 9644 .header = { 9645 .type = PERF_RECORD_THROTTLE, 9646 .misc = 0, 9647 .size = sizeof(throttle_event), 9648 }, 9649 .time = perf_event_clock(event), 9650 .id = primary_event_id(event), 9651 .stream_id = event->id, 9652 }; 9653 9654 if (enable) 9655 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 9656 9657 perf_event_header__init_id(&throttle_event.header, &sample, event); 9658 9659 ret = perf_output_begin(&handle, &sample, event, 9660 throttle_event.header.size); 9661 if (ret) 9662 return; 9663 9664 perf_output_put(&handle, throttle_event); 9665 perf_event__output_id_sample(event, &handle, &sample); 9666 perf_output_end(&handle); 9667 } 9668 9669 /* 9670 * ksymbol register/unregister tracking 9671 */ 9672 9673 struct perf_ksymbol_event { 9674 const char *name; 9675 int name_len; 9676 struct { 9677 struct perf_event_header header; 9678 u64 addr; 9679 u32 len; 9680 u16 ksym_type; 9681 u16 flags; 9682 } event_id; 9683 }; 9684 9685 static int perf_event_ksymbol_match(struct perf_event *event) 9686 { 9687 return event->attr.ksymbol; 9688 } 9689 9690 static void perf_event_ksymbol_output(struct perf_event *event, void *data) 9691 { 9692 struct perf_ksymbol_event *ksymbol_event = data; 9693 struct perf_output_handle handle; 9694 struct perf_sample_data sample; 9695 int ret; 9696 9697 if (!perf_event_ksymbol_match(event)) 9698 return; 9699 9700 perf_event_header__init_id(&ksymbol_event->event_id.header, 9701 &sample, event); 9702 ret = perf_output_begin(&handle, &sample, event, 9703 ksymbol_event->event_id.header.size); 9704 if (ret) 9705 return; 9706 9707 perf_output_put(&handle, ksymbol_event->event_id); 9708 __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); 9709 perf_event__output_id_sample(event, &handle, &sample); 9710 9711 perf_output_end(&handle); 9712 } 9713 9714 void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, 9715 const char *sym) 9716 { 9717 struct perf_ksymbol_event ksymbol_event; 9718 char name[KSYM_NAME_LEN]; 9719 u16 flags = 0; 9720 int name_len; 9721 9722 if (!atomic_read(&nr_ksymbol_events)) 9723 return; 9724 9725 if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || 9726 ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) 9727 goto err; 9728 9729 strscpy(name, sym); 9730 name_len = strlen(name) + 1; 9731 while (!IS_ALIGNED(name_len, sizeof(u64))) 9732 name[name_len++] = '\0'; 9733 BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); 9734 9735 if (unregister) 9736 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; 9737 9738 ksymbol_event = (struct perf_ksymbol_event){ 9739 .name = name, 9740 .name_len = name_len, 9741 .event_id = { 9742 .header = 
{ 9743 .type = PERF_RECORD_KSYMBOL, 9744 .size = sizeof(ksymbol_event.event_id) + 9745 name_len, 9746 }, 9747 .addr = addr, 9748 .len = len, 9749 .ksym_type = ksym_type, 9750 .flags = flags, 9751 }, 9752 }; 9753 9754 perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); 9755 return; 9756 err: 9757 WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); 9758 } 9759 9760 /* 9761 * bpf program load/unload tracking 9762 */ 9763 9764 struct perf_bpf_event { 9765 struct bpf_prog *prog; 9766 struct { 9767 struct perf_event_header header; 9768 u16 type; 9769 u16 flags; 9770 u32 id; 9771 u8 tag[BPF_TAG_SIZE]; 9772 } event_id; 9773 }; 9774 9775 static int perf_event_bpf_match(struct perf_event *event) 9776 { 9777 return event->attr.bpf_event; 9778 } 9779 9780 static void perf_event_bpf_output(struct perf_event *event, void *data) 9781 { 9782 struct perf_bpf_event *bpf_event = data; 9783 struct perf_output_handle handle; 9784 struct perf_sample_data sample; 9785 int ret; 9786 9787 if (!perf_event_bpf_match(event)) 9788 return; 9789 9790 perf_event_header__init_id(&bpf_event->event_id.header, 9791 &sample, event); 9792 ret = perf_output_begin(&handle, &sample, event, 9793 bpf_event->event_id.header.size); 9794 if (ret) 9795 return; 9796 9797 perf_output_put(&handle, bpf_event->event_id); 9798 perf_event__output_id_sample(event, &handle, &sample); 9799 9800 perf_output_end(&handle); 9801 } 9802 9803 static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, 9804 enum perf_bpf_event_type type) 9805 { 9806 bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; 9807 int i; 9808 9809 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, 9810 (u64)(unsigned long)prog->bpf_func, 9811 prog->jited_len, unregister, 9812 prog->aux->ksym.name); 9813 9814 for (i = 1; i < prog->aux->func_cnt; i++) { 9815 struct bpf_prog *subprog = prog->aux->func[i]; 9816 9817 perf_event_ksymbol( 9818 PERF_RECORD_KSYMBOL_TYPE_BPF, 9819 (u64)(unsigned long)subprog->bpf_func, 9820 subprog->jited_len, unregister, 9821 subprog->aux->ksym.name); 9822 } 9823 } 9824 9825 void perf_event_bpf_event(struct bpf_prog *prog, 9826 enum perf_bpf_event_type type, 9827 u16 flags) 9828 { 9829 struct perf_bpf_event bpf_event; 9830 9831 switch (type) { 9832 case PERF_BPF_EVENT_PROG_LOAD: 9833 case PERF_BPF_EVENT_PROG_UNLOAD: 9834 if (atomic_read(&nr_ksymbol_events)) 9835 perf_event_bpf_emit_ksymbols(prog, type); 9836 break; 9837 default: 9838 return; 9839 } 9840 9841 if (!atomic_read(&nr_bpf_events)) 9842 return; 9843 9844 bpf_event = (struct perf_bpf_event){ 9845 .prog = prog, 9846 .event_id = { 9847 .header = { 9848 .type = PERF_RECORD_BPF_EVENT, 9849 .size = sizeof(bpf_event.event_id), 9850 }, 9851 .type = type, 9852 .flags = flags, 9853 .id = prog->aux->id, 9854 }, 9855 }; 9856 9857 BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); 9858 9859 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); 9860 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); 9861 } 9862 9863 struct perf_text_poke_event { 9864 const void *old_bytes; 9865 const void *new_bytes; 9866 size_t pad; 9867 u16 old_len; 9868 u16 new_len; 9869 9870 struct { 9871 struct perf_event_header header; 9872 9873 u64 addr; 9874 } event_id; 9875 }; 9876 9877 static int perf_event_text_poke_match(struct perf_event *event) 9878 { 9879 return event->attr.text_poke; 9880 } 9881 9882 static void perf_event_text_poke_output(struct perf_event *event, void *data) 9883 { 9884 struct perf_text_poke_event *text_poke_event = data; 9885 struct perf_output_handle handle; 9886 
struct perf_sample_data sample; 9887 u64 padding = 0; 9888 int ret; 9889 9890 if (!perf_event_text_poke_match(event)) 9891 return; 9892 9893 perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); 9894 9895 ret = perf_output_begin(&handle, &sample, event, 9896 text_poke_event->event_id.header.size); 9897 if (ret) 9898 return; 9899 9900 perf_output_put(&handle, text_poke_event->event_id); 9901 perf_output_put(&handle, text_poke_event->old_len); 9902 perf_output_put(&handle, text_poke_event->new_len); 9903 9904 __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); 9905 __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); 9906 9907 if (text_poke_event->pad) 9908 __output_copy(&handle, &padding, text_poke_event->pad); 9909 9910 perf_event__output_id_sample(event, &handle, &sample); 9911 9912 perf_output_end(&handle); 9913 } 9914 9915 void perf_event_text_poke(const void *addr, const void *old_bytes, 9916 size_t old_len, const void *new_bytes, size_t new_len) 9917 { 9918 struct perf_text_poke_event text_poke_event; 9919 size_t tot, pad; 9920 9921 if (!atomic_read(&nr_text_poke_events)) 9922 return; 9923 9924 tot = sizeof(text_poke_event.old_len) + old_len; 9925 tot += sizeof(text_poke_event.new_len) + new_len; 9926 pad = ALIGN(tot, sizeof(u64)) - tot; 9927 9928 text_poke_event = (struct perf_text_poke_event){ 9929 .old_bytes = old_bytes, 9930 .new_bytes = new_bytes, 9931 .pad = pad, 9932 .old_len = old_len, 9933 .new_len = new_len, 9934 .event_id = { 9935 .header = { 9936 .type = PERF_RECORD_TEXT_POKE, 9937 .misc = PERF_RECORD_MISC_KERNEL, 9938 .size = sizeof(text_poke_event.event_id) + tot + pad, 9939 }, 9940 .addr = (unsigned long)addr, 9941 }, 9942 }; 9943 9944 perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); 9945 } 9946 9947 void perf_event_itrace_started(struct perf_event *event) 9948 { 9949 event->attach_state |= PERF_ATTACH_ITRACE; 9950 } 9951 9952 static void perf_log_itrace_start(struct perf_event *event) 9953 { 9954 struct perf_output_handle handle; 9955 struct perf_sample_data sample; 9956 struct perf_aux_event { 9957 struct perf_event_header header; 9958 u32 pid; 9959 u32 tid; 9960 } rec; 9961 int ret; 9962 9963 if (event->parent) 9964 event = event->parent; 9965 9966 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || 9967 event->attach_state & PERF_ATTACH_ITRACE) 9968 return; 9969 9970 rec.header.type = PERF_RECORD_ITRACE_START; 9971 rec.header.misc = 0; 9972 rec.header.size = sizeof(rec); 9973 rec.pid = perf_event_pid(event, current); 9974 rec.tid = perf_event_tid(event, current); 9975 9976 perf_event_header__init_id(&rec.header, &sample, event); 9977 ret = perf_output_begin(&handle, &sample, event, rec.header.size); 9978 9979 if (ret) 9980 return; 9981 9982 perf_output_put(&handle, rec); 9983 perf_event__output_id_sample(event, &handle, &sample); 9984 9985 perf_output_end(&handle); 9986 } 9987 9988 void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) 9989 { 9990 struct perf_output_handle handle; 9991 struct perf_sample_data sample; 9992 struct perf_aux_event { 9993 struct perf_event_header header; 9994 u64 hw_id; 9995 } rec; 9996 int ret; 9997 9998 if (event->parent) 9999 event = event->parent; 10000 10001 rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; 10002 rec.header.misc = 0; 10003 rec.header.size = sizeof(rec); 10004 rec.hw_id = hw_id; 10005 10006 perf_event_header__init_id(&rec.header, &sample, event); 10007 ret = perf_output_begin(&handle, &sample, event, 
rec.header.size); 10008 10009 if (ret) 10010 return; 10011 10012 perf_output_put(&handle, rec); 10013 perf_event__output_id_sample(event, &handle, &sample); 10014 10015 perf_output_end(&handle); 10016 } 10017 EXPORT_SYMBOL_GPL(perf_report_aux_output_id); 10018 10019 static int 10020 __perf_event_account_interrupt(struct perf_event *event, int throttle) 10021 { 10022 struct hw_perf_event *hwc = &event->hw; 10023 int ret = 0; 10024 u64 seq; 10025 10026 seq = __this_cpu_read(perf_throttled_seq); 10027 if (seq != hwc->interrupts_seq) { 10028 hwc->interrupts_seq = seq; 10029 hwc->interrupts = 1; 10030 } else { 10031 hwc->interrupts++; 10032 if (unlikely(throttle && 10033 hwc->interrupts > max_samples_per_tick)) { 10034 __this_cpu_inc(perf_throttled_count); 10035 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 10036 hwc->interrupts = MAX_INTERRUPTS; 10037 perf_log_throttle(event, 0); 10038 ret = 1; 10039 } 10040 } 10041 10042 if (event->attr.freq) { 10043 u64 now = perf_clock(); 10044 s64 delta = now - hwc->freq_time_stamp; 10045 10046 hwc->freq_time_stamp = now; 10047 10048 if (delta > 0 && delta < 2*TICK_NSEC) 10049 perf_adjust_period(event, delta, hwc->last_period, true); 10050 } 10051 10052 return ret; 10053 } 10054 10055 int perf_event_account_interrupt(struct perf_event *event) 10056 { 10057 return __perf_event_account_interrupt(event, 1); 10058 } 10059 10060 static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) 10061 { 10062 /* 10063 * Due to interrupt latency (AKA "skid"), we may enter the 10064 * kernel before taking an overflow, even if the PMU is only 10065 * counting user events. 10066 */ 10067 if (event->attr.exclude_kernel && !user_mode(regs)) 10068 return false; 10069 10070 return true; 10071 } 10072 10073 #ifdef CONFIG_BPF_SYSCALL 10074 static int bpf_overflow_handler(struct perf_event *event, 10075 struct perf_sample_data *data, 10076 struct pt_regs *regs) 10077 { 10078 struct bpf_perf_event_data_kern ctx = { 10079 .data = data, 10080 .event = event, 10081 }; 10082 struct bpf_prog *prog; 10083 int ret = 0; 10084 10085 ctx.regs = perf_arch_bpf_user_pt_regs(regs); 10086 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) 10087 goto out; 10088 rcu_read_lock(); 10089 prog = READ_ONCE(event->prog); 10090 if (prog) { 10091 perf_prepare_sample(data, event, regs); 10092 ret = bpf_prog_run(prog, &ctx); 10093 } 10094 rcu_read_unlock(); 10095 out: 10096 __this_cpu_dec(bpf_prog_active); 10097 10098 return ret; 10099 } 10100 10101 static inline int perf_event_set_bpf_handler(struct perf_event *event, 10102 struct bpf_prog *prog, 10103 u64 bpf_cookie) 10104 { 10105 if (event->overflow_handler_context) 10106 /* hw breakpoint or kernel counter */ 10107 return -EINVAL; 10108 10109 if (event->prog) 10110 return -EEXIST; 10111 10112 if (prog->type != BPF_PROG_TYPE_PERF_EVENT) 10113 return -EINVAL; 10114 10115 if (event->attr.precise_ip && 10116 prog->call_get_stack && 10117 (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || 10118 event->attr.exclude_callchain_kernel || 10119 event->attr.exclude_callchain_user)) { 10120 /* 10121 * On a perf_event with precise_ip, calling bpf_get_stack() 10122 * may trigger unwinder warnings and occasional crashes. 10123 * bpf_get_[stack|stackid] works around this issue by using the 10124 * callchain attached to perf_sample_data. If the 10125 * perf_event does not have a full (kernel and user) callchain 10126 * attached to perf_sample_data, do not allow attaching a BPF 10127 * program that calls bpf_get_[stack|stackid].
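 * IOW, a program that calls bpf_get_[stack|stackid] is only accepted
 * on a precise_ip event whose attr looks roughly like (illustrative):
 *
 *	sample_type |= PERF_SAMPLE_CALLCHAIN;
 *	exclude_callchain_kernel = 0;
 *	exclude_callchain_user   = 0;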
10128 */ 10129 return -EPROTO; 10130 } 10131 10132 event->prog = prog; 10133 event->bpf_cookie = bpf_cookie; 10134 return 0; 10135 } 10136 10137 static inline void perf_event_free_bpf_handler(struct perf_event *event) 10138 { 10139 struct bpf_prog *prog = event->prog; 10140 10141 if (!prog) 10142 return; 10143 10144 event->prog = NULL; 10145 bpf_prog_put(prog); 10146 } 10147 #else 10148 static inline int bpf_overflow_handler(struct perf_event *event, 10149 struct perf_sample_data *data, 10150 struct pt_regs *regs) 10151 { 10152 return 1; 10153 } 10154 10155 static inline int perf_event_set_bpf_handler(struct perf_event *event, 10156 struct bpf_prog *prog, 10157 u64 bpf_cookie) 10158 { 10159 return -EOPNOTSUPP; 10160 } 10161 10162 static inline void perf_event_free_bpf_handler(struct perf_event *event) 10163 { 10164 } 10165 #endif 10166 10167 /* 10168 * Generic event overflow handling, sampling. 10169 */ 10170 10171 static int __perf_event_overflow(struct perf_event *event, 10172 int throttle, struct perf_sample_data *data, 10173 struct pt_regs *regs) 10174 { 10175 int events = atomic_read(&event->event_limit); 10176 int ret = 0; 10177 10178 /* 10179 * Non-sampling counters might still use the PMI to fold short 10180 * hardware counters, ignore those. 10181 */ 10182 if (unlikely(!is_sampling_event(event))) 10183 return 0; 10184 10185 ret = __perf_event_account_interrupt(event, throttle); 10186 10187 if (event->attr.aux_pause) 10188 perf_event_aux_pause(event->aux_event, true); 10189 10190 if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && 10191 !bpf_overflow_handler(event, data, regs)) 10192 goto out; 10193 10194 /* 10195 * XXX event_limit might not quite work as expected on inherited 10196 * events 10197 */ 10198 10199 event->pending_kill = POLL_IN; 10200 if (events && atomic_dec_and_test(&event->event_limit)) { 10201 ret = 1; 10202 event->pending_kill = POLL_HUP; 10203 perf_event_disable_inatomic(event); 10204 } 10205 10206 if (event->attr.sigtrap) { 10207 /* 10208 * The desired behaviour of sigtrap vs. invalid samples is a bit 10209 * tricky: on the one hand, we should not lose the SIGTRAP if 10210 * it is the first event; on the other hand, we should also not 10211 * trigger the WARN or override the data address. 10212 */ 10213 bool valid_sample = sample_is_allowed(event, regs); 10214 unsigned int pending_id = 1; 10215 enum task_work_notify_mode notify_mode; 10216 10217 if (regs) 10218 pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; 10219 10220 notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME; 10221 10222 if (!event->pending_work && 10223 !task_work_add(current, &event->pending_task, notify_mode)) { 10224 event->pending_work = pending_id; 10225 local_inc(&event->ctx->nr_no_switch_fast); 10226 WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); 10227 10228 event->pending_addr = 0; 10229 if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) 10230 event->pending_addr = data->addr; 10231 10232 } else if (event->attr.exclude_kernel && valid_sample) { 10233 /* 10234 * Should not be able to return to user space without 10235 * consuming pending_work; with exceptions: 10236 * 10237 * 1. Where !exclude_kernel, events can overflow again 10238 * in the kernel without returning to user space. 10239 * 10240 * 2. Events that can overflow again before the IRQ- 10241 * work without user space progress (e.g. hrtimer). 10242 * To approximate progress (with false negatives), 10243 * check a 32-bit hash of the current IP.
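 * I.e. the WARN below stays quiet when this overflow hashes to the
 * same pending_id that was recorded when pending_work was set, since
 * then we cannot tell whether user space made any progress.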
10244 */ 10245 WARN_ON_ONCE(event->pending_work != pending_id); 10246 } 10247 } 10248 10249 READ_ONCE(event->overflow_handler)(event, data, regs); 10250 10251 if (*perf_event_fasync(event) && event->pending_kill) { 10252 event->pending_wakeup = 1; 10253 irq_work_queue(&event->pending_irq); 10254 } 10255 out: 10256 if (event->attr.aux_resume) 10257 perf_event_aux_pause(event->aux_event, false); 10258 10259 return ret; 10260 } 10261 10262 int perf_event_overflow(struct perf_event *event, 10263 struct perf_sample_data *data, 10264 struct pt_regs *regs) 10265 { 10266 return __perf_event_overflow(event, 1, data, regs); 10267 } 10268 10269 /* 10270 * Generic software event infrastructure 10271 */ 10272 10273 struct swevent_htable { 10274 struct swevent_hlist *swevent_hlist; 10275 struct mutex hlist_mutex; 10276 int hlist_refcount; 10277 }; 10278 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 10279 10280 /* 10281 * We directly increment event->count and keep a second value in 10282 * event->hw.period_left to count intervals. This period event 10283 * is kept in the range [-sample_period, 0] so that we can use the 10284 * sign as trigger. 10285 */ 10286 10287 u64 perf_swevent_set_period(struct perf_event *event) 10288 { 10289 struct hw_perf_event *hwc = &event->hw; 10290 u64 period = hwc->last_period; 10291 u64 nr, offset; 10292 s64 old, val; 10293 10294 hwc->last_period = hwc->sample_period; 10295 10296 old = local64_read(&hwc->period_left); 10297 do { 10298 val = old; 10299 if (val < 0) 10300 return 0; 10301 10302 nr = div64_u64(period + val, period); 10303 offset = nr * period; 10304 val -= offset; 10305 } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); 10306 10307 return nr; 10308 } 10309 10310 static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 10311 struct perf_sample_data *data, 10312 struct pt_regs *regs) 10313 { 10314 struct hw_perf_event *hwc = &event->hw; 10315 int throttle = 0; 10316 10317 if (!overflow) 10318 overflow = perf_swevent_set_period(event); 10319 10320 if (hwc->interrupts == MAX_INTERRUPTS) 10321 return; 10322 10323 for (; overflow; overflow--) { 10324 if (__perf_event_overflow(event, throttle, 10325 data, regs)) { 10326 /* 10327 * We inhibit the overflow from happening when 10328 * hwc->interrupts == MAX_INTERRUPTS. 
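 * (In the throttled case __perf_event_account_interrupt() has set
 * MAX_INTERRUPTS and logged a PERF_RECORD_THROTTLE; the check at the
 * top of this function then drops further overflows until the tick
 * unthrottles the event and resets hwc->interrupts.)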
10329 */ 10330 break; 10331 } 10332 throttle = 1; 10333 } 10334 } 10335 10336 static void perf_swevent_event(struct perf_event *event, u64 nr, 10337 struct perf_sample_data *data, 10338 struct pt_regs *regs) 10339 { 10340 struct hw_perf_event *hwc = &event->hw; 10341 10342 local64_add(nr, &event->count); 10343 10344 if (!regs) 10345 return; 10346 10347 if (!is_sampling_event(event)) 10348 return; 10349 10350 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { 10351 data->period = nr; 10352 return perf_swevent_overflow(event, 1, data, regs); 10353 } else 10354 data->period = event->hw.last_period; 10355 10356 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 10357 return perf_swevent_overflow(event, 1, data, regs); 10358 10359 if (local64_add_negative(nr, &hwc->period_left)) 10360 return; 10361 10362 perf_swevent_overflow(event, 0, data, regs); 10363 } 10364 10365 int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) 10366 { 10367 if (event->hw.state & PERF_HES_STOPPED) 10368 return 1; 10369 10370 if (regs) { 10371 if (event->attr.exclude_user && user_mode(regs)) 10372 return 1; 10373 10374 if (event->attr.exclude_kernel && !user_mode(regs)) 10375 return 1; 10376 } 10377 10378 return 0; 10379 } 10380 10381 static int perf_swevent_match(struct perf_event *event, 10382 enum perf_type_id type, 10383 u32 event_id, 10384 struct perf_sample_data *data, 10385 struct pt_regs *regs) 10386 { 10387 if (event->attr.type != type) 10388 return 0; 10389 10390 if (event->attr.config != event_id) 10391 return 0; 10392 10393 if (perf_exclude_event(event, regs)) 10394 return 0; 10395 10396 return 1; 10397 } 10398 10399 static inline u64 swevent_hash(u64 type, u32 event_id) 10400 { 10401 u64 val = event_id | (type << 32); 10402 10403 return hash_64(val, SWEVENT_HLIST_BITS); 10404 } 10405 10406 static inline struct hlist_head * 10407 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) 10408 { 10409 u64 hash = swevent_hash(type, event_id); 10410 10411 return &hlist->heads[hash]; 10412 } 10413 10414 /* For the read side: events when they trigger */ 10415 static inline struct hlist_head * 10416 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) 10417 { 10418 struct swevent_hlist *hlist; 10419 10420 hlist = rcu_dereference(swhash->swevent_hlist); 10421 if (!hlist) 10422 return NULL; 10423 10424 return __find_swevent_head(hlist, type, event_id); 10425 } 10426 10427 /* For the event head insertion and removal in the hlist */ 10428 static inline struct hlist_head * 10429 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) 10430 { 10431 struct swevent_hlist *hlist; 10432 u32 event_id = event->attr.config; 10433 u64 type = event->attr.type; 10434 10435 /* 10436 * Event scheduling is always serialized against hlist allocation 10437 * and release. Which makes the protected version suitable here. 10438 * The context lock guarantees that. 
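 *
 * Compare with find_swevent_head_rcu() above: the overflow path
 * (do_perf_sw_event()) only holds rcu_read_lock() and therefore
 * uses rcu_dereference(), whereas this add/del path can lean on
 * ctx->lock and use rcu_dereference_protected() with the lockdep
 * annotation below.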
10439 */ 10440 hlist = rcu_dereference_protected(swhash->swevent_hlist, 10441 lockdep_is_held(&event->ctx->lock)); 10442 if (!hlist) 10443 return NULL; 10444 10445 return __find_swevent_head(hlist, type, event_id); 10446 } 10447 10448 static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 10449 u64 nr, 10450 struct perf_sample_data *data, 10451 struct pt_regs *regs) 10452 { 10453 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 10454 struct perf_event *event; 10455 struct hlist_head *head; 10456 10457 rcu_read_lock(); 10458 head = find_swevent_head_rcu(swhash, type, event_id); 10459 if (!head) 10460 goto end; 10461 10462 hlist_for_each_entry_rcu(event, head, hlist_entry) { 10463 if (perf_swevent_match(event, type, event_id, data, regs)) 10464 perf_swevent_event(event, nr, data, regs); 10465 } 10466 end: 10467 rcu_read_unlock(); 10468 } 10469 10470 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); 10471 10472 int perf_swevent_get_recursion_context(void) 10473 { 10474 return get_recursion_context(current->perf_recursion); 10475 } 10476 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 10477 10478 void perf_swevent_put_recursion_context(int rctx) 10479 { 10480 put_recursion_context(current->perf_recursion, rctx); 10481 } 10482 10483 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 10484 { 10485 struct perf_sample_data data; 10486 10487 if (WARN_ON_ONCE(!regs)) 10488 return; 10489 10490 perf_sample_data_init(&data, addr, 0); 10491 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 10492 } 10493 10494 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 10495 { 10496 int rctx; 10497 10498 preempt_disable_notrace(); 10499 rctx = perf_swevent_get_recursion_context(); 10500 if (unlikely(rctx < 0)) 10501 goto fail; 10502 10503 ___perf_sw_event(event_id, nr, regs, addr); 10504 10505 perf_swevent_put_recursion_context(rctx); 10506 fail: 10507 preempt_enable_notrace(); 10508 } 10509 10510 static void perf_swevent_read(struct perf_event *event) 10511 { 10512 } 10513 10514 static int perf_swevent_add(struct perf_event *event, int flags) 10515 { 10516 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 10517 struct hw_perf_event *hwc = &event->hw; 10518 struct hlist_head *head; 10519 10520 if (is_sampling_event(event)) { 10521 hwc->last_period = hwc->sample_period; 10522 perf_swevent_set_period(event); 10523 } 10524 10525 hwc->state = !(flags & PERF_EF_START); 10526 10527 head = find_swevent_head(swhash, event); 10528 if (WARN_ON_ONCE(!head)) 10529 return -EINVAL; 10530 10531 hlist_add_head_rcu(&event->hlist_entry, head); 10532 perf_event_update_userpage(event); 10533 10534 return 0; 10535 } 10536 10537 static void perf_swevent_del(struct perf_event *event, int flags) 10538 { 10539 hlist_del_rcu(&event->hlist_entry); 10540 } 10541 10542 static void perf_swevent_start(struct perf_event *event, int flags) 10543 { 10544 event->hw.state = 0; 10545 } 10546 10547 static void perf_swevent_stop(struct perf_event *event, int flags) 10548 { 10549 event->hw.state = PERF_HES_STOPPED; 10550 } 10551 10552 /* Deref the hlist from the update side */ 10553 static inline struct swevent_hlist * 10554 swevent_hlist_deref(struct swevent_htable *swhash) 10555 { 10556 return rcu_dereference_protected(swhash->swevent_hlist, 10557 lockdep_is_held(&swhash->hlist_mutex)); 10558 } 10559 10560 static void swevent_hlist_release(struct swevent_htable *swhash) 10561 { 10562 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 
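	/*
	 * Readers (do_perf_sw_event()) may still be walking this hlist
	 * under rcu_read_lock(), so the pointer is cleared first and the
	 * actual free below is deferred with kfree_rcu() rather than a
	 * plain kfree().
	 */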
10563 10564 if (!hlist) 10565 return; 10566 10567 RCU_INIT_POINTER(swhash->swevent_hlist, NULL); 10568 kfree_rcu(hlist, rcu_head); 10569 } 10570 10571 static void swevent_hlist_put_cpu(int cpu) 10572 { 10573 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 10574 10575 mutex_lock(&swhash->hlist_mutex); 10576 10577 if (!--swhash->hlist_refcount) 10578 swevent_hlist_release(swhash); 10579 10580 mutex_unlock(&swhash->hlist_mutex); 10581 } 10582 10583 static void swevent_hlist_put(void) 10584 { 10585 int cpu; 10586 10587 for_each_possible_cpu(cpu) 10588 swevent_hlist_put_cpu(cpu); 10589 } 10590 10591 static int swevent_hlist_get_cpu(int cpu) 10592 { 10593 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 10594 int err = 0; 10595 10596 mutex_lock(&swhash->hlist_mutex); 10597 if (!swevent_hlist_deref(swhash) && 10598 cpumask_test_cpu(cpu, perf_online_mask)) { 10599 struct swevent_hlist *hlist; 10600 10601 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 10602 if (!hlist) { 10603 err = -ENOMEM; 10604 goto exit; 10605 } 10606 rcu_assign_pointer(swhash->swevent_hlist, hlist); 10607 } 10608 swhash->hlist_refcount++; 10609 exit: 10610 mutex_unlock(&swhash->hlist_mutex); 10611 10612 return err; 10613 } 10614 10615 static int swevent_hlist_get(void) 10616 { 10617 int err, cpu, failed_cpu; 10618 10619 mutex_lock(&pmus_lock); 10620 for_each_possible_cpu(cpu) { 10621 err = swevent_hlist_get_cpu(cpu); 10622 if (err) { 10623 failed_cpu = cpu; 10624 goto fail; 10625 } 10626 } 10627 mutex_unlock(&pmus_lock); 10628 return 0; 10629 fail: 10630 for_each_possible_cpu(cpu) { 10631 if (cpu == failed_cpu) 10632 break; 10633 swevent_hlist_put_cpu(cpu); 10634 } 10635 mutex_unlock(&pmus_lock); 10636 return err; 10637 } 10638 10639 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 10640 10641 static void sw_perf_event_destroy(struct perf_event *event) 10642 { 10643 u64 event_id = event->attr.config; 10644 10645 WARN_ON(event->parent); 10646 10647 static_key_slow_dec(&perf_swevent_enabled[event_id]); 10648 swevent_hlist_put(); 10649 } 10650 10651 static struct pmu perf_cpu_clock; /* fwd declaration */ 10652 static struct pmu perf_task_clock; 10653 10654 static int perf_swevent_init(struct perf_event *event) 10655 { 10656 u64 event_id = event->attr.config; 10657 10658 if (event->attr.type != PERF_TYPE_SOFTWARE) 10659 return -ENOENT; 10660 10661 /* 10662 * no branch sampling for software events 10663 */ 10664 if (has_branch_stack(event)) 10665 return -EOPNOTSUPP; 10666 10667 switch (event_id) { 10668 case PERF_COUNT_SW_CPU_CLOCK: 10669 event->attr.type = perf_cpu_clock.type; 10670 return -ENOENT; 10671 case PERF_COUNT_SW_TASK_CLOCK: 10672 event->attr.type = perf_task_clock.type; 10673 return -ENOENT; 10674 10675 default: 10676 break; 10677 } 10678 10679 if (event_id >= PERF_COUNT_SW_MAX) 10680 return -ENOENT; 10681 10682 if (!event->parent) { 10683 int err; 10684 10685 err = swevent_hlist_get(); 10686 if (err) 10687 return err; 10688 10689 static_key_slow_inc(&perf_swevent_enabled[event_id]); 10690 event->destroy = sw_perf_event_destroy; 10691 } 10692 10693 return 0; 10694 } 10695 10696 static struct pmu perf_swevent = { 10697 .task_ctx_nr = perf_sw_context, 10698 10699 .capabilities = PERF_PMU_CAP_NO_NMI, 10700 10701 .event_init = perf_swevent_init, 10702 .add = perf_swevent_add, 10703 .del = perf_swevent_del, 10704 .start = perf_swevent_start, 10705 .stop = perf_swevent_stop, 10706 .read = perf_swevent_read, 10707 }; 10708 10709 #ifdef CONFIG_EVENT_TRACING 10710 10711 static void 
tp_perf_event_destroy(struct perf_event *event) 10712 { 10713 perf_trace_destroy(event); 10714 } 10715 10716 static int perf_tp_event_init(struct perf_event *event) 10717 { 10718 int err; 10719 10720 if (event->attr.type != PERF_TYPE_TRACEPOINT) 10721 return -ENOENT; 10722 10723 /* 10724 * no branch sampling for tracepoint events 10725 */ 10726 if (has_branch_stack(event)) 10727 return -EOPNOTSUPP; 10728 10729 err = perf_trace_init(event); 10730 if (err) 10731 return err; 10732 10733 event->destroy = tp_perf_event_destroy; 10734 10735 return 0; 10736 } 10737 10738 static struct pmu perf_tracepoint = { 10739 .task_ctx_nr = perf_sw_context, 10740 10741 .event_init = perf_tp_event_init, 10742 .add = perf_trace_add, 10743 .del = perf_trace_del, 10744 .start = perf_swevent_start, 10745 .stop = perf_swevent_stop, 10746 .read = perf_swevent_read, 10747 }; 10748 10749 static int perf_tp_filter_match(struct perf_event *event, 10750 struct perf_raw_record *raw) 10751 { 10752 void *record = raw->frag.data; 10753 10754 /* only top level events have filters set */ 10755 if (event->parent) 10756 event = event->parent; 10757 10758 if (likely(!event->filter) || filter_match_preds(event->filter, record)) 10759 return 1; 10760 return 0; 10761 } 10762 10763 static int perf_tp_event_match(struct perf_event *event, 10764 struct perf_raw_record *raw, 10765 struct pt_regs *regs) 10766 { 10767 if (event->hw.state & PERF_HES_STOPPED) 10768 return 0; 10769 /* 10770 * If exclude_kernel, only trace user-space tracepoints (uprobes) 10771 */ 10772 if (event->attr.exclude_kernel && !user_mode(regs)) 10773 return 0; 10774 10775 if (!perf_tp_filter_match(event, raw)) 10776 return 0; 10777 10778 return 1; 10779 } 10780 10781 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, 10782 struct trace_event_call *call, u64 count, 10783 struct pt_regs *regs, struct hlist_head *head, 10784 struct task_struct *task) 10785 { 10786 if (bpf_prog_array_valid(call)) { 10787 *(struct pt_regs **)raw_data = regs; 10788 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { 10789 perf_swevent_put_recursion_context(rctx); 10790 return; 10791 } 10792 } 10793 perf_tp_event(call->event.type, count, raw_data, size, regs, head, 10794 rctx, task); 10795 } 10796 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); 10797 10798 static void __perf_tp_event_target_task(u64 count, void *record, 10799 struct pt_regs *regs, 10800 struct perf_sample_data *data, 10801 struct perf_raw_record *raw, 10802 struct perf_event *event) 10803 { 10804 struct trace_entry *entry = record; 10805 10806 if (event->attr.config != entry->type) 10807 return; 10808 /* Cannot deliver synchronous signal to other task. 
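 *
 * This helper is only used for the task != current delivery path in
 * perf_tp_event() below, and sigtrap delivery relies on queueing
 * task_work on the task that took the overflow (see
 * __perf_event_overflow()), so sigtrap events are skipped here.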
*/ 10809 if (event->attr.sigtrap) 10810 return; 10811 if (perf_tp_event_match(event, raw, regs)) { 10812 perf_sample_data_init(data, 0, 0); 10813 perf_sample_save_raw_data(data, event, raw); 10814 perf_swevent_event(event, count, data, regs); 10815 } 10816 } 10817 10818 static void perf_tp_event_target_task(u64 count, void *record, 10819 struct pt_regs *regs, 10820 struct perf_sample_data *data, 10821 struct perf_raw_record *raw, 10822 struct perf_event_context *ctx) 10823 { 10824 unsigned int cpu = smp_processor_id(); 10825 struct pmu *pmu = &perf_tracepoint; 10826 struct perf_event *event, *sibling; 10827 10828 perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { 10829 __perf_tp_event_target_task(count, record, regs, data, raw, event); 10830 for_each_sibling_event(sibling, event) 10831 __perf_tp_event_target_task(count, record, regs, data, raw, sibling); 10832 } 10833 10834 perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { 10835 __perf_tp_event_target_task(count, record, regs, data, raw, event); 10836 for_each_sibling_event(sibling, event) 10837 __perf_tp_event_target_task(count, record, regs, data, raw, sibling); 10838 } 10839 } 10840 10841 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, 10842 struct pt_regs *regs, struct hlist_head *head, int rctx, 10843 struct task_struct *task) 10844 { 10845 struct perf_sample_data data; 10846 struct perf_event *event; 10847 10848 struct perf_raw_record raw = { 10849 .frag = { 10850 .size = entry_size, 10851 .data = record, 10852 }, 10853 }; 10854 10855 perf_trace_buf_update(record, event_type); 10856 10857 hlist_for_each_entry_rcu(event, head, hlist_entry) { 10858 if (perf_tp_event_match(event, &raw, regs)) { 10859 /* 10860 * Here use the same on-stack perf_sample_data, 10861 * some members in data are event-specific and 10862 * need to be re-computed for different sweveents. 10863 * Re-initialize data->sample_flags safely to avoid 10864 * the problem that next event skips preparing data 10865 * because data->sample_flags is set. 10866 */ 10867 perf_sample_data_init(&data, 0, 0); 10868 perf_sample_save_raw_data(&data, event, &raw); 10869 perf_swevent_event(event, count, &data, regs); 10870 } 10871 } 10872 10873 /* 10874 * If we got specified a target task, also iterate its context and 10875 * deliver this event there too. 10876 */ 10877 if (task && task != current) { 10878 struct perf_event_context *ctx; 10879 10880 rcu_read_lock(); 10881 ctx = rcu_dereference(task->perf_event_ctxp); 10882 if (!ctx) 10883 goto unlock; 10884 10885 raw_spin_lock(&ctx->lock); 10886 perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); 10887 raw_spin_unlock(&ctx->lock); 10888 unlock: 10889 rcu_read_unlock(); 10890 } 10891 10892 perf_swevent_put_recursion_context(rctx); 10893 } 10894 EXPORT_SYMBOL_GPL(perf_tp_event); 10895 10896 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) 10897 /* 10898 * Flags in config, used by dynamic PMU kprobe and uprobe 10899 * The flags should match following PMU_FORMAT_ATTR(). 10900 * 10901 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe 10902 * if not set, create kprobe/uprobe 10903 * 10904 * The following values specify a reference counter (or semaphore in the 10905 * terminology of tools like dtrace, systemtap, etc.) Userspace Statically 10906 * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. 
10907 * 10908 * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset 10909 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left 10910 */ 10911 enum perf_probe_config { 10912 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ 10913 PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, 10914 PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, 10915 }; 10916 10917 PMU_FORMAT_ATTR(retprobe, "config:0"); 10918 #endif 10919 10920 #ifdef CONFIG_KPROBE_EVENTS 10921 static struct attribute *kprobe_attrs[] = { 10922 &format_attr_retprobe.attr, 10923 NULL, 10924 }; 10925 10926 static struct attribute_group kprobe_format_group = { 10927 .name = "format", 10928 .attrs = kprobe_attrs, 10929 }; 10930 10931 static const struct attribute_group *kprobe_attr_groups[] = { 10932 &kprobe_format_group, 10933 NULL, 10934 }; 10935 10936 static int perf_kprobe_event_init(struct perf_event *event); 10937 static struct pmu perf_kprobe = { 10938 .task_ctx_nr = perf_sw_context, 10939 .event_init = perf_kprobe_event_init, 10940 .add = perf_trace_add, 10941 .del = perf_trace_del, 10942 .start = perf_swevent_start, 10943 .stop = perf_swevent_stop, 10944 .read = perf_swevent_read, 10945 .attr_groups = kprobe_attr_groups, 10946 }; 10947 10948 static int perf_kprobe_event_init(struct perf_event *event) 10949 { 10950 int err; 10951 bool is_retprobe; 10952 10953 if (event->attr.type != perf_kprobe.type) 10954 return -ENOENT; 10955 10956 if (!perfmon_capable()) 10957 return -EACCES; 10958 10959 /* 10960 * no branch sampling for probe events 10961 */ 10962 if (has_branch_stack(event)) 10963 return -EOPNOTSUPP; 10964 10965 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; 10966 err = perf_kprobe_init(event, is_retprobe); 10967 if (err) 10968 return err; 10969 10970 event->destroy = perf_kprobe_destroy; 10971 10972 return 0; 10973 } 10974 #endif /* CONFIG_KPROBE_EVENTS */ 10975 10976 #ifdef CONFIG_UPROBE_EVENTS 10977 PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); 10978 10979 static struct attribute *uprobe_attrs[] = { 10980 &format_attr_retprobe.attr, 10981 &format_attr_ref_ctr_offset.attr, 10982 NULL, 10983 }; 10984 10985 static struct attribute_group uprobe_format_group = { 10986 .name = "format", 10987 .attrs = uprobe_attrs, 10988 }; 10989 10990 static const struct attribute_group *uprobe_attr_groups[] = { 10991 &uprobe_format_group, 10992 NULL, 10993 }; 10994 10995 static int perf_uprobe_event_init(struct perf_event *event); 10996 static struct pmu perf_uprobe = { 10997 .task_ctx_nr = perf_sw_context, 10998 .event_init = perf_uprobe_event_init, 10999 .add = perf_trace_add, 11000 .del = perf_trace_del, 11001 .start = perf_swevent_start, 11002 .stop = perf_swevent_stop, 11003 .read = perf_swevent_read, 11004 .attr_groups = uprobe_attr_groups, 11005 }; 11006 11007 static int perf_uprobe_event_init(struct perf_event *event) 11008 { 11009 int err; 11010 unsigned long ref_ctr_offset; 11011 bool is_retprobe; 11012 11013 if (event->attr.type != perf_uprobe.type) 11014 return -ENOENT; 11015 11016 if (!perfmon_capable()) 11017 return -EACCES; 11018 11019 /* 11020 * no branch sampling for probe events 11021 */ 11022 if (has_branch_stack(event)) 11023 return -EOPNOTSUPP; 11024 11025 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; 11026 ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; 11027 err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); 11028 if (err) 11029 return err; 11030 11031 event->destroy = perf_uprobe_destroy; 11032 11033 
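	/*
	 * Example of the attr.config encoding decoded above, with a
	 * made-up offset:
	 *
	 *	config = ((u64)0x1000 << PERF_UPROBE_REF_CTR_OFFSET_SHIFT) |
	 *		 PERF_PROBE_CONFIG_IS_RETPROBE
	 *
	 * requests a uretprobe whose USDT reference counter lives at
	 * offset 0x1000 into the object, i.e. is_retprobe == true and
	 * ref_ctr_offset == 0x1000.
	 */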
return 0; 11034 } 11035 #endif /* CONFIG_UPROBE_EVENTS */ 11036 11037 static inline void perf_tp_register(void) 11038 { 11039 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); 11040 #ifdef CONFIG_KPROBE_EVENTS 11041 perf_pmu_register(&perf_kprobe, "kprobe", -1); 11042 #endif 11043 #ifdef CONFIG_UPROBE_EVENTS 11044 perf_pmu_register(&perf_uprobe, "uprobe", -1); 11045 #endif 11046 } 11047 11048 static void perf_event_free_filter(struct perf_event *event) 11049 { 11050 ftrace_profile_free_filter(event); 11051 } 11052 11053 /* 11054 * returns true if the event is a tracepoint, or a kprobe/upprobe created 11055 * with perf_event_open() 11056 */ 11057 static inline bool perf_event_is_tracing(struct perf_event *event) 11058 { 11059 if (event->pmu == &perf_tracepoint) 11060 return true; 11061 #ifdef CONFIG_KPROBE_EVENTS 11062 if (event->pmu == &perf_kprobe) 11063 return true; 11064 #endif 11065 #ifdef CONFIG_UPROBE_EVENTS 11066 if (event->pmu == &perf_uprobe) 11067 return true; 11068 #endif 11069 return false; 11070 } 11071 11072 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, 11073 u64 bpf_cookie) 11074 { 11075 bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; 11076 11077 if (!perf_event_is_tracing(event)) 11078 return perf_event_set_bpf_handler(event, prog, bpf_cookie); 11079 11080 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; 11081 is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; 11082 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; 11083 is_syscall_tp = is_syscall_trace_event(event->tp_event); 11084 if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) 11085 /* bpf programs can only be attached to u/kprobe or tracepoint */ 11086 return -EINVAL; 11087 11088 if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || 11089 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || 11090 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) 11091 return -EINVAL; 11092 11093 if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe) 11094 /* only uprobe programs are allowed to be sleepable */ 11095 return -EINVAL; 11096 11097 /* Kprobe override only works for kprobes, not uprobes. 
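 *
 * (kprobe_override is set for programs that call
 * bpf_override_return() to tamper with the probed function's
 * return value, which has no meaning for a uprobe, hence the
 * -EINVAL below.)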
*/ 11098 if (prog->kprobe_override && !is_kprobe) 11099 return -EINVAL; 11100 11101 if (is_tracepoint || is_syscall_tp) { 11102 int off = trace_event_get_offsets(event->tp_event); 11103 11104 if (prog->aux->max_ctx_offset > off) 11105 return -EACCES; 11106 } 11107 11108 return perf_event_attach_bpf_prog(event, prog, bpf_cookie); 11109 } 11110 11111 void perf_event_free_bpf_prog(struct perf_event *event) 11112 { 11113 if (!event->prog) 11114 return; 11115 11116 if (!perf_event_is_tracing(event)) { 11117 perf_event_free_bpf_handler(event); 11118 return; 11119 } 11120 perf_event_detach_bpf_prog(event); 11121 } 11122 11123 #else 11124 11125 static inline void perf_tp_register(void) 11126 { 11127 } 11128 11129 static void perf_event_free_filter(struct perf_event *event) 11130 { 11131 } 11132 11133 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, 11134 u64 bpf_cookie) 11135 { 11136 return -ENOENT; 11137 } 11138 11139 void perf_event_free_bpf_prog(struct perf_event *event) 11140 { 11141 } 11142 #endif /* CONFIG_EVENT_TRACING */ 11143 11144 #ifdef CONFIG_HAVE_HW_BREAKPOINT 11145 void perf_bp_event(struct perf_event *bp, void *data) 11146 { 11147 struct perf_sample_data sample; 11148 struct pt_regs *regs = data; 11149 11150 perf_sample_data_init(&sample, bp->attr.bp_addr, 0); 11151 11152 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 11153 perf_swevent_event(bp, 1, &sample, regs); 11154 } 11155 #endif 11156 11157 /* 11158 * Allocate a new address filter 11159 */ 11160 static struct perf_addr_filter * 11161 perf_addr_filter_new(struct perf_event *event, struct list_head *filters) 11162 { 11163 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); 11164 struct perf_addr_filter *filter; 11165 11166 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); 11167 if (!filter) 11168 return NULL; 11169 11170 INIT_LIST_HEAD(&filter->entry); 11171 list_add_tail(&filter->entry, filters); 11172 11173 return filter; 11174 } 11175 11176 static void free_filters_list(struct list_head *filters) 11177 { 11178 struct perf_addr_filter *filter, *iter; 11179 11180 list_for_each_entry_safe(filter, iter, filters, entry) { 11181 path_put(&filter->path); 11182 list_del(&filter->entry); 11183 kfree(filter); 11184 } 11185 } 11186 11187 /* 11188 * Free existing address filters and optionally install new ones 11189 */ 11190 static void perf_addr_filters_splice(struct perf_event *event, 11191 struct list_head *head) 11192 { 11193 unsigned long flags; 11194 LIST_HEAD(list); 11195 11196 if (!has_addr_filter(event)) 11197 return; 11198 11199 /* don't bother with children, they don't have their own filters */ 11200 if (event->parent) 11201 return; 11202 11203 raw_spin_lock_irqsave(&event->addr_filters.lock, flags); 11204 11205 list_splice_init(&event->addr_filters.list, &list); 11206 if (head) 11207 list_splice(head, &event->addr_filters.list); 11208 11209 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); 11210 11211 free_filters_list(&list); 11212 } 11213 11214 static void perf_free_addr_filters(struct perf_event *event) 11215 { 11216 /* 11217 * Used during free paths, there is no concurrency. 11218 */ 11219 if (list_empty(&event->addr_filters.list)) 11220 return; 11221 11222 perf_addr_filters_splice(event, NULL); 11223 } 11224 11225 /* 11226 * Scan through mm's vmas and see if one of them matches the 11227 * @filter; if so, adjust filter's address range. 11228 * Called with mm::mmap_lock down for reading. 
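 *
 * Only file-backed VMAs are considered, and the walk stops at the
 * first VMA that perf_addr_filter_vma_adjust() reports as matching
 * the filter.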
11229 */ 11230 static void perf_addr_filter_apply(struct perf_addr_filter *filter, 11231 struct mm_struct *mm, 11232 struct perf_addr_filter_range *fr) 11233 { 11234 struct vm_area_struct *vma; 11235 VMA_ITERATOR(vmi, mm, 0); 11236 11237 for_each_vma(vmi, vma) { 11238 if (!vma->vm_file) 11239 continue; 11240 11241 if (perf_addr_filter_vma_adjust(filter, vma, fr)) 11242 return; 11243 } 11244 } 11245 11246 /* 11247 * Update event's address range filters based on the 11248 * task's existing mappings, if any. 11249 */ 11250 static void perf_event_addr_filters_apply(struct perf_event *event) 11251 { 11252 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 11253 struct task_struct *task = READ_ONCE(event->ctx->task); 11254 struct perf_addr_filter *filter; 11255 struct mm_struct *mm = NULL; 11256 unsigned int count = 0; 11257 unsigned long flags; 11258 11259 /* 11260 * We may observe TASK_TOMBSTONE, which means that the event tear-down 11261 * will stop on the parent's child_mutex that our caller is also holding 11262 */ 11263 if (task == TASK_TOMBSTONE) 11264 return; 11265 11266 if (ifh->nr_file_filters) { 11267 mm = get_task_mm(task); 11268 if (!mm) 11269 goto restart; 11270 11271 mmap_read_lock(mm); 11272 } 11273 11274 raw_spin_lock_irqsave(&ifh->lock, flags); 11275 list_for_each_entry(filter, &ifh->list, entry) { 11276 if (filter->path.dentry) { 11277 /* 11278 * Adjust base offset if the filter is associated to a 11279 * binary that needs to be mapped: 11280 */ 11281 event->addr_filter_ranges[count].start = 0; 11282 event->addr_filter_ranges[count].size = 0; 11283 11284 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); 11285 } else { 11286 event->addr_filter_ranges[count].start = filter->offset; 11287 event->addr_filter_ranges[count].size = filter->size; 11288 } 11289 11290 count++; 11291 } 11292 11293 event->addr_filters_gen++; 11294 raw_spin_unlock_irqrestore(&ifh->lock, flags); 11295 11296 if (ifh->nr_file_filters) { 11297 mmap_read_unlock(mm); 11298 11299 mmput(mm); 11300 } 11301 11302 restart: 11303 perf_event_stop(event, 1); 11304 } 11305 11306 /* 11307 * Address range filtering: limiting the data to certain 11308 * instruction address ranges. Filters are ioctl()ed to us from 11309 * userspace as ascii strings. 11310 * 11311 * Filter string format: 11312 * 11313 * ACTION RANGE_SPEC 11314 * where ACTION is one of the 11315 * * "filter": limit the trace to this region 11316 * * "start": start tracing from this address 11317 * * "stop": stop tracing at this address/region; 11318 * RANGE_SPEC is 11319 * * for kernel addresses: <start address>[/<size>] 11320 * * for object files: <start address>[/<size>]@</path/to/object/file> 11321 * 11322 * if <size> is not specified or is zero, the range is treated as a single 11323 * address; not valid for ACTION=="filter". 
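 *
 * A few made-up examples of accepted filter strings:
 *
 *   filter 0x1000/0x400@/usr/lib/libfoo.so
 *	limit tracing to the 0x400 bytes at offset 0x1000 into
 *	libfoo.so (file-based filters only work for per-task
 *	events, see perf_event_parse_addr_filter() below)
 *   start 0xffffffff81000000/0x2000
 *	start tracing when this kernel address range is hit
 *   stop 0xffffffff81002000
 *	stop tracing at this single kernel address
 *
 * Several filters may be passed in one string, separated by
 * spaces, commas or newlines.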
11324 */ 11325 enum { 11326 IF_ACT_NONE = -1, 11327 IF_ACT_FILTER, 11328 IF_ACT_START, 11329 IF_ACT_STOP, 11330 IF_SRC_FILE, 11331 IF_SRC_KERNEL, 11332 IF_SRC_FILEADDR, 11333 IF_SRC_KERNELADDR, 11334 }; 11335 11336 enum { 11337 IF_STATE_ACTION = 0, 11338 IF_STATE_SOURCE, 11339 IF_STATE_END, 11340 }; 11341 11342 static const match_table_t if_tokens = { 11343 { IF_ACT_FILTER, "filter" }, 11344 { IF_ACT_START, "start" }, 11345 { IF_ACT_STOP, "stop" }, 11346 { IF_SRC_FILE, "%u/%u@%s" }, 11347 { IF_SRC_KERNEL, "%u/%u" }, 11348 { IF_SRC_FILEADDR, "%u@%s" }, 11349 { IF_SRC_KERNELADDR, "%u" }, 11350 { IF_ACT_NONE, NULL }, 11351 }; 11352 11353 /* 11354 * Address filter string parser 11355 */ 11356 static int 11357 perf_event_parse_addr_filter(struct perf_event *event, char *fstr, 11358 struct list_head *filters) 11359 { 11360 struct perf_addr_filter *filter = NULL; 11361 char *start, *orig, *filename = NULL; 11362 substring_t args[MAX_OPT_ARGS]; 11363 int state = IF_STATE_ACTION, token; 11364 unsigned int kernel = 0; 11365 int ret = -EINVAL; 11366 11367 orig = fstr = kstrdup(fstr, GFP_KERNEL); 11368 if (!fstr) 11369 return -ENOMEM; 11370 11371 while ((start = strsep(&fstr, " ,\n")) != NULL) { 11372 static const enum perf_addr_filter_action_t actions[] = { 11373 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, 11374 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, 11375 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, 11376 }; 11377 ret = -EINVAL; 11378 11379 if (!*start) 11380 continue; 11381 11382 /* filter definition begins */ 11383 if (state == IF_STATE_ACTION) { 11384 filter = perf_addr_filter_new(event, filters); 11385 if (!filter) 11386 goto fail; 11387 } 11388 11389 token = match_token(start, if_tokens, args); 11390 switch (token) { 11391 case IF_ACT_FILTER: 11392 case IF_ACT_START: 11393 case IF_ACT_STOP: 11394 if (state != IF_STATE_ACTION) 11395 goto fail; 11396 11397 filter->action = actions[token]; 11398 state = IF_STATE_SOURCE; 11399 break; 11400 11401 case IF_SRC_KERNELADDR: 11402 case IF_SRC_KERNEL: 11403 kernel = 1; 11404 fallthrough; 11405 11406 case IF_SRC_FILEADDR: 11407 case IF_SRC_FILE: 11408 if (state != IF_STATE_SOURCE) 11409 goto fail; 11410 11411 *args[0].to = 0; 11412 ret = kstrtoul(args[0].from, 0, &filter->offset); 11413 if (ret) 11414 goto fail; 11415 11416 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { 11417 *args[1].to = 0; 11418 ret = kstrtoul(args[1].from, 0, &filter->size); 11419 if (ret) 11420 goto fail; 11421 } 11422 11423 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { 11424 int fpos = token == IF_SRC_FILE ? 2 : 1; 11425 11426 kfree(filename); 11427 filename = match_strdup(&args[fpos]); 11428 if (!filename) { 11429 ret = -ENOMEM; 11430 goto fail; 11431 } 11432 } 11433 11434 state = IF_STATE_END; 11435 break; 11436 11437 default: 11438 goto fail; 11439 } 11440 11441 /* 11442 * Filter definition is fully parsed, validate and install it. 11443 * Make sure that it doesn't contradict itself or the event's 11444 * attribute. 11445 */ 11446 if (state == IF_STATE_END) { 11447 ret = -EINVAL; 11448 11449 /* 11450 * ACTION "filter" must have a non-zero length region 11451 * specified. 
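 *
 * e.g. "filter 0x1000@/usr/lib/libfoo.so" (no size, so filter->size
 * is still zero) is rejected here, while "start 0x1000@/usr/lib/libfoo.so"
 * is accepted because start/stop may name a single address.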
11452 */ 11453 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && 11454 !filter->size) 11455 goto fail; 11456 11457 if (!kernel) { 11458 if (!filename) 11459 goto fail; 11460 11461 /* 11462 * For now, we only support file-based filters 11463 * in per-task events; doing so for CPU-wide 11464 * events requires additional context switching 11465 * trickery, since same object code will be 11466 * mapped at different virtual addresses in 11467 * different processes. 11468 */ 11469 ret = -EOPNOTSUPP; 11470 if (!event->ctx->task) 11471 goto fail; 11472 11473 /* look up the path and grab its inode */ 11474 ret = kern_path(filename, LOOKUP_FOLLOW, 11475 &filter->path); 11476 if (ret) 11477 goto fail; 11478 11479 ret = -EINVAL; 11480 if (!filter->path.dentry || 11481 !S_ISREG(d_inode(filter->path.dentry) 11482 ->i_mode)) 11483 goto fail; 11484 11485 event->addr_filters.nr_file_filters++; 11486 } 11487 11488 /* ready to consume more filters */ 11489 kfree(filename); 11490 filename = NULL; 11491 state = IF_STATE_ACTION; 11492 filter = NULL; 11493 kernel = 0; 11494 } 11495 } 11496 11497 if (state != IF_STATE_ACTION) 11498 goto fail; 11499 11500 kfree(filename); 11501 kfree(orig); 11502 11503 return 0; 11504 11505 fail: 11506 kfree(filename); 11507 free_filters_list(filters); 11508 kfree(orig); 11509 11510 return ret; 11511 } 11512 11513 static int 11514 perf_event_set_addr_filter(struct perf_event *event, char *filter_str) 11515 { 11516 LIST_HEAD(filters); 11517 int ret; 11518 11519 /* 11520 * Since this is called in perf_ioctl() path, we're already holding 11521 * ctx::mutex. 11522 */ 11523 lockdep_assert_held(&event->ctx->mutex); 11524 11525 if (WARN_ON_ONCE(event->parent)) 11526 return -EINVAL; 11527 11528 ret = perf_event_parse_addr_filter(event, filter_str, &filters); 11529 if (ret) 11530 goto fail_clear_files; 11531 11532 ret = event->pmu->addr_filters_validate(&filters); 11533 if (ret) 11534 goto fail_free_filters; 11535 11536 /* remove existing filters, if any */ 11537 perf_addr_filters_splice(event, &filters); 11538 11539 /* install new filters */ 11540 perf_event_for_each_child(event, perf_event_addr_filters_apply); 11541 11542 return ret; 11543 11544 fail_free_filters: 11545 free_filters_list(&filters); 11546 11547 fail_clear_files: 11548 event->addr_filters.nr_file_filters = 0; 11549 11550 return ret; 11551 } 11552 11553 static int perf_event_set_filter(struct perf_event *event, void __user *arg) 11554 { 11555 int ret = -EINVAL; 11556 char *filter_str; 11557 11558 filter_str = strndup_user(arg, PAGE_SIZE); 11559 if (IS_ERR(filter_str)) 11560 return PTR_ERR(filter_str); 11561 11562 #ifdef CONFIG_EVENT_TRACING 11563 if (perf_event_is_tracing(event)) { 11564 struct perf_event_context *ctx = event->ctx; 11565 11566 /* 11567 * Beware, here be dragons!! 11568 * 11569 * the tracepoint muck will deadlock against ctx->mutex, but 11570 * the tracepoint stuff does not actually need it. So 11571 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we 11572 * already have a reference on ctx. 11573 * 11574 * This can result in event getting moved to a different ctx, 11575 * but that does not affect the tracepoint state. 
11576 */ 11577 mutex_unlock(&ctx->mutex); 11578 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); 11579 mutex_lock(&ctx->mutex); 11580 } else 11581 #endif 11582 if (has_addr_filter(event)) 11583 ret = perf_event_set_addr_filter(event, filter_str); 11584 11585 kfree(filter_str); 11586 return ret; 11587 } 11588 11589 /* 11590 * hrtimer based swevent callback 11591 */ 11592 11593 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) 11594 { 11595 enum hrtimer_restart ret = HRTIMER_RESTART; 11596 struct perf_sample_data data; 11597 struct pt_regs *regs; 11598 struct perf_event *event; 11599 u64 period; 11600 11601 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 11602 11603 if (event->state != PERF_EVENT_STATE_ACTIVE) 11604 return HRTIMER_NORESTART; 11605 11606 event->pmu->read(event); 11607 11608 perf_sample_data_init(&data, 0, event->hw.last_period); 11609 regs = get_irq_regs(); 11610 11611 if (regs && !perf_exclude_event(event, regs)) { 11612 if (!(event->attr.exclude_idle && is_idle_task(current))) 11613 if (__perf_event_overflow(event, 1, &data, regs)) 11614 ret = HRTIMER_NORESTART; 11615 } 11616 11617 period = max_t(u64, 10000, event->hw.sample_period); 11618 hrtimer_forward_now(hrtimer, ns_to_ktime(period)); 11619 11620 return ret; 11621 } 11622 11623 static void perf_swevent_start_hrtimer(struct perf_event *event) 11624 { 11625 struct hw_perf_event *hwc = &event->hw; 11626 s64 period; 11627 11628 if (!is_sampling_event(event)) 11629 return; 11630 11631 period = local64_read(&hwc->period_left); 11632 if (period) { 11633 if (period < 0) 11634 period = 10000; 11635 11636 local64_set(&hwc->period_left, 0); 11637 } else { 11638 period = max_t(u64, 10000, hwc->sample_period); 11639 } 11640 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), 11641 HRTIMER_MODE_REL_PINNED_HARD); 11642 } 11643 11644 static void perf_swevent_cancel_hrtimer(struct perf_event *event) 11645 { 11646 struct hw_perf_event *hwc = &event->hw; 11647 11648 if (is_sampling_event(event)) { 11649 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 11650 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 11651 11652 hrtimer_cancel(&hwc->hrtimer); 11653 } 11654 } 11655 11656 static void perf_swevent_init_hrtimer(struct perf_event *event) 11657 { 11658 struct hw_perf_event *hwc = &event->hw; 11659 11660 if (!is_sampling_event(event)) 11661 return; 11662 11663 hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); 11664 11665 /* 11666 * Since hrtimers have a fixed rate, we can do a static freq->period 11667 * mapping and avoid the whole period adjust feedback stuff. 
11668 */ 11669 if (event->attr.freq) { 11670 long freq = event->attr.sample_freq; 11671 11672 event->attr.sample_period = NSEC_PER_SEC / freq; 11673 hwc->sample_period = event->attr.sample_period; 11674 local64_set(&hwc->period_left, hwc->sample_period); 11675 hwc->last_period = hwc->sample_period; 11676 event->attr.freq = 0; 11677 } 11678 } 11679 11680 /* 11681 * Software event: cpu wall time clock 11682 */ 11683 11684 static void cpu_clock_event_update(struct perf_event *event) 11685 { 11686 s64 prev; 11687 u64 now; 11688 11689 now = local_clock(); 11690 prev = local64_xchg(&event->hw.prev_count, now); 11691 local64_add(now - prev, &event->count); 11692 } 11693 11694 static void cpu_clock_event_start(struct perf_event *event, int flags) 11695 { 11696 local64_set(&event->hw.prev_count, local_clock()); 11697 perf_swevent_start_hrtimer(event); 11698 } 11699 11700 static void cpu_clock_event_stop(struct perf_event *event, int flags) 11701 { 11702 perf_swevent_cancel_hrtimer(event); 11703 cpu_clock_event_update(event); 11704 } 11705 11706 static int cpu_clock_event_add(struct perf_event *event, int flags) 11707 { 11708 if (flags & PERF_EF_START) 11709 cpu_clock_event_start(event, flags); 11710 perf_event_update_userpage(event); 11711 11712 return 0; 11713 } 11714 11715 static void cpu_clock_event_del(struct perf_event *event, int flags) 11716 { 11717 cpu_clock_event_stop(event, flags); 11718 } 11719 11720 static void cpu_clock_event_read(struct perf_event *event) 11721 { 11722 cpu_clock_event_update(event); 11723 } 11724 11725 static int cpu_clock_event_init(struct perf_event *event) 11726 { 11727 if (event->attr.type != perf_cpu_clock.type) 11728 return -ENOENT; 11729 11730 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 11731 return -ENOENT; 11732 11733 /* 11734 * no branch sampling for software events 11735 */ 11736 if (has_branch_stack(event)) 11737 return -EOPNOTSUPP; 11738 11739 perf_swevent_init_hrtimer(event); 11740 11741 return 0; 11742 } 11743 11744 static struct pmu perf_cpu_clock = { 11745 .task_ctx_nr = perf_sw_context, 11746 11747 .capabilities = PERF_PMU_CAP_NO_NMI, 11748 .dev = PMU_NULL_DEV, 11749 11750 .event_init = cpu_clock_event_init, 11751 .add = cpu_clock_event_add, 11752 .del = cpu_clock_event_del, 11753 .start = cpu_clock_event_start, 11754 .stop = cpu_clock_event_stop, 11755 .read = cpu_clock_event_read, 11756 }; 11757 11758 /* 11759 * Software event: task time clock 11760 */ 11761 11762 static void task_clock_event_update(struct perf_event *event, u64 now) 11763 { 11764 u64 prev; 11765 s64 delta; 11766 11767 prev = local64_xchg(&event->hw.prev_count, now); 11768 delta = now - prev; 11769 local64_add(delta, &event->count); 11770 } 11771 11772 static void task_clock_event_start(struct perf_event *event, int flags) 11773 { 11774 local64_set(&event->hw.prev_count, event->ctx->time); 11775 perf_swevent_start_hrtimer(event); 11776 } 11777 11778 static void task_clock_event_stop(struct perf_event *event, int flags) 11779 { 11780 perf_swevent_cancel_hrtimer(event); 11781 task_clock_event_update(event, event->ctx->time); 11782 } 11783 11784 static int task_clock_event_add(struct perf_event *event, int flags) 11785 { 11786 if (flags & PERF_EF_START) 11787 task_clock_event_start(event, flags); 11788 perf_event_update_userpage(event); 11789 11790 return 0; 11791 } 11792 11793 static void task_clock_event_del(struct perf_event *event, int flags) 11794 { 11795 task_clock_event_stop(event, PERF_EF_UPDATE); 11796 } 11797 11798 static void task_clock_event_read(struct perf_event 
*event) 11799 { 11800 u64 now = perf_clock(); 11801 u64 delta = now - event->ctx->timestamp; 11802 u64 time = event->ctx->time + delta; 11803 11804 task_clock_event_update(event, time); 11805 } 11806 11807 static int task_clock_event_init(struct perf_event *event) 11808 { 11809 if (event->attr.type != perf_task_clock.type) 11810 return -ENOENT; 11811 11812 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 11813 return -ENOENT; 11814 11815 /* 11816 * no branch sampling for software events 11817 */ 11818 if (has_branch_stack(event)) 11819 return -EOPNOTSUPP; 11820 11821 perf_swevent_init_hrtimer(event); 11822 11823 return 0; 11824 } 11825 11826 static struct pmu perf_task_clock = { 11827 .task_ctx_nr = perf_sw_context, 11828 11829 .capabilities = PERF_PMU_CAP_NO_NMI, 11830 .dev = PMU_NULL_DEV, 11831 11832 .event_init = task_clock_event_init, 11833 .add = task_clock_event_add, 11834 .del = task_clock_event_del, 11835 .start = task_clock_event_start, 11836 .stop = task_clock_event_stop, 11837 .read = task_clock_event_read, 11838 }; 11839 11840 static void perf_pmu_nop_void(struct pmu *pmu) 11841 { 11842 } 11843 11844 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) 11845 { 11846 } 11847 11848 static int perf_pmu_nop_int(struct pmu *pmu) 11849 { 11850 return 0; 11851 } 11852 11853 static int perf_event_nop_int(struct perf_event *event, u64 value) 11854 { 11855 return 0; 11856 } 11857 11858 static DEFINE_PER_CPU(unsigned int, nop_txn_flags); 11859 11860 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) 11861 { 11862 __this_cpu_write(nop_txn_flags, flags); 11863 11864 if (flags & ~PERF_PMU_TXN_ADD) 11865 return; 11866 11867 perf_pmu_disable(pmu); 11868 } 11869 11870 static int perf_pmu_commit_txn(struct pmu *pmu) 11871 { 11872 unsigned int flags = __this_cpu_read(nop_txn_flags); 11873 11874 __this_cpu_write(nop_txn_flags, 0); 11875 11876 if (flags & ~PERF_PMU_TXN_ADD) 11877 return 0; 11878 11879 perf_pmu_enable(pmu); 11880 return 0; 11881 } 11882 11883 static void perf_pmu_cancel_txn(struct pmu *pmu) 11884 { 11885 unsigned int flags = __this_cpu_read(nop_txn_flags); 11886 11887 __this_cpu_write(nop_txn_flags, 0); 11888 11889 if (flags & ~PERF_PMU_TXN_ADD) 11890 return; 11891 11892 perf_pmu_enable(pmu); 11893 } 11894 11895 static int perf_event_idx_default(struct perf_event *event) 11896 { 11897 return 0; 11898 } 11899 11900 /* 11901 * Let userspace know that this PMU supports address range filtering: 11902 */ 11903 static ssize_t nr_addr_filters_show(struct device *dev, 11904 struct device_attribute *attr, 11905 char *page) 11906 { 11907 struct pmu *pmu = dev_get_drvdata(dev); 11908 11909 return sysfs_emit(page, "%d\n", pmu->nr_addr_filters); 11910 } 11911 DEVICE_ATTR_RO(nr_addr_filters); 11912 11913 static struct idr pmu_idr; 11914 11915 static ssize_t 11916 type_show(struct device *dev, struct device_attribute *attr, char *page) 11917 { 11918 struct pmu *pmu = dev_get_drvdata(dev); 11919 11920 return sysfs_emit(page, "%d\n", pmu->type); 11921 } 11922 static DEVICE_ATTR_RO(type); 11923 11924 static ssize_t 11925 perf_event_mux_interval_ms_show(struct device *dev, 11926 struct device_attribute *attr, 11927 char *page) 11928 { 11929 struct pmu *pmu = dev_get_drvdata(dev); 11930 11931 return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms); 11932 } 11933 11934 static DEFINE_MUTEX(mux_interval_mutex); 11935 11936 static ssize_t 11937 perf_event_mux_interval_ms_store(struct device *dev, 11938 struct device_attribute *attr, 11939 const char *buf, size_t count) 
11940 { 11941 struct pmu *pmu = dev_get_drvdata(dev); 11942 int timer, cpu, ret; 11943 11944 ret = kstrtoint(buf, 0, &timer); 11945 if (ret) 11946 return ret; 11947 11948 if (timer < 1) 11949 return -EINVAL; 11950 11951 /* same value, noting to do */ 11952 if (timer == pmu->hrtimer_interval_ms) 11953 return count; 11954 11955 mutex_lock(&mux_interval_mutex); 11956 pmu->hrtimer_interval_ms = timer; 11957 11958 /* update all cpuctx for this PMU */ 11959 cpus_read_lock(); 11960 for_each_online_cpu(cpu) { 11961 struct perf_cpu_pmu_context *cpc; 11962 cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); 11963 cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); 11964 11965 cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); 11966 } 11967 cpus_read_unlock(); 11968 mutex_unlock(&mux_interval_mutex); 11969 11970 return count; 11971 } 11972 static DEVICE_ATTR_RW(perf_event_mux_interval_ms); 11973 11974 static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) 11975 { 11976 switch (scope) { 11977 case PERF_PMU_SCOPE_CORE: 11978 return topology_sibling_cpumask(cpu); 11979 case PERF_PMU_SCOPE_DIE: 11980 return topology_die_cpumask(cpu); 11981 case PERF_PMU_SCOPE_CLUSTER: 11982 return topology_cluster_cpumask(cpu); 11983 case PERF_PMU_SCOPE_PKG: 11984 return topology_core_cpumask(cpu); 11985 case PERF_PMU_SCOPE_SYS_WIDE: 11986 return cpu_online_mask; 11987 } 11988 11989 return NULL; 11990 } 11991 11992 static inline struct cpumask *perf_scope_cpumask(unsigned int scope) 11993 { 11994 switch (scope) { 11995 case PERF_PMU_SCOPE_CORE: 11996 return perf_online_core_mask; 11997 case PERF_PMU_SCOPE_DIE: 11998 return perf_online_die_mask; 11999 case PERF_PMU_SCOPE_CLUSTER: 12000 return perf_online_cluster_mask; 12001 case PERF_PMU_SCOPE_PKG: 12002 return perf_online_pkg_mask; 12003 case PERF_PMU_SCOPE_SYS_WIDE: 12004 return perf_online_sys_mask; 12005 } 12006 12007 return NULL; 12008 } 12009 12010 static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, 12011 char *buf) 12012 { 12013 struct pmu *pmu = dev_get_drvdata(dev); 12014 struct cpumask *mask = perf_scope_cpumask(pmu->scope); 12015 12016 if (mask) 12017 return cpumap_print_to_pagebuf(true, buf, mask); 12018 return 0; 12019 } 12020 12021 static DEVICE_ATTR_RO(cpumask); 12022 12023 static struct attribute *pmu_dev_attrs[] = { 12024 &dev_attr_type.attr, 12025 &dev_attr_perf_event_mux_interval_ms.attr, 12026 &dev_attr_nr_addr_filters.attr, 12027 &dev_attr_cpumask.attr, 12028 NULL, 12029 }; 12030 12031 static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) 12032 { 12033 struct device *dev = kobj_to_dev(kobj); 12034 struct pmu *pmu = dev_get_drvdata(dev); 12035 12036 if (n == 2 && !pmu->nr_addr_filters) 12037 return 0; 12038 12039 /* cpumask */ 12040 if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) 12041 return 0; 12042 12043 return a->mode; 12044 } 12045 12046 static struct attribute_group pmu_dev_attr_group = { 12047 .is_visible = pmu_dev_is_visible, 12048 .attrs = pmu_dev_attrs, 12049 }; 12050 12051 static const struct attribute_group *pmu_dev_groups[] = { 12052 &pmu_dev_attr_group, 12053 NULL, 12054 }; 12055 12056 static int pmu_bus_running; 12057 static struct bus_type pmu_bus = { 12058 .name = "event_source", 12059 .dev_groups = pmu_dev_groups, 12060 }; 12061 12062 static void pmu_dev_release(struct device *dev) 12063 { 12064 kfree(dev); 12065 } 12066 12067 static int pmu_dev_alloc(struct pmu *pmu) 12068 { 12069 int ret = -ENOMEM; 12070 12071 pmu->dev = 
kzalloc(sizeof(struct device), GFP_KERNEL); 12072 if (!pmu->dev) 12073 goto out; 12074 12075 pmu->dev->groups = pmu->attr_groups; 12076 device_initialize(pmu->dev); 12077 12078 dev_set_drvdata(pmu->dev, pmu); 12079 pmu->dev->bus = &pmu_bus; 12080 pmu->dev->parent = pmu->parent; 12081 pmu->dev->release = pmu_dev_release; 12082 12083 ret = dev_set_name(pmu->dev, "%s", pmu->name); 12084 if (ret) 12085 goto free_dev; 12086 12087 ret = device_add(pmu->dev); 12088 if (ret) 12089 goto free_dev; 12090 12091 if (pmu->attr_update) { 12092 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); 12093 if (ret) 12094 goto del_dev; 12095 } 12096 12097 out: 12098 return ret; 12099 12100 del_dev: 12101 device_del(pmu->dev); 12102 12103 free_dev: 12104 put_device(pmu->dev); 12105 pmu->dev = NULL; 12106 goto out; 12107 } 12108 12109 static struct lock_class_key cpuctx_mutex; 12110 static struct lock_class_key cpuctx_lock; 12111 12112 static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) 12113 { 12114 void *tmp, *val = idr_find(idr, id); 12115 12116 if (val != old) 12117 return false; 12118 12119 tmp = idr_replace(idr, new, id); 12120 if (IS_ERR(tmp)) 12121 return false; 12122 12123 WARN_ON_ONCE(tmp != val); 12124 return true; 12125 } 12126 12127 static void perf_pmu_free(struct pmu *pmu) 12128 { 12129 if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { 12130 if (pmu->nr_addr_filters) 12131 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); 12132 device_del(pmu->dev); 12133 put_device(pmu->dev); 12134 } 12135 12136 if (pmu->cpu_pmu_context) { 12137 int cpu; 12138 12139 for_each_possible_cpu(cpu) { 12140 struct perf_cpu_pmu_context *cpc; 12141 12142 cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); 12143 if (!cpc) 12144 continue; 12145 if (cpc->epc.embedded) { 12146 /* refcount managed */ 12147 put_pmu_ctx(&cpc->epc); 12148 continue; 12149 } 12150 kfree(cpc); 12151 } 12152 free_percpu(pmu->cpu_pmu_context); 12153 } 12154 } 12155 12156 DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) 12157 12158 int perf_pmu_register(struct pmu *_pmu, const char *name, int type) 12159 { 12160 int cpu, max = PERF_TYPE_MAX; 12161 12162 struct pmu *pmu __free(pmu_unregister) = _pmu; 12163 guard(mutex)(&pmus_lock); 12164 12165 if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) 12166 return -EINVAL; 12167 12168 if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, 12169 "Can not register a pmu with an invalid scope.\n")) 12170 return -EINVAL; 12171 12172 pmu->name = name; 12173 12174 if (type >= 0) 12175 max = type; 12176 12177 CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); 12178 if (pmu_type.id < 0) 12179 return pmu_type.id; 12180 12181 WARN_ON(type >= 0 && pmu_type.id != type); 12182 12183 pmu->type = pmu_type.id; 12184 atomic_set(&pmu->exclusive_cnt, 0); 12185 12186 if (pmu_bus_running && !pmu->dev) { 12187 int ret = pmu_dev_alloc(pmu); 12188 if (ret) 12189 return ret; 12190 } 12191 12192 pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); 12193 if (!pmu->cpu_pmu_context) 12194 return -ENOMEM; 12195 12196 for_each_possible_cpu(cpu) { 12197 struct perf_cpu_pmu_context *cpc = 12198 kmalloc_node(sizeof(struct perf_cpu_pmu_context), 12199 GFP_KERNEL | __GFP_ZERO, 12200 cpu_to_node(cpu)); 12201 12202 if (!cpc) 12203 return -ENOMEM; 12204 12205 *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; 12206 __perf_init_event_pmu_context(&cpc->epc, pmu); 12207 __perf_mux_hrtimer_init(cpc, cpu); 12208 } 12209 12210 if (!pmu->start_txn) { 12211 if 
(pmu->pmu_enable) { 12212 /* 12213 * If we have pmu_enable/pmu_disable calls, install 12214 * transaction stubs that use that to try and batch 12215 * hardware accesses. 12216 */ 12217 pmu->start_txn = perf_pmu_start_txn; 12218 pmu->commit_txn = perf_pmu_commit_txn; 12219 pmu->cancel_txn = perf_pmu_cancel_txn; 12220 } else { 12221 pmu->start_txn = perf_pmu_nop_txn; 12222 pmu->commit_txn = perf_pmu_nop_int; 12223 pmu->cancel_txn = perf_pmu_nop_void; 12224 } 12225 } 12226 12227 if (!pmu->pmu_enable) { 12228 pmu->pmu_enable = perf_pmu_nop_void; 12229 pmu->pmu_disable = perf_pmu_nop_void; 12230 } 12231 12232 if (!pmu->check_period) 12233 pmu->check_period = perf_event_nop_int; 12234 12235 if (!pmu->event_idx) 12236 pmu->event_idx = perf_event_idx_default; 12237 12238 /* 12239 * Now that the PMU is complete, make it visible to perf_try_init_event(). 12240 */ 12241 if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) 12242 return -EINVAL; 12243 list_add_rcu(&pmu->entry, &pmus); 12244 12245 take_idr_id(pmu_type); 12246 _pmu = no_free_ptr(pmu); // let it rip 12247 return 0; 12248 } 12249 EXPORT_SYMBOL_GPL(perf_pmu_register); 12250 12251 void perf_pmu_unregister(struct pmu *pmu) 12252 { 12253 scoped_guard (mutex, &pmus_lock) { 12254 list_del_rcu(&pmu->entry); 12255 idr_remove(&pmu_idr, pmu->type); 12256 } 12257 12258 /* 12259 * We dereference the pmu list under both SRCU and regular RCU, so 12260 * synchronize against both of those. 12261 */ 12262 synchronize_srcu(&pmus_srcu); 12263 synchronize_rcu(); 12264 12265 perf_pmu_free(pmu); 12266 } 12267 EXPORT_SYMBOL_GPL(perf_pmu_unregister); 12268 12269 static inline bool has_extended_regs(struct perf_event *event) 12270 { 12271 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || 12272 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); 12273 } 12274 12275 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 12276 { 12277 struct perf_event_context *ctx = NULL; 12278 int ret; 12279 12280 if (!try_module_get(pmu->module)) 12281 return -ENODEV; 12282 12283 /* 12284 * A number of pmu->event_init() methods iterate the sibling_list to, 12285 * for example, validate if the group fits on the PMU. Therefore, 12286 * if this is a sibling event, acquire the ctx->mutex to protect 12287 * the sibling_list. 12288 */ 12289 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { 12290 /* 12291 * This ctx->mutex can nest when we're called through 12292 * inheritance. See the perf_event_ctx_lock_nested() comment. 
12293 */ 12294 ctx = perf_event_ctx_lock_nested(event->group_leader, 12295 SINGLE_DEPTH_NESTING); 12296 BUG_ON(!ctx); 12297 } 12298 12299 event->pmu = pmu; 12300 ret = pmu->event_init(event); 12301 12302 if (ctx) 12303 perf_event_ctx_unlock(event->group_leader, ctx); 12304 12305 if (ret) 12306 goto err_pmu; 12307 12308 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && 12309 has_extended_regs(event)) { 12310 ret = -EOPNOTSUPP; 12311 goto err_destroy; 12312 } 12313 12314 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && 12315 event_has_any_exclude_flag(event)) { 12316 ret = -EINVAL; 12317 goto err_destroy; 12318 } 12319 12320 if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { 12321 const struct cpumask *cpumask; 12322 struct cpumask *pmu_cpumask; 12323 int cpu; 12324 12325 cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); 12326 pmu_cpumask = perf_scope_cpumask(pmu->scope); 12327 12328 ret = -ENODEV; 12329 if (!pmu_cpumask || !cpumask) 12330 goto err_destroy; 12331 12332 cpu = cpumask_any_and(pmu_cpumask, cpumask); 12333 if (cpu >= nr_cpu_ids) 12334 goto err_destroy; 12335 12336 event->event_caps |= PERF_EV_CAP_READ_SCOPE; 12337 } 12338 12339 return 0; 12340 12341 err_destroy: 12342 if (event->destroy) { 12343 event->destroy(event); 12344 event->destroy = NULL; 12345 } 12346 12347 err_pmu: 12348 event->pmu = NULL; 12349 module_put(pmu->module); 12350 return ret; 12351 } 12352 12353 static struct pmu *perf_init_event(struct perf_event *event) 12354 { 12355 bool extended_type = false; 12356 struct pmu *pmu; 12357 int type, ret; 12358 12359 guard(srcu)(&pmus_srcu); 12360 12361 /* 12362 * Save original type before calling pmu->event_init() since certain 12363 * pmus overwrites event->attr.type to forward event to another pmu. 12364 */ 12365 event->orig_type = event->attr.type; 12366 12367 /* Try parent's PMU first: */ 12368 if (event->parent && event->parent->pmu) { 12369 pmu = event->parent->pmu; 12370 ret = perf_try_init_event(pmu, event); 12371 if (!ret) 12372 return pmu; 12373 } 12374 12375 /* 12376 * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE 12377 * are often aliases for PERF_TYPE_RAW. 
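 *
 * Illustrative example of the extended-type encoding handled
 * below (the PMU id 8 is made up):
 *
 *	attr.type   = PERF_TYPE_HARDWARE;
 *	attr.config = ((u64)8 << PERF_PMU_TYPE_SHIFT) |
 *		      PERF_COUNT_HW_CPU_CYCLES;
 *
 * The high config bits select the PMU with id 8 and attr.config
 * is masked back down to PERF_COUNT_HW_CPU_CYCLES; a zero high
 * part keeps the legacy behaviour and falls back to PERF_TYPE_RAW.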
12378 */ 12379 type = event->attr.type; 12380 if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { 12381 type = event->attr.config >> PERF_PMU_TYPE_SHIFT; 12382 if (!type) { 12383 type = PERF_TYPE_RAW; 12384 } else { 12385 extended_type = true; 12386 event->attr.config &= PERF_HW_EVENT_MASK; 12387 } 12388 } 12389 12390 again: 12391 scoped_guard (rcu) 12392 pmu = idr_find(&pmu_idr, type); 12393 if (pmu) { 12394 if (event->attr.type != type && type != PERF_TYPE_RAW && 12395 !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) 12396 return ERR_PTR(-ENOENT); 12397 12398 ret = perf_try_init_event(pmu, event); 12399 if (ret == -ENOENT && event->attr.type != type && !extended_type) { 12400 type = event->attr.type; 12401 goto again; 12402 } 12403 12404 if (ret) 12405 return ERR_PTR(ret); 12406 12407 return pmu; 12408 } 12409 12410 list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { 12411 ret = perf_try_init_event(pmu, event); 12412 if (!ret) 12413 return pmu; 12414 12415 if (ret != -ENOENT) 12416 return ERR_PTR(ret); 12417 } 12418 12419 return ERR_PTR(-ENOENT); 12420 } 12421 12422 static void attach_sb_event(struct perf_event *event) 12423 { 12424 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); 12425 12426 raw_spin_lock(&pel->lock); 12427 list_add_rcu(&event->sb_list, &pel->list); 12428 raw_spin_unlock(&pel->lock); 12429 } 12430 12431 /* 12432 * We keep a list of all !task (and therefore per-cpu) events 12433 * that need to receive side-band records. 12434 * 12435 * This avoids having to scan all the various PMU per-cpu contexts 12436 * looking for them. 12437 */ 12438 static void account_pmu_sb_event(struct perf_event *event) 12439 { 12440 if (is_sb_event(event)) 12441 attach_sb_event(event); 12442 } 12443 12444 /* Freq events need the tick to stay alive (see perf_event_task_tick). 
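 *
 * On NO_HZ_FULL kernels the first such event sets
 * TICK_DEP_BIT_PERF_EVENTS (under nr_freq_lock so it cannot race
 * with the unaccount side), keeping the tick alive so the sample
 * periods of freq events can still be re-adjusted.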
*/ 12445 static void account_freq_event_nohz(void) 12446 { 12447 #ifdef CONFIG_NO_HZ_FULL 12448 /* Lock so we don't race with concurrent unaccount */ 12449 spin_lock(&nr_freq_lock); 12450 if (atomic_inc_return(&nr_freq_events) == 1) 12451 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); 12452 spin_unlock(&nr_freq_lock); 12453 #endif 12454 } 12455 12456 static void account_freq_event(void) 12457 { 12458 if (tick_nohz_full_enabled()) 12459 account_freq_event_nohz(); 12460 else 12461 atomic_inc(&nr_freq_events); 12462 } 12463 12464 12465 static void account_event(struct perf_event *event) 12466 { 12467 bool inc = false; 12468 12469 if (event->parent) 12470 return; 12471 12472 if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) 12473 inc = true; 12474 if (event->attr.mmap || event->attr.mmap_data) 12475 atomic_inc(&nr_mmap_events); 12476 if (event->attr.build_id) 12477 atomic_inc(&nr_build_id_events); 12478 if (event->attr.comm) 12479 atomic_inc(&nr_comm_events); 12480 if (event->attr.namespaces) 12481 atomic_inc(&nr_namespaces_events); 12482 if (event->attr.cgroup) 12483 atomic_inc(&nr_cgroup_events); 12484 if (event->attr.task) 12485 atomic_inc(&nr_task_events); 12486 if (event->attr.freq) 12487 account_freq_event(); 12488 if (event->attr.context_switch) { 12489 atomic_inc(&nr_switch_events); 12490 inc = true; 12491 } 12492 if (has_branch_stack(event)) 12493 inc = true; 12494 if (is_cgroup_event(event)) 12495 inc = true; 12496 if (event->attr.ksymbol) 12497 atomic_inc(&nr_ksymbol_events); 12498 if (event->attr.bpf_event) 12499 atomic_inc(&nr_bpf_events); 12500 if (event->attr.text_poke) 12501 atomic_inc(&nr_text_poke_events); 12502 12503 if (inc) { 12504 /* 12505 * We need the mutex here because static_branch_enable() 12506 * must complete *before* the perf_sched_count increment 12507 * becomes visible. 12508 */ 12509 if (atomic_inc_not_zero(&perf_sched_count)) 12510 goto enabled; 12511 12512 mutex_lock(&perf_sched_mutex); 12513 if (!atomic_read(&perf_sched_count)) { 12514 static_branch_enable(&perf_sched_events); 12515 /* 12516 * Guarantee that all CPUs observe the key change and 12517 * call the perf scheduling hooks before proceeding to 12518 * install events that need them. 12519 */ 12520 synchronize_rcu(); 12521 } 12522 /* 12523 * Now that we have waited for the synchronize_rcu(), allow further 12524 * increments to bypass the mutex. 12525 */ 12526 atomic_inc(&perf_sched_count); 12527 mutex_unlock(&perf_sched_mutex); 12528 } 12529 enabled: 12530 12531 account_pmu_sb_event(event); 12532 } 12533 12534 /* 12535 * Allocate and initialize an event structure 12536 */ 12537 static struct perf_event * 12538 perf_event_alloc(struct perf_event_attr *attr, int cpu, 12539 struct task_struct *task, 12540 struct perf_event *group_leader, 12541 struct perf_event *parent_event, 12542 perf_overflow_handler_t overflow_handler, 12543 void *context, int cgroup_fd) 12544 { 12545 struct pmu *pmu; 12546 struct hw_perf_event *hwc; 12547 long err = -EINVAL; 12548 int node; 12549 12550 if ((unsigned)cpu >= nr_cpu_ids) { 12551 if (!task || cpu != -1) 12552 return ERR_PTR(-EINVAL); 12553 } 12554 if (attr->sigtrap && !task) { 12555 /* Requires a task: avoid signalling random tasks. */ 12556 return ERR_PTR(-EINVAL); 12557 } 12558 12559 node = (cpu >= 0) ?
cpu_to_node(cpu) : -1; 12560 struct perf_event *event __free(__free_event) = 12561 kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); 12562 if (!event) 12563 return ERR_PTR(-ENOMEM); 12564 12565 /* 12566 * Single events are their own group leaders, with an 12567 * empty sibling list: 12568 */ 12569 if (!group_leader) 12570 group_leader = event; 12571 12572 mutex_init(&event->child_mutex); 12573 INIT_LIST_HEAD(&event->child_list); 12574 12575 INIT_LIST_HEAD(&event->event_entry); 12576 INIT_LIST_HEAD(&event->sibling_list); 12577 INIT_LIST_HEAD(&event->active_list); 12578 init_event_group(event); 12579 INIT_LIST_HEAD(&event->rb_entry); 12580 INIT_LIST_HEAD(&event->active_entry); 12581 INIT_LIST_HEAD(&event->addr_filters.list); 12582 INIT_HLIST_NODE(&event->hlist_entry); 12583 12584 12585 init_waitqueue_head(&event->waitq); 12586 init_irq_work(&event->pending_irq, perf_pending_irq); 12587 event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); 12588 init_task_work(&event->pending_task, perf_pending_task); 12589 12590 mutex_init(&event->mmap_mutex); 12591 raw_spin_lock_init(&event->addr_filters.lock); 12592 12593 atomic_long_set(&event->refcount, 1); 12594 event->cpu = cpu; 12595 event->attr = *attr; 12596 event->group_leader = group_leader; 12597 event->pmu = NULL; 12598 event->oncpu = -1; 12599 12600 event->parent = parent_event; 12601 12602 event->ns = get_pid_ns(task_active_pid_ns(current)); 12603 event->id = atomic64_inc_return(&perf_event_id); 12604 12605 event->state = PERF_EVENT_STATE_INACTIVE; 12606 12607 if (parent_event) 12608 event->event_caps = parent_event->event_caps; 12609 12610 if (task) { 12611 event->attach_state = PERF_ATTACH_TASK; 12612 /* 12613 * XXX pmu::event_init needs to know what task to account to 12614 * and we cannot use the ctx information because we need the 12615 * pmu before we get a ctx. 12616 */ 12617 event->hw.target = get_task_struct(task); 12618 } 12619 12620 event->clock = &local_clock; 12621 if (parent_event) 12622 event->clock = parent_event->clock; 12623 12624 if (!overflow_handler && parent_event) { 12625 overflow_handler = parent_event->overflow_handler; 12626 context = parent_event->overflow_handler_context; 12627 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) 12628 if (parent_event->prog) { 12629 struct bpf_prog *prog = parent_event->prog; 12630 12631 bpf_prog_inc(prog); 12632 event->prog = prog; 12633 } 12634 #endif 12635 } 12636 12637 if (overflow_handler) { 12638 event->overflow_handler = overflow_handler; 12639 event->overflow_handler_context = context; 12640 } else if (is_write_backward(event)){ 12641 event->overflow_handler = perf_event_output_backward; 12642 event->overflow_handler_context = NULL; 12643 } else { 12644 event->overflow_handler = perf_event_output_forward; 12645 event->overflow_handler_context = NULL; 12646 } 12647 12648 perf_event__state_init(event); 12649 12650 pmu = NULL; 12651 12652 hwc = &event->hw; 12653 hwc->sample_period = attr->sample_period; 12654 if (attr->freq && attr->sample_freq) 12655 hwc->sample_period = 1; 12656 hwc->last_period = hwc->sample_period; 12657 12658 local64_set(&hwc->period_left, hwc->sample_period); 12659 12660 /* 12661 * We do not support PERF_SAMPLE_READ on inherited events unless 12662 * PERF_SAMPLE_TID is also selected, which allows inherited events to 12663 * collect per-thread samples. 12664 * See perf_output_read(). 
12665 */ 12666 if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) 12667 return ERR_PTR(-EINVAL); 12668 12669 if (!has_branch_stack(event)) 12670 event->attr.branch_sample_type = 0; 12671 12672 pmu = perf_init_event(event); 12673 if (IS_ERR(pmu)) 12674 return (void*)pmu; 12675 12676 /* 12677 * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). 12678 * The attach should be right after the perf_init_event(). 12679 * Otherwise, the __free_event() would mistakenly detach the non-exist 12680 * perf_ctx_data because of the other errors between them. 12681 */ 12682 if (event->attach_state & PERF_ATTACH_TASK_DATA) { 12683 err = attach_perf_ctx_data(event); 12684 if (err) 12685 return ERR_PTR(err); 12686 } 12687 12688 /* 12689 * Disallow uncore-task events. Similarly, disallow uncore-cgroup 12690 * events (they don't make sense as the cgroup will be different 12691 * on other CPUs in the uncore mask). 12692 */ 12693 if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) 12694 return ERR_PTR(-EINVAL); 12695 12696 if (event->attr.aux_output && 12697 (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || 12698 event->attr.aux_pause || event->attr.aux_resume)) 12699 return ERR_PTR(-EOPNOTSUPP); 12700 12701 if (event->attr.aux_pause && event->attr.aux_resume) 12702 return ERR_PTR(-EINVAL); 12703 12704 if (event->attr.aux_start_paused) { 12705 if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) 12706 return ERR_PTR(-EOPNOTSUPP); 12707 event->hw.aux_paused = 1; 12708 } 12709 12710 if (cgroup_fd != -1) { 12711 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); 12712 if (err) 12713 return ERR_PTR(err); 12714 } 12715 12716 err = exclusive_event_init(event); 12717 if (err) 12718 return ERR_PTR(err); 12719 12720 if (has_addr_filter(event)) { 12721 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, 12722 sizeof(struct perf_addr_filter_range), 12723 GFP_KERNEL); 12724 if (!event->addr_filter_ranges) 12725 return ERR_PTR(-ENOMEM); 12726 12727 /* 12728 * Clone the parent's vma offsets: they are valid until exec() 12729 * even if the mm is not shared with the parent. 12730 */ 12731 if (event->parent) { 12732 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); 12733 12734 raw_spin_lock_irq(&ifh->lock); 12735 memcpy(event->addr_filter_ranges, 12736 event->parent->addr_filter_ranges, 12737 pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); 12738 raw_spin_unlock_irq(&ifh->lock); 12739 } 12740 12741 /* force hw sync on the address filters */ 12742 event->addr_filters_gen = 1; 12743 } 12744 12745 if (!event->parent) { 12746 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 12747 err = get_callchain_buffers(attr->sample_max_stack); 12748 if (err) 12749 return ERR_PTR(err); 12750 event->attach_state |= PERF_ATTACH_CALLCHAIN; 12751 } 12752 } 12753 12754 err = security_perf_event_alloc(event); 12755 if (err) 12756 return ERR_PTR(err); 12757 12758 /* symmetric to unaccount_event() in _free_event() */ 12759 account_event(event); 12760 12761 return_ptr(event); 12762 } 12763 12764 static int perf_copy_attr(struct perf_event_attr __user *uattr, 12765 struct perf_event_attr *attr) 12766 { 12767 u32 size; 12768 int ret; 12769 12770 /* Zero the full structure, so that a short copy will be nice. 
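 * Together with copy_struct_from_user() below, this implements the sized
 * perf_event_attr ABI: an older, smaller attr is zero-extended, while a newer,
 * larger one is accepted only if the bytes this kernel does not know about are
 * all zero (otherwise -E2BIG is returned via err_size).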
*/ 12771 memset(attr, 0, sizeof(*attr)); 12772 12773 ret = get_user(size, &uattr->size); 12774 if (ret) 12775 return ret; 12776 12777 /* ABI compatibility quirk: */ 12778 if (!size) 12779 size = PERF_ATTR_SIZE_VER0; 12780 if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) 12781 goto err_size; 12782 12783 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); 12784 if (ret) { 12785 if (ret == -E2BIG) 12786 goto err_size; 12787 return ret; 12788 } 12789 12790 attr->size = size; 12791 12792 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) 12793 return -EINVAL; 12794 12795 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 12796 return -EINVAL; 12797 12798 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 12799 return -EINVAL; 12800 12801 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { 12802 u64 mask = attr->branch_sample_type; 12803 12804 /* only using defined bits */ 12805 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) 12806 return -EINVAL; 12807 12808 /* at least one branch bit must be set */ 12809 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 12810 return -EINVAL; 12811 12812 /* propagate priv level, when not set for branch */ 12813 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 12814 12815 /* exclude_kernel checked on syscall entry */ 12816 if (!attr->exclude_kernel) 12817 mask |= PERF_SAMPLE_BRANCH_KERNEL; 12818 12819 if (!attr->exclude_user) 12820 mask |= PERF_SAMPLE_BRANCH_USER; 12821 12822 if (!attr->exclude_hv) 12823 mask |= PERF_SAMPLE_BRANCH_HV; 12824 /* 12825 * adjust user setting (for HW filter setup) 12826 */ 12827 attr->branch_sample_type = mask; 12828 } 12829 /* privileged levels capture (kernel, hv): check permissions */ 12830 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { 12831 ret = perf_allow_kernel(); 12832 if (ret) 12833 return ret; 12834 } 12835 } 12836 12837 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 12838 ret = perf_reg_validate(attr->sample_regs_user); 12839 if (ret) 12840 return ret; 12841 } 12842 12843 if (attr->sample_type & PERF_SAMPLE_STACK_USER) { 12844 if (!arch_perf_have_user_stack_dump()) 12845 return -ENOSYS; 12846 12847 /* 12848 * We have __u32 type for the size, but so far 12849 * we can only use __u16 as maximum due to the 12850 * __u16 sample size limit. 
12851 */ 12852 if (attr->sample_stack_user >= USHRT_MAX) 12853 return -EINVAL; 12854 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) 12855 return -EINVAL; 12856 } 12857 12858 if (!attr->sample_max_stack) 12859 attr->sample_max_stack = sysctl_perf_event_max_stack; 12860 12861 if (attr->sample_type & PERF_SAMPLE_REGS_INTR) 12862 ret = perf_reg_validate(attr->sample_regs_intr); 12863 12864 #ifndef CONFIG_CGROUP_PERF 12865 if (attr->sample_type & PERF_SAMPLE_CGROUP) 12866 return -EINVAL; 12867 #endif 12868 if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && 12869 (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) 12870 return -EINVAL; 12871 12872 if (!attr->inherit && attr->inherit_thread) 12873 return -EINVAL; 12874 12875 if (attr->remove_on_exec && attr->enable_on_exec) 12876 return -EINVAL; 12877 12878 if (attr->sigtrap && !attr->remove_on_exec) 12879 return -EINVAL; 12880 12881 out: 12882 return ret; 12883 12884 err_size: 12885 put_user(sizeof(*attr), &uattr->size); 12886 ret = -E2BIG; 12887 goto out; 12888 } 12889 12890 static void mutex_lock_double(struct mutex *a, struct mutex *b) 12891 { 12892 if (b < a) 12893 swap(a, b); 12894 12895 mutex_lock(a); 12896 mutex_lock_nested(b, SINGLE_DEPTH_NESTING); 12897 } 12898 12899 static int 12900 perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 12901 { 12902 struct perf_buffer *rb = NULL; 12903 int ret = -EINVAL; 12904 12905 if (!output_event) { 12906 mutex_lock(&event->mmap_mutex); 12907 goto set; 12908 } 12909 12910 /* don't allow circular references */ 12911 if (event == output_event) 12912 goto out; 12913 12914 /* 12915 * Don't allow cross-cpu buffers 12916 */ 12917 if (output_event->cpu != event->cpu) 12918 goto out; 12919 12920 /* 12921 * If its not a per-cpu rb, it must be the same task. 12922 */ 12923 if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) 12924 goto out; 12925 12926 /* 12927 * Mixing clocks in the same buffer is trouble you don't need. 12928 */ 12929 if (output_event->clock != event->clock) 12930 goto out; 12931 12932 /* 12933 * Either writing ring buffer from beginning or from end. 12934 * Mixing is not allowed. 12935 */ 12936 if (is_write_backward(output_event) != is_write_backward(event)) 12937 goto out; 12938 12939 /* 12940 * If both events generate aux data, they must be on the same PMU 12941 */ 12942 if (has_aux(event) && has_aux(output_event) && 12943 event->pmu != output_event->pmu) 12944 goto out; 12945 12946 /* 12947 * Hold both mmap_mutex to serialize against perf_mmap_close(). Since 12948 * output_event is already on rb->event_list, and the list iteration 12949 * restarts after every removal, it is guaranteed this new event is 12950 * observed *OR* if output_event is already removed, it's guaranteed we 12951 * observe !rb->mmap_count. 
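 *
 * Note that mutex_lock_double() acquires the two mutexes in address order, so
 * two perf_event_set_output() calls redirecting events at each other cannot
 * ABBA-deadlock on these locks.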
12952 */ 12953 mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); 12954 set: 12955 /* Can't redirect output if we've got an active mmap() */ 12956 if (atomic_read(&event->mmap_count)) 12957 goto unlock; 12958 12959 if (output_event) { 12960 /* get the rb we want to redirect to */ 12961 rb = ring_buffer_get(output_event); 12962 if (!rb) 12963 goto unlock; 12964 12965 /* did we race against perf_mmap_close() */ 12966 if (!atomic_read(&rb->mmap_count)) { 12967 ring_buffer_put(rb); 12968 goto unlock; 12969 } 12970 } 12971 12972 ring_buffer_attach(event, rb); 12973 12974 ret = 0; 12975 unlock: 12976 mutex_unlock(&event->mmap_mutex); 12977 if (output_event) 12978 mutex_unlock(&output_event->mmap_mutex); 12979 12980 out: 12981 return ret; 12982 } 12983 12984 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) 12985 { 12986 bool nmi_safe = false; 12987 12988 switch (clk_id) { 12989 case CLOCK_MONOTONIC: 12990 event->clock = &ktime_get_mono_fast_ns; 12991 nmi_safe = true; 12992 break; 12993 12994 case CLOCK_MONOTONIC_RAW: 12995 event->clock = &ktime_get_raw_fast_ns; 12996 nmi_safe = true; 12997 break; 12998 12999 case CLOCK_REALTIME: 13000 event->clock = &ktime_get_real_ns; 13001 break; 13002 13003 case CLOCK_BOOTTIME: 13004 event->clock = &ktime_get_boottime_ns; 13005 break; 13006 13007 case CLOCK_TAI: 13008 event->clock = &ktime_get_clocktai_ns; 13009 break; 13010 13011 default: 13012 return -EINVAL; 13013 } 13014 13015 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) 13016 return -EINVAL; 13017 13018 return 0; 13019 } 13020 13021 static bool 13022 perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) 13023 { 13024 unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; 13025 bool is_capable = perfmon_capable(); 13026 13027 if (attr->sigtrap) { 13028 /* 13029 * perf_event_attr::sigtrap sends signals to the other task. 13030 * Require the current task to also have CAP_KILL. 13031 */ 13032 rcu_read_lock(); 13033 is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); 13034 rcu_read_unlock(); 13035 13036 /* 13037 * If the required capabilities aren't available, checks for 13038 * ptrace permissions: upgrade to ATTACH, since sending signals 13039 * can effectively change the target task. 13040 */ 13041 ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; 13042 } 13043 13044 /* 13045 * Preserve ptrace permission check for backwards compatibility. The 13046 * ptrace check also includes checks that the current task and other 13047 * task have matching uids, and is therefore not done here explicitly. 
13048 */ 13049 return is_capable || ptrace_may_access(task, ptrace_mode); 13050 } 13051 13052 /** 13053 * sys_perf_event_open - open a performance event, associate it to a task/cpu 13054 * 13055 * @attr_uptr: event_id type attributes for monitoring/sampling 13056 * @pid: target pid 13057 * @cpu: target cpu 13058 * @group_fd: group leader event fd 13059 * @flags: perf event open flags 13060 */ 13061 SYSCALL_DEFINE5(perf_event_open, 13062 struct perf_event_attr __user *, attr_uptr, 13063 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 13064 { 13065 struct perf_event *group_leader = NULL, *output_event = NULL; 13066 struct perf_event_pmu_context *pmu_ctx; 13067 struct perf_event *event, *sibling; 13068 struct perf_event_attr attr; 13069 struct perf_event_context *ctx; 13070 struct file *event_file = NULL; 13071 struct task_struct *task = NULL; 13072 struct pmu *pmu; 13073 int event_fd; 13074 int move_group = 0; 13075 int err; 13076 int f_flags = O_RDWR; 13077 int cgroup_fd = -1; 13078 13079 /* for future expandability... */ 13080 if (flags & ~PERF_FLAG_ALL) 13081 return -EINVAL; 13082 13083 err = perf_copy_attr(attr_uptr, &attr); 13084 if (err) 13085 return err; 13086 13087 /* Do we allow access to perf_event_open(2) ? */ 13088 err = security_perf_event_open(PERF_SECURITY_OPEN); 13089 if (err) 13090 return err; 13091 13092 if (!attr.exclude_kernel) { 13093 err = perf_allow_kernel(); 13094 if (err) 13095 return err; 13096 } 13097 13098 if (attr.namespaces) { 13099 if (!perfmon_capable()) 13100 return -EACCES; 13101 } 13102 13103 if (attr.freq) { 13104 if (attr.sample_freq > sysctl_perf_event_sample_rate) 13105 return -EINVAL; 13106 } else { 13107 if (attr.sample_period & (1ULL << 63)) 13108 return -EINVAL; 13109 } 13110 13111 /* Only privileged users can get physical addresses */ 13112 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { 13113 err = perf_allow_kernel(); 13114 if (err) 13115 return err; 13116 } 13117 13118 /* REGS_INTR can leak data, lockdown must prevent this */ 13119 if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { 13120 err = security_locked_down(LOCKDOWN_PERF); 13121 if (err) 13122 return err; 13123 } 13124 13125 /* 13126 * In cgroup mode, the pid argument is used to pass the fd 13127 * opened to the cgroup directory in cgroupfs. The cpu argument 13128 * designates the cpu on which to monitor threads from that 13129 * cgroup. 
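 * For example, with PERF_FLAG_PID_CGROUP set, pid carries an fd opened on the
 * cgroup's directory (e.g. under /sys/fs/cgroup) and the event counts that
 * cgroup's tasks only while they run on the given cpu, which is why both a
 * cgroup fd and a valid cpu are required below.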
13130 */ 13131 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 13132 return -EINVAL; 13133 13134 if (flags & PERF_FLAG_FD_CLOEXEC) 13135 f_flags |= O_CLOEXEC; 13136 13137 event_fd = get_unused_fd_flags(f_flags); 13138 if (event_fd < 0) 13139 return event_fd; 13140 13141 CLASS(fd, group)(group_fd); // group_fd == -1 => empty 13142 if (group_fd != -1) { 13143 if (!is_perf_file(group)) { 13144 err = -EBADF; 13145 goto err_fd; 13146 } 13147 group_leader = fd_file(group)->private_data; 13148 if (flags & PERF_FLAG_FD_OUTPUT) 13149 output_event = group_leader; 13150 if (flags & PERF_FLAG_FD_NO_GROUP) 13151 group_leader = NULL; 13152 } 13153 13154 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { 13155 task = find_lively_task_by_vpid(pid); 13156 if (IS_ERR(task)) { 13157 err = PTR_ERR(task); 13158 goto err_fd; 13159 } 13160 } 13161 13162 if (task && group_leader && 13163 group_leader->attr.inherit != attr.inherit) { 13164 err = -EINVAL; 13165 goto err_task; 13166 } 13167 13168 if (flags & PERF_FLAG_PID_CGROUP) 13169 cgroup_fd = pid; 13170 13171 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 13172 NULL, NULL, cgroup_fd); 13173 if (IS_ERR(event)) { 13174 err = PTR_ERR(event); 13175 goto err_task; 13176 } 13177 13178 if (is_sampling_event(event)) { 13179 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 13180 err = -EOPNOTSUPP; 13181 goto err_alloc; 13182 } 13183 } 13184 13185 /* 13186 * Special case software events and allow them to be part of 13187 * any hardware group. 13188 */ 13189 pmu = event->pmu; 13190 13191 if (attr.use_clockid) { 13192 err = perf_event_set_clock(event, attr.clockid); 13193 if (err) 13194 goto err_alloc; 13195 } 13196 13197 if (pmu->task_ctx_nr == perf_sw_context) 13198 event->event_caps |= PERF_EV_CAP_SOFTWARE; 13199 13200 if (task) { 13201 err = down_read_interruptible(&task->signal->exec_update_lock); 13202 if (err) 13203 goto err_alloc; 13204 13205 /* 13206 * We must hold exec_update_lock across this and any potential 13207 * perf_install_in_context() call for this new event to 13208 * serialize against exec() altering our credentials (and the 13209 * perf_event_exit_task() that could imply). 13210 */ 13211 err = -EACCES; 13212 if (!perf_check_permission(&attr, task)) 13213 goto err_cred; 13214 } 13215 13216 /* 13217 * Get the target context (task or percpu): 13218 */ 13219 ctx = find_get_context(task, event); 13220 if (IS_ERR(ctx)) { 13221 err = PTR_ERR(ctx); 13222 goto err_cred; 13223 } 13224 13225 mutex_lock(&ctx->mutex); 13226 13227 if (ctx->task == TASK_TOMBSTONE) { 13228 err = -ESRCH; 13229 goto err_locked; 13230 } 13231 13232 if (!task) { 13233 /* 13234 * Check if the @cpu we're creating an event for is online. 13235 * 13236 * We use the perf_cpu_context::ctx::mutex to serialize against 13237 * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
13238 */ 13239 struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); 13240 13241 if (!cpuctx->online) { 13242 err = -ENODEV; 13243 goto err_locked; 13244 } 13245 } 13246 13247 if (group_leader) { 13248 err = -EINVAL; 13249 13250 /* 13251 * Do not allow a recursive hierarchy (this new sibling 13252 * becoming part of another group-sibling): 13253 */ 13254 if (group_leader->group_leader != group_leader) 13255 goto err_locked; 13256 13257 /* All events in a group should have the same clock */ 13258 if (group_leader->clock != event->clock) 13259 goto err_locked; 13260 13261 /* 13262 * Make sure we're both events for the same CPU; 13263 * grouping events for different CPUs is broken; since 13264 * you can never concurrently schedule them anyhow. 13265 */ 13266 if (group_leader->cpu != event->cpu) 13267 goto err_locked; 13268 13269 /* 13270 * Make sure we're both on the same context; either task or cpu. 13271 */ 13272 if (group_leader->ctx != ctx) 13273 goto err_locked; 13274 13275 /* 13276 * Only a group leader can be exclusive or pinned 13277 */ 13278 if (attr.exclusive || attr.pinned) 13279 goto err_locked; 13280 13281 if (is_software_event(event) && 13282 !in_software_context(group_leader)) { 13283 /* 13284 * If the event is a sw event, but the group_leader 13285 * is on hw context. 13286 * 13287 * Allow the addition of software events to hw 13288 * groups, this is safe because software events 13289 * never fail to schedule. 13290 * 13291 * Note the comment that goes with struct 13292 * perf_event_pmu_context. 13293 */ 13294 pmu = group_leader->pmu_ctx->pmu; 13295 } else if (!is_software_event(event)) { 13296 if (is_software_event(group_leader) && 13297 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { 13298 /* 13299 * In case the group is a pure software group, and we 13300 * try to add a hardware event, move the whole group to 13301 * the hardware context. 13302 */ 13303 move_group = 1; 13304 } 13305 13306 /* Don't allow group of multiple hw events from different pmus */ 13307 if (!in_software_context(group_leader) && 13308 group_leader->pmu_ctx->pmu != pmu) 13309 goto err_locked; 13310 } 13311 } 13312 13313 /* 13314 * Now that we're certain of the pmu; find the pmu_ctx. 13315 */ 13316 pmu_ctx = find_get_pmu_context(pmu, ctx, event); 13317 if (IS_ERR(pmu_ctx)) { 13318 err = PTR_ERR(pmu_ctx); 13319 goto err_locked; 13320 } 13321 event->pmu_ctx = pmu_ctx; 13322 13323 if (output_event) { 13324 err = perf_event_set_output(event, output_event); 13325 if (err) 13326 goto err_context; 13327 } 13328 13329 if (!perf_event_validate_size(event)) { 13330 err = -E2BIG; 13331 goto err_context; 13332 } 13333 13334 if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { 13335 err = -EINVAL; 13336 goto err_context; 13337 } 13338 13339 /* 13340 * Must be under the same ctx::mutex as perf_install_in_context(), 13341 * because we need to serialize with concurrent event creation. 13342 */ 13343 if (!exclusive_event_installable(event, ctx)) { 13344 err = -EBUSY; 13345 goto err_context; 13346 } 13347 13348 WARN_ON_ONCE(ctx->parent_ctx); 13349 13350 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); 13351 if (IS_ERR(event_file)) { 13352 err = PTR_ERR(event_file); 13353 event_file = NULL; 13354 goto err_context; 13355 } 13356 13357 /* 13358 * This is the point on no return; we cannot fail hereafter. This is 13359 * where we start modifying current state. 
13360 */ 13361 13362 if (move_group) { 13363 perf_remove_from_context(group_leader, 0); 13364 put_pmu_ctx(group_leader->pmu_ctx); 13365 13366 for_each_sibling_event(sibling, group_leader) { 13367 perf_remove_from_context(sibling, 0); 13368 put_pmu_ctx(sibling->pmu_ctx); 13369 } 13370 13371 /* 13372 * Install the group siblings before the group leader. 13373 * 13374 * Because a group leader will try and install the entire group 13375 * (through the sibling list, which is still intact), we can 13376 * end up with siblings installed in the wrong context. 13377 * 13378 * By installing siblings first we NO-OP because they're not 13379 * reachable through the group lists. 13380 */ 13381 for_each_sibling_event(sibling, group_leader) { 13382 sibling->pmu_ctx = pmu_ctx; 13383 get_pmu_ctx(pmu_ctx); 13384 perf_event__state_init(sibling); 13385 perf_install_in_context(ctx, sibling, sibling->cpu); 13386 } 13387 13388 /* 13389 * Removing from the context ends up with a disabled 13390 * event. What we want here is the event in its initial 13391 * startup state, ready to be added into the new context. 13392 */ 13393 group_leader->pmu_ctx = pmu_ctx; 13394 get_pmu_ctx(pmu_ctx); 13395 perf_event__state_init(group_leader); 13396 perf_install_in_context(ctx, group_leader, group_leader->cpu); 13397 } 13398 13399 /* 13400 * Precalculate sample_data sizes; do while holding ctx::mutex such 13401 * that we're serialized against further additions and before 13402 * perf_install_in_context() which is the point the event is active and 13403 * can use these values. 13404 */ 13405 perf_event__header_size(event); 13406 perf_event__id_header_size(event); 13407 13408 event->owner = current; 13409 13410 perf_install_in_context(ctx, event, event->cpu); 13411 perf_unpin_context(ctx); 13412 13413 mutex_unlock(&ctx->mutex); 13414 13415 if (task) { 13416 up_read(&task->signal->exec_update_lock); 13417 put_task_struct(task); 13418 } 13419 13420 mutex_lock(&current->perf_event_mutex); 13421 list_add_tail(&event->owner_entry, &current->perf_event_list); 13422 mutex_unlock(&current->perf_event_mutex); 13423 13424 /* 13425 * File reference in group guarantees that group_leader has been 13426 * kept alive until we place the new event on the sibling_list. 13427 * This ensures destruction of the group leader will find 13428 * the pointer to itself in perf_group_detach().
13429 */ 13430 fd_install(event_fd, event_file); 13431 return event_fd; 13432 13433 err_context: 13434 put_pmu_ctx(event->pmu_ctx); 13435 event->pmu_ctx = NULL; /* _free_event() */ 13436 err_locked: 13437 mutex_unlock(&ctx->mutex); 13438 perf_unpin_context(ctx); 13439 put_ctx(ctx); 13440 err_cred: 13441 if (task) 13442 up_read(&task->signal->exec_update_lock); 13443 err_alloc: 13444 free_event(event); 13445 err_task: 13446 if (task) 13447 put_task_struct(task); 13448 err_fd: 13449 put_unused_fd(event_fd); 13450 return err; 13451 } 13452 13453 /** 13454 * perf_event_create_kernel_counter 13455 * 13456 * @attr: attributes of the counter to create 13457 * @cpu: cpu in which the counter is bound 13458 * @task: task to profile (NULL for percpu) 13459 * @overflow_handler: callback to trigger when we hit the event 13460 * @context: context data could be used in overflow_handler callback 13461 */ 13462 struct perf_event * 13463 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 13464 struct task_struct *task, 13465 perf_overflow_handler_t overflow_handler, 13466 void *context) 13467 { 13468 struct perf_event_pmu_context *pmu_ctx; 13469 struct perf_event_context *ctx; 13470 struct perf_event *event; 13471 struct pmu *pmu; 13472 int err; 13473 13474 /* 13475 * Grouping is not supported for kernel events, neither is 'AUX', 13476 * make sure the caller's intentions are adjusted. 13477 */ 13478 if (attr->aux_output || attr->aux_action) 13479 return ERR_PTR(-EINVAL); 13480 13481 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 13482 overflow_handler, context, -1); 13483 if (IS_ERR(event)) { 13484 err = PTR_ERR(event); 13485 goto err; 13486 } 13487 13488 /* Mark owner so we could distinguish it from user events. */ 13489 event->owner = TASK_TOMBSTONE; 13490 pmu = event->pmu; 13491 13492 if (pmu->task_ctx_nr == perf_sw_context) 13493 event->event_caps |= PERF_EV_CAP_SOFTWARE; 13494 13495 /* 13496 * Get the target context (task or percpu): 13497 */ 13498 ctx = find_get_context(task, event); 13499 if (IS_ERR(ctx)) { 13500 err = PTR_ERR(ctx); 13501 goto err_alloc; 13502 } 13503 13504 WARN_ON_ONCE(ctx->parent_ctx); 13505 mutex_lock(&ctx->mutex); 13506 if (ctx->task == TASK_TOMBSTONE) { 13507 err = -ESRCH; 13508 goto err_unlock; 13509 } 13510 13511 pmu_ctx = find_get_pmu_context(pmu, ctx, event); 13512 if (IS_ERR(pmu_ctx)) { 13513 err = PTR_ERR(pmu_ctx); 13514 goto err_unlock; 13515 } 13516 event->pmu_ctx = pmu_ctx; 13517 13518 if (!task) { 13519 /* 13520 * Check if the @cpu we're creating an event for is online. 13521 * 13522 * We use the perf_cpu_context::ctx::mutex to serialize against 13523 * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
13524 */ 13525 struct perf_cpu_context *cpuctx = 13526 container_of(ctx, struct perf_cpu_context, ctx); 13527 if (!cpuctx->online) { 13528 err = -ENODEV; 13529 goto err_pmu_ctx; 13530 } 13531 } 13532 13533 if (!exclusive_event_installable(event, ctx)) { 13534 err = -EBUSY; 13535 goto err_pmu_ctx; 13536 } 13537 13538 perf_install_in_context(ctx, event, event->cpu); 13539 perf_unpin_context(ctx); 13540 mutex_unlock(&ctx->mutex); 13541 13542 return event; 13543 13544 err_pmu_ctx: 13545 put_pmu_ctx(pmu_ctx); 13546 event->pmu_ctx = NULL; /* _free_event() */ 13547 err_unlock: 13548 mutex_unlock(&ctx->mutex); 13549 perf_unpin_context(ctx); 13550 put_ctx(ctx); 13551 err_alloc: 13552 free_event(event); 13553 err: 13554 return ERR_PTR(err); 13555 } 13556 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 13557 13558 static void __perf_pmu_remove(struct perf_event_context *ctx, 13559 int cpu, struct pmu *pmu, 13560 struct perf_event_groups *groups, 13561 struct list_head *events) 13562 { 13563 struct perf_event *event, *sibling; 13564 13565 perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { 13566 perf_remove_from_context(event, 0); 13567 put_pmu_ctx(event->pmu_ctx); 13568 list_add(&event->migrate_entry, events); 13569 13570 for_each_sibling_event(sibling, event) { 13571 perf_remove_from_context(sibling, 0); 13572 put_pmu_ctx(sibling->pmu_ctx); 13573 list_add(&sibling->migrate_entry, events); 13574 } 13575 } 13576 } 13577 13578 static void __perf_pmu_install_event(struct pmu *pmu, 13579 struct perf_event_context *ctx, 13580 int cpu, struct perf_event *event) 13581 { 13582 struct perf_event_pmu_context *epc; 13583 struct perf_event_context *old_ctx = event->ctx; 13584 13585 get_ctx(ctx); /* normally find_get_context() */ 13586 13587 event->cpu = cpu; 13588 epc = find_get_pmu_context(pmu, ctx, event); 13589 event->pmu_ctx = epc; 13590 13591 if (event->state >= PERF_EVENT_STATE_OFF) 13592 event->state = PERF_EVENT_STATE_INACTIVE; 13593 perf_install_in_context(ctx, event, cpu); 13594 13595 /* 13596 * Now that event->ctx is updated and visible, put the old ctx. 13597 */ 13598 put_ctx(old_ctx); 13599 } 13600 13601 static void __perf_pmu_install(struct perf_event_context *ctx, 13602 int cpu, struct pmu *pmu, struct list_head *events) 13603 { 13604 struct perf_event *event, *tmp; 13605 13606 /* 13607 * Re-instate events in 2 passes. 13608 * 13609 * Skip over group leaders and only install siblings on this first 13610 * pass, siblings will not get enabled without a leader, however a 13611 * leader will enable its siblings, even if those are still on the old 13612 * context. 13613 */ 13614 list_for_each_entry_safe(event, tmp, events, migrate_entry) { 13615 if (event->group_leader == event) 13616 continue; 13617 13618 list_del(&event->migrate_entry); 13619 __perf_pmu_install_event(pmu, ctx, cpu, event); 13620 } 13621 13622 /* 13623 * Once all the siblings are setup properly, install the group leaders 13624 * to make it go. 13625 */ 13626 list_for_each_entry_safe(event, tmp, events, migrate_entry) { 13627 list_del(&event->migrate_entry); 13628 __perf_pmu_install_event(pmu, ctx, cpu, event); 13629 } 13630 } 13631 13632 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) 13633 { 13634 struct perf_event_context *src_ctx, *dst_ctx; 13635 LIST_HEAD(events); 13636 13637 /* 13638 * Since per-cpu context is persistent, no need to grab an extra 13639 * reference. 
13640 */ 13641 src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; 13642 dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; 13643 13644 /* 13645 * See perf_event_ctx_lock() for comments on the details 13646 * of swizzling perf_event::ctx. 13647 */ 13648 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); 13649 13650 __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); 13651 __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); 13652 13653 if (!list_empty(&events)) { 13654 /* 13655 * Wait for the events to quiesce before re-instating them. 13656 */ 13657 synchronize_rcu(); 13658 13659 __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); 13660 } 13661 13662 mutex_unlock(&dst_ctx->mutex); 13663 mutex_unlock(&src_ctx->mutex); 13664 } 13665 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); 13666 13667 static void sync_child_event(struct perf_event *child_event) 13668 { 13669 struct perf_event *parent_event = child_event->parent; 13670 u64 child_val; 13671 13672 if (child_event->attr.inherit_stat) { 13673 struct task_struct *task = child_event->ctx->task; 13674 13675 if (task && task != TASK_TOMBSTONE) 13676 perf_event_read_event(child_event, task); 13677 } 13678 13679 child_val = perf_event_count(child_event, false); 13680 13681 /* 13682 * Add back the child's count to the parent's count: 13683 */ 13684 atomic64_add(child_val, &parent_event->child_count); 13685 atomic64_add(child_event->total_time_enabled, 13686 &parent_event->child_total_time_enabled); 13687 atomic64_add(child_event->total_time_running, 13688 &parent_event->child_total_time_running); 13689 } 13690 13691 static void 13692 perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) 13693 { 13694 struct perf_event *parent_event = event->parent; 13695 unsigned long detach_flags = 0; 13696 13697 if (parent_event) { 13698 /* 13699 * Do not destroy the 'original' grouping; because of the 13700 * context switch optimization the original events could've 13701 * ended up in a random child task. 13702 * 13703 * If we were to destroy the original group, all group related 13704 * operations would cease to function properly after this 13705 * random child dies. 13706 * 13707 * Do destroy all inherited groups, we don't care about those 13708 * and being thorough is better. 13709 */ 13710 detach_flags = DETACH_GROUP | DETACH_CHILD; 13711 mutex_lock(&parent_event->child_mutex); 13712 } 13713 13714 perf_remove_from_context(event, detach_flags | DETACH_EXIT); 13715 13716 /* 13717 * Child events can be freed. 13718 */ 13719 if (parent_event) { 13720 mutex_unlock(&parent_event->child_mutex); 13721 /* 13722 * Kick perf_poll() for is_event_hup(); 13723 */ 13724 perf_event_wakeup(parent_event); 13725 put_event(event); 13726 return; 13727 } 13728 13729 /* 13730 * Parent events are governed by their filedesc, retain them. 13731 */ 13732 perf_event_wakeup(event); 13733 } 13734 13735 static void perf_event_exit_task_context(struct task_struct *child) 13736 { 13737 struct perf_event_context *child_ctx, *clone_ctx = NULL; 13738 struct perf_event *child_event, *next; 13739 13740 WARN_ON_ONCE(child != current); 13741 13742 child_ctx = perf_pin_task_context(child); 13743 if (!child_ctx) 13744 return; 13745 13746 /* 13747 * In order to reduce the amount of tricky in ctx tear-down, we hold 13748 * ctx::mutex over the entire thing. This serializes against almost 13749 * everything that wants to access the ctx. 
13750 * 13751 * The exception is sys_perf_event_open() / 13752 * perf_event_create_kernel_counter() which does find_get_context() 13753 * without ctx::mutex (it cannot, because of the move_group double mutex 13754 * lock). See the comments in perf_install_in_context(). 13755 */ 13756 mutex_lock(&child_ctx->mutex); 13757 13758 /* 13759 * In a single ctx::lock section, de-schedule the events and detach the 13760 * context from the task such that we cannot ever get it scheduled back 13761 * in. 13762 */ 13763 raw_spin_lock_irq(&child_ctx->lock); 13764 task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); 13765 13766 /* 13767 * Now that the context is inactive, destroy the task <-> ctx relation 13768 * and mark the context dead. 13769 */ 13770 RCU_INIT_POINTER(child->perf_event_ctxp, NULL); 13771 put_ctx(child_ctx); /* cannot be last */ 13772 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); 13773 put_task_struct(current); /* cannot be last */ 13774 13775 clone_ctx = unclone_ctx(child_ctx); 13776 raw_spin_unlock_irq(&child_ctx->lock); 13777 13778 if (clone_ctx) 13779 put_ctx(clone_ctx); 13780 13781 /* 13782 * Report the task dead after unscheduling the events so that we 13783 * won't get any samples after PERF_RECORD_EXIT. We can however still 13784 * get a few PERF_RECORD_READ events. 13785 */ 13786 perf_event_task(child, child_ctx, 0); 13787 13788 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) 13789 perf_event_exit_event(child_event, child_ctx); 13790 13791 mutex_unlock(&child_ctx->mutex); 13792 13793 put_ctx(child_ctx); 13794 } 13795 13796 /* 13797 * When a child task exits, feed back event values to parent events. 13798 * 13799 * Can be called with exec_update_lock held when called from 13800 * setup_new_exec(). 13801 */ 13802 void perf_event_exit_task(struct task_struct *child) 13803 { 13804 struct perf_event *event, *tmp; 13805 13806 mutex_lock(&child->perf_event_mutex); 13807 list_for_each_entry_safe(event, tmp, &child->perf_event_list, 13808 owner_entry) { 13809 list_del_init(&event->owner_entry); 13810 13811 /* 13812 * Ensure the list deletion is visible before we clear 13813 * the owner; this closes a race against perf_release() where 13814 * we need to serialize on the owner->perf_event_mutex. 13815 */ 13816 smp_store_release(&event->owner, NULL); 13817 } 13818 mutex_unlock(&child->perf_event_mutex); 13819 13820 perf_event_exit_task_context(child); 13821 13822 /* 13823 * The perf_event_exit_task_context() call above invokes perf_event_task() 13824 * with the child's task_ctx, which generates EXIT events for 13825 * child contexts and sets child->perf_event_ctxp to NULL. 13826 * At this point we need to send EXIT events to cpu contexts. 13827 */ 13828 perf_event_task(child, NULL, 0); 13829 13830 /* 13831 * Detach the perf_ctx_data for the system-wide event.
13832 */ 13833 guard(percpu_read)(&global_ctx_data_rwsem); 13834 detach_task_ctx_data(child); 13835 } 13836 13837 static void perf_free_event(struct perf_event *event, 13838 struct perf_event_context *ctx) 13839 { 13840 struct perf_event *parent = event->parent; 13841 13842 if (WARN_ON_ONCE(!parent)) 13843 return; 13844 13845 mutex_lock(&parent->child_mutex); 13846 list_del_init(&event->child_list); 13847 mutex_unlock(&parent->child_mutex); 13848 13849 raw_spin_lock_irq(&ctx->lock); 13850 perf_group_detach(event); 13851 list_del_event(event, ctx); 13852 raw_spin_unlock_irq(&ctx->lock); 13853 put_event(event); 13854 } 13855 13856 /* 13857 * Free a context as created by inheritance by perf_event_init_task() below, 13858 * used by fork() in case of fail. 13859 * 13860 * Even though the task has never lived, the context and events have been 13861 * exposed through the child_list, so we must take care tearing it all down. 13862 */ 13863 void perf_event_free_task(struct task_struct *task) 13864 { 13865 struct perf_event_context *ctx; 13866 struct perf_event *event, *tmp; 13867 13868 ctx = rcu_access_pointer(task->perf_event_ctxp); 13869 if (!ctx) 13870 return; 13871 13872 mutex_lock(&ctx->mutex); 13873 raw_spin_lock_irq(&ctx->lock); 13874 /* 13875 * Destroy the task <-> ctx relation and mark the context dead. 13876 * 13877 * This is important because even though the task hasn't been 13878 * exposed yet the context has been (through child_list). 13879 */ 13880 RCU_INIT_POINTER(task->perf_event_ctxp, NULL); 13881 WRITE_ONCE(ctx->task, TASK_TOMBSTONE); 13882 put_task_struct(task); /* cannot be last */ 13883 raw_spin_unlock_irq(&ctx->lock); 13884 13885 13886 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) 13887 perf_free_event(event, ctx); 13888 13889 mutex_unlock(&ctx->mutex); 13890 13891 /* 13892 * perf_event_release_kernel() could've stolen some of our 13893 * child events and still have them on its free_list. In that 13894 * case we must wait for these events to have been freed (in 13895 * particular all their references to this task must've been 13896 * dropped). 13897 * 13898 * Without this copy_process() will unconditionally free this 13899 * task (irrespective of its reference count) and 13900 * _free_event()'s put_task_struct(event->hw.target) will be a 13901 * use-after-free. 13902 * 13903 * Wait for all events to drop their context reference. 
13904 */ 13905 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); 13906 put_ctx(ctx); /* must be last */ 13907 } 13908 13909 void perf_event_delayed_put(struct task_struct *task) 13910 { 13911 WARN_ON_ONCE(task->perf_event_ctxp); 13912 } 13913 13914 struct file *perf_event_get(unsigned int fd) 13915 { 13916 struct file *file = fget(fd); 13917 if (!file) 13918 return ERR_PTR(-EBADF); 13919 13920 if (file->f_op != &perf_fops) { 13921 fput(file); 13922 return ERR_PTR(-EBADF); 13923 } 13924 13925 return file; 13926 } 13927 13928 const struct perf_event *perf_get_event(struct file *file) 13929 { 13930 if (file->f_op != &perf_fops) 13931 return ERR_PTR(-EINVAL); 13932 13933 return file->private_data; 13934 } 13935 13936 const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 13937 { 13938 if (!event) 13939 return ERR_PTR(-EINVAL); 13940 13941 return &event->attr; 13942 } 13943 13944 int perf_allow_kernel(void) 13945 { 13946 if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) 13947 return -EACCES; 13948 13949 return security_perf_event_open(PERF_SECURITY_KERNEL); 13950 } 13951 EXPORT_SYMBOL_GPL(perf_allow_kernel); 13952 13953 /* 13954 * Inherit an event from parent task to child task. 13955 * 13956 * Returns: 13957 * - valid pointer on success 13958 * - NULL for orphaned events 13959 * - IS_ERR() on error 13960 */ 13961 static struct perf_event * 13962 inherit_event(struct perf_event *parent_event, 13963 struct task_struct *parent, 13964 struct perf_event_context *parent_ctx, 13965 struct task_struct *child, 13966 struct perf_event *group_leader, 13967 struct perf_event_context *child_ctx) 13968 { 13969 enum perf_event_state parent_state = parent_event->state; 13970 struct perf_event_pmu_context *pmu_ctx; 13971 struct perf_event *child_event; 13972 unsigned long flags; 13973 13974 /* 13975 * Instead of creating recursive hierarchies of events, 13976 * we link inherited events back to the original parent, 13977 * which has a filp for sure, which we use as the reference 13978 * count: 13979 */ 13980 if (parent_event->parent) 13981 parent_event = parent_event->parent; 13982 13983 child_event = perf_event_alloc(&parent_event->attr, 13984 parent_event->cpu, 13985 child, 13986 group_leader, parent_event, 13987 NULL, NULL, -1); 13988 if (IS_ERR(child_event)) 13989 return child_event; 13990 13991 get_ctx(child_ctx); 13992 child_event->ctx = child_ctx; 13993 13994 pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); 13995 if (IS_ERR(pmu_ctx)) { 13996 free_event(child_event); 13997 return ERR_CAST(pmu_ctx); 13998 } 13999 child_event->pmu_ctx = pmu_ctx; 14000 14001 /* 14002 * is_orphaned_event() and list_add_tail(&parent_event->child_list) 14003 * must be under the same lock in order to serialize against 14004 * perf_event_release_kernel(), such that either we must observe 14005 * is_orphaned_event() or they will observe us on the child_list. 14006 */ 14007 mutex_lock(&parent_event->child_mutex); 14008 if (is_orphaned_event(parent_event) || 14009 !atomic_long_inc_not_zero(&parent_event->refcount)) { 14010 mutex_unlock(&parent_event->child_mutex); 14011 free_event(child_event); 14012 return NULL; 14013 } 14014 14015 /* 14016 * Make the child state follow the state of the parent event, 14017 * not its attr.disabled bit. We hold the parent's mutex, 14018 * so we won't race with perf_event_{en, dis}able_family. 
14019 */ 14020 if (parent_state >= PERF_EVENT_STATE_INACTIVE) 14021 child_event->state = PERF_EVENT_STATE_INACTIVE; 14022 else 14023 child_event->state = PERF_EVENT_STATE_OFF; 14024 14025 if (parent_event->attr.freq) { 14026 u64 sample_period = parent_event->hw.sample_period; 14027 struct hw_perf_event *hwc = &child_event->hw; 14028 14029 hwc->sample_period = sample_period; 14030 hwc->last_period = sample_period; 14031 14032 local64_set(&hwc->period_left, sample_period); 14033 } 14034 14035 child_event->overflow_handler = parent_event->overflow_handler; 14036 child_event->overflow_handler_context 14037 = parent_event->overflow_handler_context; 14038 14039 /* 14040 * Precalculate sample_data sizes 14041 */ 14042 perf_event__header_size(child_event); 14043 perf_event__id_header_size(child_event); 14044 14045 /* 14046 * Link it up in the child's context: 14047 */ 14048 raw_spin_lock_irqsave(&child_ctx->lock, flags); 14049 add_event_to_ctx(child_event, child_ctx); 14050 child_event->attach_state |= PERF_ATTACH_CHILD; 14051 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 14052 14053 /* 14054 * Link this into the parent event's child list 14055 */ 14056 list_add_tail(&child_event->child_list, &parent_event->child_list); 14057 mutex_unlock(&parent_event->child_mutex); 14058 14059 return child_event; 14060 } 14061 14062 /* 14063 * Inherits an event group. 14064 * 14065 * This will quietly suppress orphaned events; !inherit_event() is not an error. 14066 * This matches with perf_event_release_kernel() removing all child events. 14067 * 14068 * Returns: 14069 * - 0 on success 14070 * - <0 on error 14071 */ 14072 static int inherit_group(struct perf_event *parent_event, 14073 struct task_struct *parent, 14074 struct perf_event_context *parent_ctx, 14075 struct task_struct *child, 14076 struct perf_event_context *child_ctx) 14077 { 14078 struct perf_event *leader; 14079 struct perf_event *sub; 14080 struct perf_event *child_ctr; 14081 14082 leader = inherit_event(parent_event, parent, parent_ctx, 14083 child, NULL, child_ctx); 14084 if (IS_ERR(leader)) 14085 return PTR_ERR(leader); 14086 /* 14087 * @leader can be NULL here because of is_orphaned_event(). In this 14088 * case inherit_event() will create individual events, similar to what 14089 * perf_group_detach() would do anyway. 14090 */ 14091 for_each_sibling_event(sub, parent_event) { 14092 child_ctr = inherit_event(sub, parent, parent_ctx, 14093 child, leader, child_ctx); 14094 if (IS_ERR(child_ctr)) 14095 return PTR_ERR(child_ctr); 14096 14097 if (sub->aux_event == parent_event && child_ctr && 14098 !perf_get_aux_event(child_ctr, leader)) 14099 return -EINVAL; 14100 } 14101 if (leader) 14102 leader->group_generation = parent_event->group_generation; 14103 return 0; 14104 } 14105 14106 /* 14107 * Creates the child task context and tries to inherit the event-group. 14108 * 14109 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave 14110 * inherited_all set when we 'fail' to inherit an orphaned event; this is 14111 * consistent with perf_event_release_kernel() removing all child events. 
14112 * 14113 * Returns: 14114 * - 0 on success 14115 * - <0 on error 14116 */ 14117 static int 14118 inherit_task_group(struct perf_event *event, struct task_struct *parent, 14119 struct perf_event_context *parent_ctx, 14120 struct task_struct *child, 14121 u64 clone_flags, int *inherited_all) 14122 { 14123 struct perf_event_context *child_ctx; 14124 int ret; 14125 14126 if (!event->attr.inherit || 14127 (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || 14128 /* Do not inherit if sigtrap and signal handlers were cleared. */ 14129 (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { 14130 *inherited_all = 0; 14131 return 0; 14132 } 14133 14134 child_ctx = child->perf_event_ctxp; 14135 if (!child_ctx) { 14136 /* 14137 * This is executed from the parent task context, so 14138 * inherit events that have been marked for cloning. 14139 * First allocate and initialize a context for the 14140 * child. 14141 */ 14142 child_ctx = alloc_perf_context(child); 14143 if (!child_ctx) 14144 return -ENOMEM; 14145 14146 child->perf_event_ctxp = child_ctx; 14147 } 14148 14149 ret = inherit_group(event, parent, parent_ctx, child, child_ctx); 14150 if (ret) 14151 *inherited_all = 0; 14152 14153 return ret; 14154 } 14155 14156 /* 14157 * Initialize the perf_event context in task_struct 14158 */ 14159 static int perf_event_init_context(struct task_struct *child, u64 clone_flags) 14160 { 14161 struct perf_event_context *child_ctx, *parent_ctx; 14162 struct perf_event_context *cloned_ctx; 14163 struct perf_event *event; 14164 struct task_struct *parent = current; 14165 int inherited_all = 1; 14166 unsigned long flags; 14167 int ret = 0; 14168 14169 if (likely(!parent->perf_event_ctxp)) 14170 return 0; 14171 14172 /* 14173 * If the parent's context is a clone, pin it so it won't get 14174 * swapped under us. 14175 */ 14176 parent_ctx = perf_pin_task_context(parent); 14177 if (!parent_ctx) 14178 return 0; 14179 14180 /* 14181 * No need to check if parent_ctx != NULL here; since we saw 14182 * it non-NULL earlier, the only reason for it to become NULL 14183 * is if we exit, and since we're currently in the middle of 14184 * a fork we can't be exiting at the same time. 14185 */ 14186 14187 /* 14188 * Lock the parent list. No need to lock the child - not PID 14189 * hashed yet and not running, so nobody can access it. 14190 */ 14191 mutex_lock(&parent_ctx->mutex); 14192 14193 /* 14194 * We dont have to disable NMIs - we are only looking at 14195 * the list, not manipulating it: 14196 */ 14197 perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { 14198 ret = inherit_task_group(event, parent, parent_ctx, 14199 child, clone_flags, &inherited_all); 14200 if (ret) 14201 goto out_unlock; 14202 } 14203 14204 /* 14205 * We can't hold ctx->lock when iterating the ->flexible_group list due 14206 * to allocations, but we need to prevent rotation because 14207 * rotate_ctx() will change the list from interrupt context. 
14208 */ 14209 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 14210 parent_ctx->rotate_disable = 1; 14211 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 14212 14213 perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { 14214 ret = inherit_task_group(event, parent, parent_ctx, 14215 child, clone_flags, &inherited_all); 14216 if (ret) 14217 goto out_unlock; 14218 } 14219 14220 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 14221 parent_ctx->rotate_disable = 0; 14222 14223 child_ctx = child->perf_event_ctxp; 14224 14225 if (child_ctx && inherited_all) { 14226 /* 14227 * Mark the child context as a clone of the parent 14228 * context, or of whatever the parent is a clone of. 14229 * 14230 * Note that if the parent is a clone, the holding of 14231 * parent_ctx->lock avoids it from being uncloned. 14232 */ 14233 cloned_ctx = parent_ctx->parent_ctx; 14234 if (cloned_ctx) { 14235 child_ctx->parent_ctx = cloned_ctx; 14236 child_ctx->parent_gen = parent_ctx->parent_gen; 14237 } else { 14238 child_ctx->parent_ctx = parent_ctx; 14239 child_ctx->parent_gen = parent_ctx->generation; 14240 } 14241 get_ctx(child_ctx->parent_ctx); 14242 } 14243 14244 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 14245 out_unlock: 14246 mutex_unlock(&parent_ctx->mutex); 14247 14248 perf_unpin_context(parent_ctx); 14249 put_ctx(parent_ctx); 14250 14251 return ret; 14252 } 14253 14254 /* 14255 * Initialize the perf_event context in task_struct 14256 */ 14257 int perf_event_init_task(struct task_struct *child, u64 clone_flags) 14258 { 14259 int ret; 14260 14261 memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); 14262 child->perf_event_ctxp = NULL; 14263 mutex_init(&child->perf_event_mutex); 14264 INIT_LIST_HEAD(&child->perf_event_list); 14265 child->perf_ctx_data = NULL; 14266 14267 ret = perf_event_init_context(child, clone_flags); 14268 if (ret) { 14269 perf_event_free_task(child); 14270 return ret; 14271 } 14272 14273 return 0; 14274 } 14275 14276 static void __init perf_event_init_all_cpus(void) 14277 { 14278 struct swevent_htable *swhash; 14279 struct perf_cpu_context *cpuctx; 14280 int cpu; 14281 14282 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); 14283 zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); 14284 zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); 14285 zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); 14286 zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); 14287 zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); 14288 14289 14290 for_each_possible_cpu(cpu) { 14291 swhash = &per_cpu(swevent_htable, cpu); 14292 mutex_init(&swhash->hlist_mutex); 14293 14294 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); 14295 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); 14296 14297 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); 14298 14299 cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); 14300 __perf_event_init_context(&cpuctx->ctx); 14301 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 14302 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 14303 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); 14304 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); 14305 cpuctx->heap = cpuctx->heap_default; 14306 } 14307 } 14308 14309 static void perf_swevent_init_cpu(unsigned int cpu) 14310 { 14311 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 14312 14313 mutex_lock(&swhash->hlist_mutex); 14314 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { 14315 struct swevent_hlist *hlist; 14316 14317 hlist = 
kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); 14318 WARN_ON(!hlist); 14319 rcu_assign_pointer(swhash->swevent_hlist, hlist); 14320 } 14321 mutex_unlock(&swhash->hlist_mutex); 14322 } 14323 14324 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 14325 static void __perf_event_exit_context(void *__info) 14326 { 14327 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 14328 struct perf_event_context *ctx = __info; 14329 struct perf_event *event; 14330 14331 raw_spin_lock(&ctx->lock); 14332 ctx_sched_out(ctx, NULL, EVENT_TIME); 14333 list_for_each_entry(event, &ctx->event_list, event_entry) 14334 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); 14335 raw_spin_unlock(&ctx->lock); 14336 } 14337 14338 static void perf_event_clear_cpumask(unsigned int cpu) 14339 { 14340 int target[PERF_PMU_MAX_SCOPE]; 14341 unsigned int scope; 14342 struct pmu *pmu; 14343 14344 cpumask_clear_cpu(cpu, perf_online_mask); 14345 14346 for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { 14347 const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); 14348 struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); 14349 14350 target[scope] = -1; 14351 if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) 14352 continue; 14353 14354 if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) 14355 continue; 14356 target[scope] = cpumask_any_but(cpumask, cpu); 14357 if (target[scope] < nr_cpu_ids) 14358 cpumask_set_cpu(target[scope], pmu_cpumask); 14359 } 14360 14361 /* migrate */ 14362 list_for_each_entry(pmu, &pmus, entry) { 14363 if (pmu->scope == PERF_PMU_SCOPE_NONE || 14364 WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) 14365 continue; 14366 14367 if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) 14368 perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); 14369 } 14370 } 14371 14372 static void perf_event_exit_cpu_context(int cpu) 14373 { 14374 struct perf_cpu_context *cpuctx; 14375 struct perf_event_context *ctx; 14376 14377 // XXX simplify cpuctx->online 14378 mutex_lock(&pmus_lock); 14379 /* 14380 * Clear the cpumasks, and migrate to other CPUs if possible. 14381 * Must be invoked before the __perf_event_exit_context. 14382 */ 14383 perf_event_clear_cpumask(cpu); 14384 cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); 14385 ctx = &cpuctx->ctx; 14386 14387 mutex_lock(&ctx->mutex); 14388 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); 14389 cpuctx->online = 0; 14390 mutex_unlock(&ctx->mutex); 14391 mutex_unlock(&pmus_lock); 14392 } 14393 #else 14394 14395 static void perf_event_exit_cpu_context(int cpu) { } 14396 14397 #endif 14398 14399 static void perf_event_setup_cpumask(unsigned int cpu) 14400 { 14401 struct cpumask *pmu_cpumask; 14402 unsigned int scope; 14403 14404 /* 14405 * Early boot stage, the cpumask hasn't been set yet. 14406 * The perf_online_<domain>_masks includes the first CPU of each domain. 14407 * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. 

static void perf_event_setup_cpumask(unsigned int cpu)
{
	struct cpumask *pmu_cpumask;
	unsigned int scope;

	/*
	 * Early boot stage: the cpumask hasn't been set yet.
	 * The perf_online_<domain>_masks include the first CPU of each domain.
	 * Unconditionally set the boot CPU in the perf_online_<domain>_masks.
	 */
	if (cpumask_empty(perf_online_mask)) {
		for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
			pmu_cpumask = perf_scope_cpumask(scope);
			if (WARN_ON_ONCE(!pmu_cpumask))
				continue;
			cpumask_set_cpu(cpu, pmu_cpumask);
		}
		goto end;
	}

	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);

		pmu_cpumask = perf_scope_cpumask(scope);

		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
			continue;

		if (!cpumask_empty(cpumask) &&
		    cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
			cpumask_set_cpu(cpu, pmu_cpumask);
	}
end:
	cpumask_set_cpu(cpu, perf_online_mask);
}

int perf_event_init_cpu(unsigned int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_event_context *ctx;

	perf_swevent_init_cpu(cpu);

	mutex_lock(&pmus_lock);
	perf_event_setup_cpumask(cpu);
	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
	ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	cpuctx->online = 1;
	mutex_unlock(&ctx->mutex);
	mutex_unlock(&pmus_lock);

	return 0;
}

int perf_event_exit_cpu(unsigned int cpu)
{
	perf_event_exit_cpu_context(cpu);
	return 0;
}

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};
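
/*
 * Illustrative sketch only (not part of this file's logic): once
 * perf_event_init() below has registered the software PMUs, in-kernel users
 * can open a counter on them with perf_event_create_kernel_counter() and
 * read it with perf_event_read_value().  The "example_"-prefixed helper is
 * hypothetical.
 */
static u64 __maybe_unused example_read_cpu_clock(int cpu)
{
	struct perf_event *event;
	u64 value, enabled, running;
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CPU_CLOCK,
		.size	= sizeof(attr),
	};

	/* Count on @cpu only: no task, no overflow handler. */
	event = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
	if (IS_ERR(event))
		return 0;

	value = perf_event_read_value(event, &enabled, &running);
	perf_event_release_kernel(event);

	return value;
}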

void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
	perf_pmu_register(&perf_task_clock, "task_clock", -1);
	perf_tp_register();
	perf_event_init_cpu(smp_processor_id());
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

	perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);

	/*
	 * Build-time assertion that we keep the data_head at the intended
	 * location.  IOW, validation that we got the __reserved[] size right.
	 */
	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
		     != 1024);
}

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (pmu->dev)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

	free_percpu(jc->info);
	kfree(jc);
}

static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
	perf_event_cgroup(css->cgroup);
	return 0;
}

static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;

	preempt_disable();
	perf_cgroup_switch(task);
	preempt_enable();

	return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset)
		task_function_call(task, __perf_cgroup_move, task);
}

struct cgroup_subsys perf_event_cgrp_subsys = {
	.css_alloc = perf_cgroup_css_alloc,
	.css_free = perf_cgroup_css_free,
	.css_online = perf_cgroup_css_online,
	.attach = perf_cgroup_attach,
	/*
	 * Implicitly enable on the default hierarchy so that perf events can
	 * always be filtered by cgroup2 path as long as the perf_event
	 * controller is not mounted on a legacy hierarchy.
	 */
	.implicit_on_dfl = true,
	.threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */

DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
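
/*
 * Illustrative sketch only (not part of this file's logic): an architecture
 * PMU driver with branch-stack snapshot support would plug its routine into
 * the static call defined above with static_call_update(); callers then go
 * through static_call(perf_snapshot_branch_stack)(entries, cnt).  The
 * "example_"-prefixed names are hypothetical.
 */
static int example_snapshot_branch_stack(struct perf_branch_entry *entries,
					 unsigned int cnt)
{
	/* Fill at most @cnt entries and return the number captured. */
	return 0;
}

static void __maybe_unused example_install_snapshot(void)
{
	static_call_update(perf_snapshot_branch_stack,
			   example_snapshot_branch_stack);
}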