// SPDX-License-Identifier: GPL-2.0
/*
 * Arm Statistical Profiling Extensions (SPE) support
 * Copyright (c) 2017-2018, Arm Ltd.
 */

#include <byteswap.h>
#include <endian.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/types.h>
#include <linux/zalloc.h>
#include <stdlib.h>
#include <unistd.h>

#include "auxtrace.h"
#include "color.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "machine.h"
#include "session.h"
#include "symbol.h"
#include "thread.h"
#include "thread-stack.h"
#include "tsc.h"
#include "tool.h"
#include "util/synthetic-events.h"

#include "arm-spe.h"
#include "arm-spe-decoder/arm-spe-decoder.h"
#include "arm-spe-decoder/arm-spe-pkt-decoder.h"

#include "../../arch/arm64/include/asm/cputype.h"

#define MAX_TIMESTAMP (~0ULL)

#define is_ldst_op(op)		(!!((op) & ARM_SPE_OP_LDST))

struct arm_spe {
	struct auxtrace			auxtrace;
	struct auxtrace_queues		queues;
	struct auxtrace_heap		heap;
	struct itrace_synth_opts	synth_opts;
	u32				auxtrace_type;
	struct perf_session		*session;
	struct machine			*machine;
	u32				pmu_type;

	struct perf_tsc_conversion	tc;

	u8				timeless_decoding;
	u8				data_queued;

	u64				sample_type;
	u8				sample_flc;
	u8				sample_llc;
	u8				sample_tlb;
	u8				sample_branch;
	u8				sample_remote_access;
	u8				sample_memory;
	u8				sample_instructions;
	u64				instructions_sample_period;

	u64				l1d_miss_id;
	u64				l1d_access_id;
	u64				llc_miss_id;
	u64				llc_access_id;
	u64				tlb_miss_id;
	u64				tlb_access_id;
	u64				branch_id;
	u64				remote_access_id;
	u64				memory_id;
	u64				instructions_id;

	u64				kernel_start;

	unsigned long			num_events;
	u8				use_ctx_pkt_for_pid;

	u64				**metadata;
	u64				metadata_ver;
	u64				metadata_nr_cpu;
	bool				is_homogeneous;
};

struct arm_spe_queue {
	struct arm_spe			*spe;
	unsigned int			queue_nr;
	struct auxtrace_buffer		*buffer;
	struct auxtrace_buffer		*old_buffer;
	union perf_event		*event_buf;
	bool				on_heap;
	bool				done;
	pid_t				pid;
	pid_t				tid;
	int				cpu;
	struct arm_spe_decoder		*decoder;
	u64				time;
	u64				timestamp;
	struct thread			*thread;
	u64				period_instructions;
	u32				flags;
	struct branch_stack		*last_branch;
};

struct data_source_handle {
	const struct midr_range *midr_ranges;
	void (*ds_synth)(const struct arm_spe_record *record,
			 union perf_mem_data_src *data_src);
};

#define DS(range, func)					\
	{						\
		.midr_ranges = range,			\
		.ds_synth = arm_spe__synth_##func,	\
	}

static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
			 unsigned char *buf, size_t len)
{
	struct arm_spe_pkt packet;
	size_t pos = 0;
	int ret, pkt_len, i;
	char desc[ARM_SPE_PKT_DESC_MAX];
	const char *color = PERF_COLOR_BLUE;

	color_fprintf(stdout, color,
		      ". ... ARM SPE data: size %#zx bytes\n",
		      len);

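	/*
	 * Decode one packet at a time; when a packet cannot be parsed,
	 * consume a single byte so the decoder can resynchronize on the
	 * following iteration.
	 */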
	while (len) {
		ret = arm_spe_get_packet(buf, len, &packet);
		if (ret > 0)
			pkt_len = ret;
		else
			pkt_len = 1;
		printf(".");
		color_fprintf(stdout, color, "  %08zx: ", pos);
		for (i = 0; i < pkt_len; i++)
			color_fprintf(stdout, color, " %02x", buf[i]);
		for (; i < 16; i++)
			color_fprintf(stdout, color, "   ");
		if (ret > 0) {
			ret = arm_spe_pkt_desc(&packet, desc,
					       ARM_SPE_PKT_DESC_MAX);
			if (!ret)
				color_fprintf(stdout, color, " %s\n", desc);
		} else {
			color_fprintf(stdout, color, " Bad packet!\n");
		}
		pos += pkt_len;
		buf += pkt_len;
		len -= pkt_len;
	}
}

static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
			       size_t len)
{
	printf(".\n");
	arm_spe_dump(spe, buf, len);
}

static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
{
	struct arm_spe_queue *speq = data;
	struct auxtrace_buffer *buffer = speq->buffer;
	struct auxtrace_buffer *old_buffer = speq->old_buffer;
	struct auxtrace_queue *queue;

	queue = &speq->spe->queues.queue_array[speq->queue_nr];

	buffer = auxtrace_buffer__next(queue, buffer);
	/* If no more data, drop the previous auxtrace_buffer and return */
	if (!buffer) {
		if (old_buffer)
			auxtrace_buffer__drop_data(old_buffer);
		b->len = 0;
		return 0;
	}

	speq->buffer = buffer;

	/* If the aux_buffer doesn't have data associated, try to load it */
	if (!buffer->data) {
		/* get the file desc associated with the perf data file */
		int fd = perf_data__fd(speq->spe->session->data);

		buffer->data = auxtrace_buffer__get_data(buffer, fd);
		if (!buffer->data)
			return -ENOMEM;
	}

	b->len = buffer->size;
	b->buf = buffer->data;

	if (b->len) {
		if (old_buffer)
			auxtrace_buffer__drop_data(old_buffer);
		speq->old_buffer = buffer;
	} else {
		auxtrace_buffer__drop_data(buffer);
		return arm_spe_get_trace(b, data);
	}

	return 0;
}

static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
						  unsigned int queue_nr)
{
	struct arm_spe_params params = { .get_trace = 0, };
	struct arm_spe_queue *speq;

	speq = zalloc(sizeof(*speq));
	if (!speq)
		return NULL;

	speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
	if (!speq->event_buf)
		goto out_free;

	speq->spe = spe;
	speq->queue_nr = queue_nr;
	speq->pid = -1;
	speq->tid = -1;
	speq->cpu = -1;
	speq->period_instructions = 0;

	/* params set */
	params.get_trace = arm_spe_get_trace;
	params.data = speq;

	if (spe->synth_opts.last_branch) {
		size_t sz = sizeof(struct branch_stack);

		/* Allocate up to two entries for PBT + TGT */
		sz += sizeof(struct branch_entry) *
		      min(spe->synth_opts.last_branch_sz, 2U);
		speq->last_branch = zalloc(sz);
		if (!speq->last_branch)
			goto out_free;
	}

	/* create new decoder */
	speq->decoder = arm_spe_decoder_new(&params);
	if (!speq->decoder)
		goto out_free;

	return speq;

out_free:
	zfree(&speq->event_buf);
	zfree(&speq->last_branch);
	free(speq);

	return NULL;
}

static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
{
	return ip >= spe->kernel_start ?
		PERF_RECORD_MISC_KERNEL :
		PERF_RECORD_MISC_USER;
}

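/*
 * Resolve the pid/tid for a queue: prefer the per-CPU tid tracked in the
 * machine state (kept up to date by CONTEXT packets or context-switch
 * events), and fall back to the tid the queue was created with.
 */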
static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
				    struct auxtrace_queue *queue)
{
	struct arm_spe_queue *speq = queue->priv;
	pid_t tid;

	tid = machine__get_current_tid(spe->machine, speq->cpu);
	if (tid != -1) {
		speq->tid = tid;
		thread__zput(speq->thread);
	} else
		speq->tid = queue->tid;

	if ((!speq->thread) && (speq->tid != -1)) {
		speq->thread = machine__find_thread(spe->machine, -1,
						    speq->tid);
	}

	if (speq->thread) {
		speq->pid = thread__pid(speq->thread);
		if (queue->cpu == -1)
			speq->cpu = thread__cpu(speq->thread);
	}
}

static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
{
	struct arm_spe *spe = speq->spe;
	int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);

	if (err)
		return err;

	arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);

	return 0;
}

static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu)
{
	u64 i;

	if (!spe->metadata)
		return NULL;

	for (i = 0; i < spe->metadata_nr_cpu; i++)
		if (spe->metadata[i][ARM_SPE_CPU] == cpu)
			return spe->metadata[i];

	return NULL;
}

static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
{
	struct simd_flags simd_flags = {};

	if ((record->op & ARM_SPE_OP_LDST) && (record->op & ARM_SPE_OP_SVE_LDST))
		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;

	if ((record->op & ARM_SPE_OP_OTHER) && (record->op & ARM_SPE_OP_SVE_OTHER))
		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;

	if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
		simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;

	if (record->type & ARM_SPE_SVE_EMPTY_PRED)
		simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;

	return simd_flags;
}

static void arm_spe_prep_sample(struct arm_spe *spe,
				struct arm_spe_queue *speq,
				union perf_event *event,
				struct perf_sample *sample)
{
	struct arm_spe_record *record = &speq->decoder->record;

	if (!spe->timeless_decoding)
		sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);

	sample->ip = record->from_ip;
	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
	sample->pid = speq->pid;
	sample->tid = speq->tid;
	sample->period = 1;
	sample->cpu = speq->cpu;
	sample->simd_flags = arm_spe__synth_simd_flags(record);

	event->sample.header.type = PERF_RECORD_SAMPLE;
	event->sample.header.misc = sample->cpumode;
	event->sample.header.size = sizeof(struct perf_event_header);
}

static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	struct branch_stack *bstack = speq->last_branch;
	struct branch_flags *bs_flags;
	unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
	bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
	bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
	size_t sz = sizeof(struct branch_stack) +
		    sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
	int i = 0;

	/* Clean up branch stack */
	memset(bstack, 0x0, sz);

	if (!have_tgt && !have_pbt)
		return;

	if (have_tgt) {
		bstack->entries[i].from = record->from_ip;
		bstack->entries[i].to = record->to_ip;

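		/*
		 * Map the SPE operation and event types onto perf branch
		 * flags: branch kind, misprediction, not-taken and
		 * transaction state, plus the sampled branch latency.
		 */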
		bs_flags = &bstack->entries[i].flags;
		bs_flags->value = 0;

		if (record->op & ARM_SPE_OP_BR_CR_BL) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND_CALL;
			else
				bs_flags->type |= PERF_BR_CALL;
		/*
		 * Indirect branch instruction without link (e.g. BR),
		 * treat this case as a function return.
		 */
		} else if (record->op & ARM_SPE_OP_BR_CR_RET ||
			   record->op & ARM_SPE_OP_BR_INDIRECT) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND_RET;
			else
				bs_flags->type |= PERF_BR_RET;
		} else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND;
			else
				bs_flags->type |= PERF_BR_UNCOND;
		} else {
			if (record->op & ARM_SPE_OP_BR_COND)
				bs_flags->type |= PERF_BR_COND;
			else
				bs_flags->type |= PERF_BR_UNKNOWN;
		}

		if (record->type & ARM_SPE_BRANCH_MISS) {
			bs_flags->mispred = 1;
			bs_flags->predicted = 0;
		} else {
			bs_flags->mispred = 0;
			bs_flags->predicted = 1;
		}

		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
			bs_flags->not_taken = 1;

		if (record->type & ARM_SPE_IN_TXN)
			bs_flags->in_tx = 1;

		bs_flags->cycles = min(record->latency, 0xFFFFU);
		i++;
	}

	if (have_pbt) {
		bs_flags = &bstack->entries[i].flags;
		bs_flags->type |= PERF_BR_UNKNOWN;
		bstack->entries[i].to = record->prev_br_tgt;
		i++;
	}

	bstack->nr = i;
	bstack->hw_idx = -1ULL;
}

static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
{
	event->header.size = perf_event__sample_event_size(sample, type, 0);
	return perf_event__synthesize_sample(event, type, 0, sample);
}

static inline int
arm_spe_deliver_synth_event(struct arm_spe *spe,
			    struct arm_spe_queue *speq __maybe_unused,
			    union perf_event *event,
			    struct perf_sample *sample)
{
	int ret;

	if (spe->synth_opts.inject) {
		ret = arm_spe__inject_event(event, sample, spe->sample_type);
		if (ret)
			return ret;
	}

	ret = perf_session__deliver_synth_event(spe->session, event, sample);
	if (ret)
		pr_err("ARM SPE: failed to deliver event, error %d\n", ret);

	return ret;
}

static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
				     u64 spe_events_id, u64 data_src)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->virt_addr;
	sample.phys_addr = record->phys_addr;
	sample.data_src = data_src;
	sample.weight = record->latency;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}

static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
					u64 spe_events_id)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->to_ip;
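	/* For branch samples, addr carries the branch target address. */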
	sample.weight = record->latency;
	sample.flags = speq->flags;
	sample.branch_stack = speq->last_branch;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}

static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
					     u64 spe_events_id, u64 data_src)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record = &speq->decoder->record;
	union perf_event *event = speq->event_buf;
	struct perf_sample sample;
	int ret;

	/*
	 * Handles perf instruction sampling period.
	 */
	speq->period_instructions++;
	if (speq->period_instructions < spe->instructions_sample_period)
		return 0;
	speq->period_instructions = 0;

	perf_sample__init(&sample, /*all=*/true);
	arm_spe_prep_sample(spe, speq, event, &sample);

	sample.id = spe_events_id;
	sample.stream_id = spe_events_id;
	sample.addr = record->to_ip;
	sample.phys_addr = record->phys_addr;
	sample.data_src = data_src;
	sample.period = spe->instructions_sample_period;
	sample.weight = record->latency;
	sample.flags = speq->flags;
	sample.branch_stack = speq->last_branch;

	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
	perf_sample__exit(&sample);
	return ret;
}

static const struct midr_range common_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
	MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
	{},
};

static const struct midr_range ampereone_ds_encoding_cpus[] = {
	MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
	{},
};

static void arm_spe__sample_flags(struct arm_spe_queue *speq)
{
	const struct arm_spe_record *record = &speq->decoder->record;

	speq->flags = 0;
	if (record->op & ARM_SPE_OP_BRANCH_ERET) {
		speq->flags = PERF_IP_FLAG_BRANCH;

		if (record->type & ARM_SPE_BRANCH_MISS)
			speq->flags |= PERF_IP_FLAG_BRANCH_MISS;

		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
			speq->flags |= PERF_IP_FLAG_NOT_TAKEN;

		if (record->type & ARM_SPE_IN_TXN)
			speq->flags |= PERF_IP_FLAG_IN_TX;

		if (record->op & ARM_SPE_OP_BR_COND)
			speq->flags |= PERF_IP_FLAG_CONDITIONAL;

		if (record->op & ARM_SPE_OP_BR_CR_BL)
			speq->flags |= PERF_IP_FLAG_CALL;
		else if (record->op & ARM_SPE_OP_BR_CR_RET)
			speq->flags |= PERF_IP_FLAG_RETURN;
		/*
		 * Indirect branch instruction without link (e.g. BR),
		 * take it as a function return.
		 */
		else if (record->op & ARM_SPE_OP_BR_INDIRECT)
			speq->flags |= PERF_IP_FLAG_RETURN;
	}
}

static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
					      union perf_mem_data_src *data_src)
{
	/*
	 * Even though four levels of cache hierarchy are possible, no known
	 * production Neoverse systems currently include more than three
	 * levels, so for the time being we assume three exist. If a
	 * production system is built with four, then this function would
	 * have to be changed to detect the number of levels for reporting.
	 */

	/*
	 * We have no data on the hit level or data source for stores in the
	 * Neoverse SPE records.
	 */
	if (record->op & ARM_SPE_OP_ST) {
		data_src->mem_lvl = PERF_MEM_LVL_NA;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
		return;
	}

	switch (record->source) {
	case ARM_SPE_COMMON_DS_L1D:
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_COMMON_DS_L2:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	case ARM_SPE_COMMON_DS_PEER_CORE:
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	/*
	 * We don't know if this hit in L1 or L2, but we do know it was a
	 * cache-to-cache transfer, so set SNOOPX_PEER.
	 */
	case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
	case ARM_SPE_COMMON_DS_PEER_CLUSTER:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	/*
	 * System cache is assumed to be L3.
	 */
	case ARM_SPE_COMMON_DS_SYS_CACHE:
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
		break;
	/*
	 * We don't know what level it hit in, except that it came from the
	 * other socket.
	 */
	case ARM_SPE_COMMON_DS_REMOTE:
		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
		break;
	case ARM_SPE_COMMON_DS_DRAM:
		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		break;
	default:
		break;
	}
}

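/*
 * Example of the mapping above: a load whose data came from a peer core's
 * cache (ARM_SPE_COMMON_DS_PEER_CORE) is reported as an L2 hit with
 * PERF_MEM_SNOOPX_PEER set, so "perf mem report" attributes it to a
 * cache-to-cache transfer rather than a plain local L2 hit.
 */
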
/*
 * Source is IMPDEF. Here we convert the source code used on AmpereOne cores
 * to the common (Neoverse, Cortex) encoding to avoid duplicating the
 * decoding code.
 */
static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
						 union perf_mem_data_src *data_src)
{
	struct arm_spe_record common_record;

	switch (record->source) {
	case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
		common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
		break;
	case ARM_SPE_AMPEREONE_SLC:
		common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
		break;
	case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
		common_record.source = ARM_SPE_COMMON_DS_REMOTE;
		break;
	case ARM_SPE_AMPEREONE_DDR:
		common_record.source = ARM_SPE_COMMON_DS_DRAM;
		break;
	case ARM_SPE_AMPEREONE_L1D:
		common_record.source = ARM_SPE_COMMON_DS_L1D;
		break;
	case ARM_SPE_AMPEREONE_L2D:
		common_record.source = ARM_SPE_COMMON_DS_L2;
		break;
	default:
		pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
				record->source);
		return;
	}

	common_record.op = record->op;
	arm_spe__synth_data_source_common(&common_record, data_src);
}

static const struct data_source_handle data_source_handles[] = {
	DS(common_ds_encoding_cpus, data_source_common),
	DS(ampereone_ds_encoding_cpus, data_source_ampereone),
};

static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
					union perf_mem_data_src *data_src)
{
	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
		data_src->mem_lvl = PERF_MEM_LVL_L3;

		if (record->type & ARM_SPE_LLC_MISS)
			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
		else
			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
		data_src->mem_lvl = PERF_MEM_LVL_L1;

		if (record->type & ARM_SPE_L1D_MISS)
			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
		else
			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
	}

	if (record->type & ARM_SPE_REMOTE_ACCESS)
		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
}

static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
			      const struct arm_spe_record *record,
			      union perf_mem_data_src *data_src)
{
	struct arm_spe *spe = speq->spe;
	u64 *metadata = NULL;
	u64 midr;
	unsigned int i;

	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
	if (spe->metadata_ver == 1) {
		const char *cpuid;

		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
		cpuid = perf_env__cpuid(spe->session->evlist->env);
		midr = strtol(cpuid, NULL, 16);
	} else {
		/* CPU ID is -1 for per-thread mode */
		if (speq->cpu < 0) {
			/*
			 * On a heterogeneous system the CPU ID is -1, so we
			 * cannot confirm that the data source packet is
			 * supported.
			 */
			if (!spe->is_homogeneous)
				return false;

			/* In homogeneous system, simply use CPU0's metadata */
			if (spe->metadata)
				metadata = spe->metadata[0];
		} else {
			metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
		}

		if (!metadata)
			return false;

		midr = metadata[ARM_SPE_CPU_MIDR];
	}

	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
			data_source_handles[i].ds_synth(record, data_src);
			return true;
		}
	}

	return false;
}

static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
				      const struct arm_spe_record *record)
{
	union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA };

	/* Only synthesize data source for LDST operations */
	if (!is_ldst_op(record->op))
		return 0;

	if (record->op & ARM_SPE_OP_LD)
		data_src.mem_op = PERF_MEM_OP_LOAD;
	else if (record->op & ARM_SPE_OP_ST)
		data_src.mem_op = PERF_MEM_OP_STORE;
	else
		return 0;

	if (!arm_spe__synth_ds(speq, record, &data_src))
		arm_spe__synth_memory_level(record, &data_src);

	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
		data_src.mem_dtlb = PERF_MEM_TLB_WK;

		if (record->type & ARM_SPE_TLB_MISS)
			data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
		else
			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
	}

	return data_src.val;
}

static int arm_spe_sample(struct arm_spe_queue *speq)
{
	const struct arm_spe_record *record = &speq->decoder->record;
	struct arm_spe *spe = speq->spe;
	u64 data_src;
	int err;

	arm_spe__sample_flags(speq);
	data_src = arm_spe__synth_data_source(speq, record);

	if (spe->sample_flc) {
		if (record->type & ARM_SPE_L1D_MISS) {
			err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
							data_src);
			if (err)
				return err;
		}

		if (record->type & ARM_SPE_L1D_ACCESS) {
			err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
							data_src);
			if (err)
				return err;
		}
	}

	if (spe->sample_llc) {
		if (record->type & ARM_SPE_LLC_MISS) {
			err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
							data_src);
			if (err)
				return err;
		}

		if (record->type & ARM_SPE_LLC_ACCESS) {
			err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
							data_src);
			if (err)
				return err;
		}
	}

	if (spe->sample_tlb) {
		if (record->type & ARM_SPE_TLB_MISS) {
			err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
							data_src);
			if (err)
				return err;
		}

		if (record->type & ARM_SPE_TLB_ACCESS) {
			err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
							data_src);
			if (err)
				return err;
		}
	}

	if (spe->synth_opts.last_branch &&
	    (spe->sample_branch || spe->sample_instructions))
		arm_spe__prep_branch_stack(speq);

	if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
		err = arm_spe__synth_branch_sample(speq, spe->branch_id);
		if (err)
			return err;
	}

	if (spe->sample_remote_access &&
	    (record->type & ARM_SPE_REMOTE_ACCESS)) {
		err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
						data_src);
		if (err)
			return err;
	}

	/*
	 * When data_src is zero it means the record is not a memory
	 * operation, so skip synthesizing a memory sample in that case.
	 */
	if (spe->sample_memory && is_ldst_op(record->op)) {
		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
		if (err)
			return err;
	}

	if (spe->sample_instructions) {
		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
		if (err)
			return err;
	}

	return 0;
}

static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record;
	int ret;

	if (!spe->kernel_start)
		spe->kernel_start = machine__kernel_start(spe->machine);

	while (1) {
		/*
		 * The usual logic is first to decode the packets and then,
		 * based on the record, to synthesize a sample; but here the
		 * flow is reversed: arm_spe_sample() is called to synthesize
		 * samples prior to arm_spe_decode().
		 *
		 * Two reasons for this code logic:
		 * 1. When the queue is set up in arm_spe__setup_queue(), it
		 * has decoded trace data and generated a record, but the
		 * record is left to generate a sample until the flow reaches
		 * here, so it is correct to synthesize a sample for the
		 * leftover record.
		 * 2. After decoding trace data, the record timestamp must be
		 * compared with the coming perf event; if the record
		 * timestamp is later than the perf event, the flow needs to
		 * bail out and push the record into the auxtrace heap, so
		 * synthesizing a sample for the record is deferred until the
		 * next time this function runs; this correlates samples
		 * between Arm SPE trace data and other perf events with
		 * correct time ordering.
		 */

		/*
		 * Update pid/tid info.
		 */
		record = &speq->decoder->record;
		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
			ret = arm_spe_set_tid(speq, record->context_id);
			if (ret)
				return ret;

			spe->use_ctx_pkt_for_pid = true;
		}

		ret = arm_spe_sample(speq);
		if (ret)
			return ret;

		ret = arm_spe_decode(speq->decoder);
		if (!ret) {
			pr_debug("No data or all data has been processed.\n");
			return 1;
		}

		/*
		 * If an error is detected while decoding SPE trace data,
		 * continue to the next trace data and try to find more
		 * records.
		 */
		if (ret < 0)
			continue;

		record = &speq->decoder->record;

		/* Update timestamp for the last record */
		if (record->timestamp > speq->timestamp)
			speq->timestamp = record->timestamp;

		/*
		 * If the timestamp of the queue is later than the timestamp
		 * of the coming perf event, bail out to allow the perf event
		 * to be processed first.
		 */
		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
			*timestamp = speq->timestamp;
			return 0;
		}
	}

	return 0;
}

static int arm_spe__setup_queue(struct arm_spe *spe,
				struct auxtrace_queue *queue,
				unsigned int queue_nr)
{
	struct arm_spe_queue *speq = queue->priv;
	struct arm_spe_record *record;

	if (list_empty(&queue->head) || speq)
		return 0;

	speq = arm_spe__alloc_queue(spe, queue_nr);

	if (!speq)
		return -ENOMEM;

	queue->priv = speq;

	if (queue->cpu != -1)
		speq->cpu = queue->cpu;

	if (!speq->on_heap) {
		int ret;

		if (spe->timeless_decoding)
			return 0;

retry:
		ret = arm_spe_decode(speq->decoder);

		if (!ret)
			return 0;

		if (ret < 0)
			goto retry;

		record = &speq->decoder->record;

		speq->timestamp = record->timestamp;
		ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
		if (ret)
			return ret;
		speq->on_heap = true;
	}

	return 0;
}

static int arm_spe__setup_queues(struct arm_spe *spe)
{
	unsigned int i;
	int ret;

	for (i = 0; i < spe->queues.nr_queues; i++) {
		ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
		if (ret)
			return ret;
	}

	return 0;
}

static int arm_spe__update_queues(struct arm_spe *spe)
{
	if (spe->queues.new_data) {
		spe->queues.new_data = false;
		return arm_spe__setup_queues(spe);
	}

	return 0;
}

static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
{
	struct evsel *evsel;
	struct evlist *evlist = spe->session->evlist;
	bool timeless_decoding = true;

	/*
	 * Cycle through the list of events and check whether any has the
	 * time bit set; decoding is timeless only if none does.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
			timeless_decoding = false;
	}

	return timeless_decoding;
}

static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
{
	unsigned int queue_nr;
	u64 ts;
	int ret;

	while (1) {
		struct auxtrace_queue *queue;
		struct arm_spe_queue *speq;

		if (!spe->heap.heap_cnt)
			return 0;

		if (spe->heap.heap_array[0].ordinal >= timestamp)
			return 0;

		queue_nr = spe->heap.heap_array[0].queue_nr;
		queue = &spe->queues.queue_array[queue_nr];
		speq = queue->priv;

		auxtrace_heap__pop(&spe->heap);

		if (spe->heap.heap_cnt) {
			ts = spe->heap.heap_array[0].ordinal + 1;
			if (ts > timestamp)
				ts = timestamp;
		} else {
			ts = timestamp;
		}

		/*
		 * A previous context-switch event has set pid/tid in the
		 * machine's context, so here we need to update the pid/tid
		 * in the thread and SPE queue.
		 */
		if (!spe->use_ctx_pkt_for_pid)
			arm_spe_set_pid_tid_cpu(spe, queue);

		ret = arm_spe_run_decoder(speq, &ts);
		if (ret < 0) {
			auxtrace_heap__add(&spe->heap, queue_nr, ts);
			return ret;
		}

		if (!ret) {
			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
			if (ret < 0)
				return ret;
		} else {
			speq->on_heap = false;
		}
	}

	return 0;
}

static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
					   u64 time_)
{
	struct auxtrace_queues *queues = &spe->queues;
	unsigned int i;
	u64 ts = 0;

	for (i = 0; i < queues->nr_queues; i++) {
		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
		struct arm_spe_queue *speq = queue->priv;

		if (speq && (tid == -1 || speq->tid == tid)) {
			speq->time = time_;
			arm_spe_set_pid_tid_cpu(spe, queue);
			arm_spe_run_decoder(speq, &ts);
		}
	}
	return 0;
}

static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
				  struct perf_sample *sample)
{
	pid_t pid, tid;
	int cpu;

	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
		return 0;

	pid = event->context_switch.next_prev_pid;
	tid = event->context_switch.next_prev_tid;
	cpu = sample->cpu;

	if (tid == -1)
		pr_warning("context_switch event has no tid\n");

	return machine__set_current_tid(spe->machine, cpu, pid, tid);
}

static int arm_spe_process_event(struct perf_session *session,
				 union perf_event *event,
				 struct perf_sample *sample,
				 const struct perf_tool *tool)
{
	int err = 0;
	u64 timestamp;
	struct arm_spe *spe = container_of(session->auxtrace,
					   struct arm_spe, auxtrace);

	if (dump_trace)
		return 0;

	if (!tool->ordered_events) {
		pr_err("SPE trace requires ordered events\n");
		return -EINVAL;
	}

	if (sample->time && (sample->time != (u64) -1))
		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
	else
		timestamp = 0;

	if (timestamp || spe->timeless_decoding) {
		err = arm_spe__update_queues(spe);
		if (err)
			return err;
	}

	if (spe->timeless_decoding) {
		if (event->header.type == PERF_RECORD_EXIT) {
			err = arm_spe_process_timeless_queues(spe,
					event->fork.tid,
					sample->time);
		}
	} else if (timestamp) {
		err = arm_spe_process_queues(spe, timestamp);
		if (err)
			return err;

		if (!spe->use_ctx_pkt_for_pid &&
		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
		     event->header.type == PERF_RECORD_SWITCH))
			err = arm_spe_context_switch(spe, event, sample);
	}

	return err;
}

static int arm_spe_process_auxtrace_event(struct perf_session *session,
					  union perf_event *event,
					  const struct perf_tool *tool __maybe_unused)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);

	if (!spe->data_queued) {
		struct auxtrace_buffer *buffer;
		off_t data_offset;
		int fd = perf_data__fd(session->data);
		int err;

		if (perf_data__is_pipe(session->data)) {
			data_offset = 0;
		} else {
			data_offset = lseek(fd, 0, SEEK_CUR);
			if (data_offset == -1)
				return -errno;
		}

		err = auxtrace_queues__add_event(&spe->queues, session, event,
						 data_offset, &buffer);
		if (err)
			return err;

		/* Dump here now we have copied a piped trace out of the pipe */
		if (dump_trace) {
			if (auxtrace_buffer__get_data(buffer, fd)) {
				arm_spe_dump_event(spe, buffer->data,
						   buffer->size);
				auxtrace_buffer__put_data(buffer);
			}
		}
	}

	return 0;
}

static int arm_spe_flush(struct perf_session *session __maybe_unused,
			 const struct perf_tool *tool __maybe_unused)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);
	int ret;

	if (dump_trace)
		return 0;

	if (!tool->ordered_events)
		return -EINVAL;

	ret = arm_spe__update_queues(spe);
	if (ret < 0)
		return ret;

	if (spe->timeless_decoding)
		return arm_spe_process_timeless_queues(spe, -1,
						       MAX_TIMESTAMP - 1);

	ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
	if (ret)
		return ret;

	if (!spe->use_ctx_pkt_for_pid)
		ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
			    "Matching of TIDs to SPE events could be inaccurate.\n");

	return 0;
}

static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
{
	u64 *metadata;

	metadata = zalloc(per_cpu_size);
	if (!metadata)
		return NULL;

	memcpy(metadata, buf, per_cpu_size);
	return metadata;
}

static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
{
	int i;

	for (i = 0; i < nr_cpu; i++)
		zfree(&metadata[i]);
	free(metadata);
}

static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
				     u64 *ver, int *nr_cpu)
{
	u64 *ptr = (u64 *)info->priv;
	u64 metadata_size;
	u64 **metadata = NULL;
	int hdr_sz, per_cpu_sz, i;

	metadata_size = info->header.size -
		sizeof(struct perf_record_auxtrace_info);

	/* Metadata version 1 */
	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
		*ver = 1;
		*nr_cpu = 0;
		/* No per CPU metadata */
		return NULL;
	}

	*ver = ptr[ARM_SPE_HEADER_VERSION];
	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];

	metadata = calloc(*nr_cpu, sizeof(*metadata));
	if (!metadata)
		return NULL;

	/* Locate the start address of per CPU metadata */
	ptr += hdr_sz;
	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);

	for (i = 0; i < *nr_cpu; i++) {
		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
		if (!metadata[i])
			goto err_per_cpu_metadata;

		ptr += per_cpu_sz / sizeof(u64);
	}

	return metadata;

err_per_cpu_metadata:
	arm_spe__free_metadata(metadata, *nr_cpu);
	return NULL;
}

static void arm_spe_free_queue(void *priv)
{
	struct arm_spe_queue *speq = priv;

	if (!speq)
		return;
	thread__zput(speq->thread);
	arm_spe_decoder_free(speq->decoder);
	zfree(&speq->event_buf);
	zfree(&speq->last_branch);
	free(speq);
}

static void arm_spe_free_events(struct perf_session *session)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);
	struct auxtrace_queues *queues = &spe->queues;
	unsigned int i;

	for (i = 0; i < queues->nr_queues; i++) {
		arm_spe_free_queue(queues->queue_array[i].priv);
		queues->queue_array[i].priv = NULL;
	}
	auxtrace_queues__free(queues);
}

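/*
 * Release everything owned by the auxtrace instance: the ordering heap,
 * the per-queue state, the per-CPU metadata table and the arm_spe
 * structure itself.
 */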
static void arm_spe_free(struct perf_session *session)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					   auxtrace);

	auxtrace_heap__free(&spe->heap);
	arm_spe_free_events(session);
	session->auxtrace = NULL;
	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
	free(spe);
}

static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
				      struct evsel *evsel)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);

	return evsel->core.attr.type == spe->pmu_type;
}

static const char * const metadata_hdr_v1_fmts[] = {
	[ARM_SPE_PMU_TYPE]		= "  PMU Type           :%"PRId64"\n",
	[ARM_SPE_PER_CPU_MMAPS]		= "  Per CPU mmaps      :%"PRId64"\n",
};

static const char * const metadata_hdr_fmts[] = {
	[ARM_SPE_HEADER_VERSION]	= "  Header version     :%"PRId64"\n",
	[ARM_SPE_HEADER_SIZE]		= "  Header size        :%"PRId64"\n",
	[ARM_SPE_PMU_TYPE_V2]		= "  PMU type v2        :%"PRId64"\n",
	[ARM_SPE_CPUS_NUM]		= "  CPU number         :%"PRId64"\n",
};

static const char * const metadata_per_cpu_fmts[] = {
	[ARM_SPE_MAGIC]			= "    Magic            :0x%"PRIx64"\n",
	[ARM_SPE_CPU]			= "    CPU #            :%"PRId64"\n",
	[ARM_SPE_CPU_NR_PARAMS]		= "    Num of params    :%"PRId64"\n",
	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
};

static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
{
	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
	const char * const *hdr_fmts;

	if (!dump_trace)
		return;

	if (spe->metadata_ver == 1) {
		cpu_num = 0;
		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
		hdr_fmts = metadata_hdr_v1_fmts;
	} else {
		cpu_num = arr[ARM_SPE_CPUS_NUM];
		hdr_size = arr[ARM_SPE_HEADER_SIZE];
		hdr_fmts = metadata_hdr_fmts;
	}

	for (i = 0; i < hdr_size; i++)
		fprintf(stdout, hdr_fmts[i], arr[i]);

	arr += hdr_size;
	for (cpu = 0; cpu < cpu_num; cpu++) {
		/*
		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
		 * are fixed. The number of trailing parameters is given by
		 * the field 'ARM_SPE_CPU_NR_PARAMS'.
		 */
		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
		for (i = 0; i < cpu_size; i++)
			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
		arr += cpu_size;
	}
}

static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
				   const char *name)
{
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.id && evsel->core.id[0] == id) {
			if (evsel->name)
				zfree(&evsel->name);
			evsel->name = strdup(name);
			break;
		}
	}
}

static int
arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
{
	struct evlist *evlist = session->evlist;
	struct evsel *evsel;
	struct perf_event_attr attr;
	bool found = false;
	u64 id;
	int err;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.attr.type == spe->pmu_type) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_debug("No selected events with SPE trace data\n");
		return 0;
	}

	memset(&attr, 0, sizeof(struct perf_event_attr));
	attr.size = sizeof(struct perf_event_attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.sample_type = evsel->core.attr.sample_type &
				(PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
			    PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
	if (spe->timeless_decoding)
		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
	else
		attr.sample_type |= PERF_SAMPLE_TIME;

	spe->sample_type = attr.sample_type;

	attr.exclude_user = evsel->core.attr.exclude_user;
	attr.exclude_kernel = evsel->core.attr.exclude_kernel;
	attr.exclude_hv = evsel->core.attr.exclude_hv;
	attr.exclude_host = evsel->core.attr.exclude_host;
	attr.exclude_guest = evsel->core.attr.exclude_guest;
	attr.sample_id_all = evsel->core.attr.sample_id_all;
	attr.read_format = evsel->core.attr.read_format;

	/* create new id val to be a fixed offset from evsel id */
	id = evsel->core.id[0] + 1000000000;

	if (!id)
		id = 1;

	if (spe->synth_opts.flc) {
		spe->sample_flc = true;

		/* Level 1 data cache miss */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->l1d_miss_id = id;
		arm_spe_set_event_name(evlist, id, "l1d-miss");
		id += 1;

		/* Level 1 data cache access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->l1d_access_id = id;
		arm_spe_set_event_name(evlist, id, "l1d-access");
		id += 1;
	}

	if (spe->synth_opts.llc) {
		spe->sample_llc = true;

		/* Last level cache miss */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->llc_miss_id = id;
		arm_spe_set_event_name(evlist, id, "llc-miss");
		id += 1;

		/* Last level cache access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->llc_access_id = id;
		arm_spe_set_event_name(evlist, id, "llc-access");
		id += 1;
	}

	if (spe->synth_opts.tlb) {
		spe->sample_tlb = true;

		/* TLB miss */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->tlb_miss_id = id;
		arm_spe_set_event_name(evlist, id, "tlb-miss");
		id += 1;

		/* TLB access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->tlb_access_id = id;
		arm_spe_set_event_name(evlist, id, "tlb-access");
		id += 1;
	}

	if (spe->synth_opts.last_branch) {
		if (spe->synth_opts.last_branch_sz > 2)
			pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");

		attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
		/*
		 * We don't use the hardware index, but the sample generation
		 * code uses the new format branch_stack with this field,
		 * so the event attributes must indicate that it's present.
		 */
		attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
	}

	if (spe->synth_opts.branches) {
		spe->sample_branch = true;

		/* Branch */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->branch_id = id;
		arm_spe_set_event_name(evlist, id, "branch");
		id += 1;
	}

	if (spe->synth_opts.remote_access) {
		spe->sample_remote_access = true;

		/* Remote access */
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->remote_access_id = id;
		arm_spe_set_event_name(evlist, id, "remote-access");
		id += 1;
	}

	if (spe->synth_opts.mem) {
		spe->sample_memory = true;

		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->memory_id = id;
		arm_spe_set_event_name(evlist, id, "memory");
		id += 1;
	}

	if (spe->synth_opts.instructions) {
		if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
			pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n");
			goto synth_instructions_out;
		}
		if (spe->synth_opts.period > 1)
			pr_warning("Arm SPE has a hardware-based sample period.\n"
				   "Additional instruction events will be discarded by --itrace\n");

		spe->sample_instructions = true;
		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
		attr.sample_period = spe->synth_opts.period;
		spe->instructions_sample_period = attr.sample_period;
		err = perf_session__deliver_synth_attr_event(session, &attr, id);
		if (err)
			return err;
		spe->instructions_id = id;
		arm_spe_set_event_name(evlist, id, "instructions");
	}
synth_instructions_out:

	return 0;
}

static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
{
	u64 midr;
	int i;

	if (!nr_cpu)
		return false;

	for (i = 0; i < nr_cpu; i++) {
		if (!metadata[i])
			return false;

		if (i == 0) {
			midr = metadata[i][ARM_SPE_CPU_MIDR];
			continue;
		}

		if (midr != metadata[i][ARM_SPE_CPU_MIDR])
			return false;
	}

	return true;
}

int arm_spe_process_auxtrace_info(union perf_event *event,
				  struct perf_session *session)
{
	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
	size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
	struct perf_record_time_conv *tc = &session->time_conv;
	struct arm_spe *spe;
	u64 **metadata = NULL;
	u64 metadata_ver;
	int nr_cpu, err;

	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
	    min_sz)
		return -EINVAL;

	metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
					   &nr_cpu);
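	/*
	 * A NULL table is only an error for version 2+ metadata; version 1
	 * files carry no per-CPU table at all.
	 */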
	if (!metadata && metadata_ver != 1) {
		pr_err("Failed to parse Arm SPE metadata.\n");
		return -EINVAL;
	}

	spe = zalloc(sizeof(struct arm_spe));
	if (!spe) {
		err = -ENOMEM;
		goto err_free_metadata;
	}

	err = auxtrace_queues__init(&spe->queues);
	if (err)
		goto err_free;

	spe->session = session;
	spe->machine = &session->machines.host; /* No kvm support */
	spe->auxtrace_type = auxtrace_info->type;
	if (metadata_ver == 1)
		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
	else
		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
	spe->metadata = metadata;
	spe->metadata_ver = metadata_ver;
	spe->metadata_nr_cpu = nr_cpu;
	spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);

	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);

	/*
	 * The synthesized event PERF_RECORD_TIME_CONV has been handled
	 * ahead of time, and the parameters for the hardware clock are
	 * stored in the session context. Pass these parameters to the
	 * struct perf_tsc_conversion in "spe->tc", which is used for later
	 * conversion between clock counter and timestamp.
	 *
	 * For backward compatibility, copy the fields starting from
	 * "time_cycles" only if they are contained in the event.
	 */
	spe->tc.time_shift = tc->time_shift;
	spe->tc.time_mult = tc->time_mult;
	spe->tc.time_zero = tc->time_zero;

	if (event_contains(*tc, time_cycles)) {
		spe->tc.time_cycles = tc->time_cycles;
		spe->tc.time_mask = tc->time_mask;
		spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
		spe->tc.cap_user_time_short = tc->cap_user_time_short;
	}

	spe->auxtrace.process_event = arm_spe_process_event;
	spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
	spe->auxtrace.flush_events = arm_spe_flush;
	spe->auxtrace.free_events = arm_spe_free_events;
	spe->auxtrace.free = arm_spe_free;
	spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
	session->auxtrace = &spe->auxtrace;

	arm_spe_print_info(spe, &auxtrace_info->priv[0]);

	if (dump_trace)
		return 0;

	if (session->itrace_synth_opts && session->itrace_synth_opts->set)
		spe->synth_opts = *session->itrace_synth_opts;
	else
		itrace_synth_opts__set_default(&spe->synth_opts, false);

	err = arm_spe_synth_events(spe, session);
	if (err)
		goto err_free_queues;

	err = auxtrace_queues__process_index(&spe->queues, session);
	if (err)
		goto err_free_queues;

	if (spe->queues.populated)
		spe->data_queued = true;

	return 0;

err_free_queues:
	auxtrace_queues__free(&spe->queues);
	session->auxtrace = NULL;
err_free:
	free(spe);
err_free_metadata:
	arm_spe__free_metadata(metadata, nr_cpu);
	return err;
}