1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * intel_tpebs.c: Intel TPEBS support 4 */ 5 6 #include <api/fs/fs.h> 7 #include <sys/param.h> 8 #include <subcmd/run-command.h> 9 #include <thread.h> 10 #include "intel-tpebs.h" 11 #include <linux/list.h> 12 #include <linux/zalloc.h> 13 #include <linux/err.h> 14 #include "sample.h" 15 #include "counts.h" 16 #include "debug.h" 17 #include "evlist.h" 18 #include "evsel.h" 19 #include "mutex.h" 20 #include "session.h" 21 #include "stat.h" 22 #include "tool.h" 23 #include "cpumap.h" 24 #include "metricgroup.h" 25 #include "stat.h" 26 #include <sys/stat.h> 27 #include <sys/file.h> 28 #include <poll.h> 29 #include <math.h> 30 31 #define PERF_DATA "-" 32 33 bool tpebs_recording; 34 enum tpebs_mode tpebs_mode; 35 static LIST_HEAD(tpebs_results); 36 static pthread_t tpebs_reader_thread; 37 static struct child_process tpebs_cmd; 38 static int control_fd[2], ack_fd[2]; 39 static struct mutex tpebs_mtx; 40 41 struct tpebs_retire_lat { 42 struct list_head nd; 43 /** @evsel: The evsel that opened the retire_lat event. */ 44 struct evsel *evsel; 45 /** @event: Event passed to perf record. */ 46 char *event; 47 /** @stats: Recorded retirement latency stats. */ 48 struct stats stats; 49 /** @last: Last retirement latency read. */ 50 uint64_t last; 51 /* Has the event been sent to perf record? */ 52 bool started; 53 }; 54 55 static void tpebs_mtx_init(void) 56 { 57 mutex_init(&tpebs_mtx); 58 } 59 60 static struct mutex *tpebs_mtx_get(void) 61 { 62 static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT; 63 64 pthread_once(&tpebs_mtx_once, tpebs_mtx_init); 65 return &tpebs_mtx; 66 } 67 68 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 69 EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()); 70 71 static int evsel__tpebs_start_perf_record(struct evsel *evsel) 72 { 73 const char **record_argv; 74 int tpebs_event_size = 0, i = 0, ret; 75 char control_fd_buf[32]; 76 char cpumap_buf[50]; 77 struct tpebs_retire_lat *t; 78 79 list_for_each_entry(t, &tpebs_results, nd) 80 tpebs_event_size++; 81 82 record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv)); 83 if (!record_argv) 84 return -ENOMEM; 85 86 record_argv[i++] = "perf"; 87 record_argv[i++] = "record"; 88 record_argv[i++] = "-W"; 89 record_argv[i++] = "--synth=no"; 90 91 scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d", 92 control_fd[0], ack_fd[1]); 93 record_argv[i++] = control_fd_buf; 94 95 record_argv[i++] = "-o"; 96 record_argv[i++] = PERF_DATA; 97 98 if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) { 99 cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf, 100 sizeof(cpumap_buf)); 101 record_argv[i++] = "-C"; 102 record_argv[i++] = cpumap_buf; 103 } 104 105 list_for_each_entry(t, &tpebs_results, nd) { 106 record_argv[i++] = "-e"; 107 record_argv[i++] = t->event; 108 } 109 record_argv[i++] = NULL; 110 assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size); 111 /* Note, no workload given so system wide is implied. */ 112 113 assert(tpebs_cmd.pid == 0); 114 tpebs_cmd.argv = record_argv; 115 tpebs_cmd.out = -1; 116 ret = start_command(&tpebs_cmd); 117 zfree(&tpebs_cmd.argv); 118 list_for_each_entry(t, &tpebs_results, nd) 119 t->started = true; 120 121 return ret; 122 } 123 124 static bool is_child_pid(pid_t parent, pid_t child) 125 { 126 if (parent < 0 || child < 0) 127 return false; 128 129 while (true) { 130 char path[PATH_MAX]; 131 char line[256]; 132 FILE *fp; 133 134 new_child: 135 if (parent == child) 136 return true; 137 138 if (child <= 0) 139 return false; 140 141 scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child); 142 fp = fopen(path, "r"); 143 if (!fp) { 144 /* Presumably the process went away. Assume not a child. */ 145 return false; 146 } 147 while (fgets(line, sizeof(line), fp) != NULL) { 148 if (strncmp(line, "PPid:", 5) == 0) { 149 fclose(fp); 150 if (sscanf(line + 5, "%d", &child) != 1) { 151 /* Unexpected error parsing. */ 152 return false; 153 } 154 goto new_child; 155 } 156 } 157 /* Unexpected EOF. */ 158 fclose(fp); 159 return false; 160 } 161 } 162 163 static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t) 164 { 165 pid_t workload_pid, sample_pid = sample->pid; 166 167 /* 168 * During evlist__purge the evlist will be removed prior to the 169 * evsel__exit calling evsel__tpebs_close and taking the 170 * tpebs_mtx. Avoid a segfault by ignoring samples in this case. 171 */ 172 if (t->evsel->evlist == NULL) 173 return true; 174 175 workload_pid = t->evsel->evlist->workload.pid; 176 if (workload_pid < 0 || workload_pid == sample_pid) 177 return false; 178 179 if (!t->evsel->core.attr.inherit) 180 return true; 181 182 return !is_child_pid(workload_pid, sample_pid); 183 } 184 185 static int process_sample_event(const struct perf_tool *tool __maybe_unused, 186 union perf_event *event __maybe_unused, 187 struct perf_sample *sample, 188 struct evsel *evsel, 189 struct machine *machine __maybe_unused) 190 { 191 struct tpebs_retire_lat *t; 192 193 mutex_lock(tpebs_mtx_get()); 194 if (tpebs_cmd.pid == 0) { 195 /* Record has terminated. */ 196 mutex_unlock(tpebs_mtx_get()); 197 return 0; 198 } 199 t = tpebs_retire_lat__find(evsel); 200 if (!t) { 201 mutex_unlock(tpebs_mtx_get()); 202 return -EINVAL; 203 } 204 if (should_ignore_sample(sample, t)) { 205 mutex_unlock(tpebs_mtx_get()); 206 return 0; 207 } 208 /* 209 * Need to handle per core results? We are assuming average retire 210 * latency value will be used. Save the number of samples and the sum of 211 * retire latency value for each event. 212 */ 213 t->last = sample->retire_lat; 214 update_stats(&t->stats, sample->retire_lat); 215 mutex_unlock(tpebs_mtx_get()); 216 return 0; 217 } 218 219 static int process_feature_event(struct perf_session *session, 220 union perf_event *event) 221 { 222 if (event->feat.feat_id < HEADER_LAST_FEATURE) 223 return perf_event__process_feature(session, event); 224 return 0; 225 } 226 227 static void *__sample_reader(void *arg __maybe_unused) 228 { 229 struct perf_session *session; 230 struct perf_data data = { 231 .mode = PERF_DATA_MODE_READ, 232 .path = PERF_DATA, 233 .file.fd = tpebs_cmd.out, 234 }; 235 struct perf_tool tool; 236 237 perf_tool__init(&tool, /*ordered_events=*/false); 238 tool.sample = process_sample_event; 239 tool.feature = process_feature_event; 240 tool.attr = perf_event__process_attr; 241 242 session = perf_session__new(&data, &tool); 243 if (IS_ERR(session)) 244 return NULL; 245 perf_session__process_events(session); 246 perf_session__delete(session); 247 248 return NULL; 249 } 250 251 static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 252 { 253 struct pollfd pollfd = { .events = POLLIN, }; 254 int ret, len, retries = 0; 255 char ack_buf[8]; 256 257 /* Check if the command exited before the send, done with the lock held. */ 258 if (tpebs_cmd.pid == 0) 259 return 0; 260 261 /* 262 * Let go of the lock while sending/receiving as blocking can starve the 263 * sample reading thread. 264 */ 265 mutex_unlock(tpebs_mtx_get()); 266 267 /* Send perf record command.*/ 268 len = strlen(msg); 269 ret = write(control_fd[1], msg, len); 270 if (ret != len) { 271 pr_err("perf record control write control message '%s' failed\n", msg); 272 ret = -EPIPE; 273 goto out; 274 } 275 276 if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) { 277 ret = 0; 278 goto out; 279 } 280 281 /* Wait for an ack. */ 282 pollfd.fd = ack_fd[0]; 283 284 /* 285 * We need this poll to ensure the ack_fd PIPE will not hang 286 * when perf record failed for any reason. The timeout value 287 * 3000ms is an empirical selection. 288 */ 289 again: 290 if (!poll(&pollfd, 1, 500)) { 291 if (check_if_command_finished(&tpebs_cmd)) { 292 ret = 0; 293 goto out; 294 } 295 296 if (retries++ < 6) 297 goto again; 298 pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg); 299 ret = -ETIMEDOUT; 300 goto out; 301 } 302 303 if (!(pollfd.revents & POLLIN)) { 304 if (check_if_command_finished(&tpebs_cmd)) { 305 ret = 0; 306 goto out; 307 } 308 309 pr_err("tpebs failed: did not received an ack for '%s'\n", msg); 310 ret = -EPIPE; 311 goto out; 312 } 313 314 ret = read(ack_fd[0], ack_buf, sizeof(ack_buf)); 315 if (ret > 0) 316 ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG); 317 else 318 pr_err("tpebs: perf record control ack failed\n"); 319 out: 320 /* Re-take lock as expected by caller. */ 321 mutex_lock(tpebs_mtx_get()); 322 return ret; 323 } 324 325 /* 326 * tpebs_stop - stop the sample data read thread and the perf record process. 327 */ 328 static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get()) 329 { 330 int ret = 0; 331 332 /* Like tpebs_start, we should only run tpebs_end once. */ 333 if (tpebs_cmd.pid != 0) { 334 tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG); 335 tpebs_cmd.pid = 0; 336 mutex_unlock(tpebs_mtx_get()); 337 pthread_join(tpebs_reader_thread, NULL); 338 mutex_lock(tpebs_mtx_get()); 339 close(control_fd[0]); 340 close(control_fd[1]); 341 close(ack_fd[0]); 342 close(ack_fd[1]); 343 close(tpebs_cmd.out); 344 ret = finish_command(&tpebs_cmd); 345 tpebs_cmd.pid = 0; 346 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL) 347 ret = 0; 348 } 349 return ret; 350 } 351 352 /** 353 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`. 354 */ 355 static int evsel__tpebs_event(struct evsel *evsel, char **event) 356 { 357 char *name, *modifier; 358 int ret; 359 360 name = strdup(evsel->name); 361 if (!name) 362 return -ENOMEM; 363 364 modifier = strrchr(name, 'R'); 365 if (!modifier) { 366 ret = -EINVAL; 367 goto out; 368 } 369 *modifier = 'p'; 370 modifier = strchr(name, ':'); 371 if (!modifier) 372 modifier = strrchr(name, '/'); 373 if (!modifier) { 374 ret = -EINVAL; 375 goto out; 376 } 377 *modifier = '\0'; 378 if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0) 379 ret = 0; 380 else 381 ret = -ENOMEM; 382 out: 383 if (ret) 384 pr_err("Tpebs event modifier broken '%s'\n", evsel->name); 385 free(name); 386 return ret; 387 } 388 389 static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel) 390 { 391 struct tpebs_retire_lat *result = zalloc(sizeof(*result)); 392 int ret; 393 394 if (!result) 395 return NULL; 396 397 ret = evsel__tpebs_event(evsel, &result->event); 398 if (ret) { 399 free(result); 400 return NULL; 401 } 402 result->evsel = evsel; 403 return result; 404 } 405 406 static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r) 407 { 408 zfree(&r->event); 409 free(r); 410 } 411 412 static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel) 413 { 414 struct tpebs_retire_lat *t; 415 unsigned long num; 416 const char *evsel_name; 417 418 /* 419 * Evsels will match for evlist with the retirement latency event. The 420 * name with "tpebs_event_" prefix will be present on events being read 421 * from `perf record`. 422 */ 423 if (evsel__is_retire_lat(evsel)) { 424 list_for_each_entry(t, &tpebs_results, nd) { 425 if (t->evsel == evsel) 426 return t; 427 } 428 return NULL; 429 } 430 evsel_name = strstr(evsel->name, "tpebs_event_"); 431 if (!evsel_name) { 432 /* Unexpected that the perf record should have other events. */ 433 return NULL; 434 } 435 errno = 0; 436 num = strtoull(evsel_name + 12, NULL, 16); 437 if (errno) { 438 pr_err("Bad evsel for tpebs find '%s'\n", evsel->name); 439 return NULL; 440 } 441 list_for_each_entry(t, &tpebs_results, nd) { 442 if ((unsigned long)t->evsel == num) 443 return t; 444 } 445 return NULL; 446 } 447 448 /** 449 * evsel__tpebs_prepare - create tpebs data structures ready for opening. 450 * @evsel: retire_latency evsel, all evsels on its list will be prepared. 451 */ 452 static int evsel__tpebs_prepare(struct evsel *evsel) 453 { 454 struct evsel *pos; 455 struct tpebs_retire_lat *tpebs_event; 456 457 mutex_lock(tpebs_mtx_get()); 458 tpebs_event = tpebs_retire_lat__find(evsel); 459 if (tpebs_event) { 460 /* evsel, or an identically named one, was already prepared. */ 461 mutex_unlock(tpebs_mtx_get()); 462 return 0; 463 } 464 tpebs_event = tpebs_retire_lat__new(evsel); 465 if (!tpebs_event) { 466 mutex_unlock(tpebs_mtx_get()); 467 return -ENOMEM; 468 } 469 list_add_tail(&tpebs_event->nd, &tpebs_results); 470 mutex_unlock(tpebs_mtx_get()); 471 472 /* 473 * Eagerly prepare all other evsels on the list to try to ensure that by 474 * open they are all known. 475 */ 476 evlist__for_each_entry(evsel->evlist, pos) { 477 int ret; 478 479 if (pos == evsel || !pos->retire_lat) 480 continue; 481 482 ret = evsel__tpebs_prepare(pos); 483 if (ret) 484 return ret; 485 } 486 return 0; 487 } 488 489 /** 490 * evsel__tpebs_open - starts tpebs execution. 491 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each 492 * evsel is sampled to get the average retire_latency value. 493 */ 494 int evsel__tpebs_open(struct evsel *evsel) 495 { 496 int ret; 497 bool tpebs_empty; 498 499 /* We should only run tpebs_start when tpebs_recording is enabled. */ 500 if (!tpebs_recording) 501 return 0; 502 /* Only start the events once. */ 503 if (tpebs_cmd.pid != 0) { 504 struct tpebs_retire_lat *t; 505 bool valid; 506 507 mutex_lock(tpebs_mtx_get()); 508 t = tpebs_retire_lat__find(evsel); 509 valid = t && t->started; 510 mutex_unlock(tpebs_mtx_get()); 511 /* May fail as the event wasn't started. */ 512 return valid ? 0 : -EBUSY; 513 } 514 515 ret = evsel__tpebs_prepare(evsel); 516 if (ret) 517 return ret; 518 519 mutex_lock(tpebs_mtx_get()); 520 tpebs_empty = list_empty(&tpebs_results); 521 if (!tpebs_empty) { 522 /*Create control and ack fd for --control*/ 523 if (pipe(control_fd) < 0) { 524 pr_err("tpebs: Failed to create control fifo"); 525 ret = -1; 526 goto out; 527 } 528 if (pipe(ack_fd) < 0) { 529 pr_err("tpebs: Failed to create control fifo"); 530 ret = -1; 531 goto out; 532 } 533 534 ret = evsel__tpebs_start_perf_record(evsel); 535 if (ret) 536 goto out; 537 538 if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader, 539 /*arg=*/NULL)) { 540 kill(tpebs_cmd.pid, SIGTERM); 541 close(tpebs_cmd.out); 542 pr_err("Could not create thread to process sample data.\n"); 543 ret = -1; 544 goto out; 545 } 546 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG); 547 } 548 out: 549 if (ret) { 550 struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel); 551 552 list_del_init(&t->nd); 553 tpebs_retire_lat__delete(t); 554 } 555 mutex_unlock(tpebs_mtx_get()); 556 return ret; 557 } 558 559 int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread) 560 { 561 struct perf_counts_values *count, *old_count = NULL; 562 struct tpebs_retire_lat *t; 563 uint64_t val; 564 int ret; 565 566 /* Only set retire_latency value to the first CPU and thread. */ 567 if (cpu_map_idx != 0 || thread != 0) 568 return 0; 569 570 if (evsel->prev_raw_counts) 571 old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread); 572 573 count = perf_counts(evsel->counts, cpu_map_idx, thread); 574 575 mutex_lock(tpebs_mtx_get()); 576 t = tpebs_retire_lat__find(evsel); 577 /* 578 * If reading the first tpebs result, send a ping to the record 579 * process. Allow the sample reader a chance to read by releasing and 580 * reacquiring the lock. 581 */ 582 if (t && &t->nd == tpebs_results.next) { 583 ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG); 584 mutex_unlock(tpebs_mtx_get()); 585 if (ret) 586 return ret; 587 mutex_lock(tpebs_mtx_get()); 588 } 589 if (t == NULL || t->stats.n == 0) { 590 /* No sample data, use default. */ 591 if (tpebs_recording) { 592 pr_warning_once( 593 "Using precomputed retirement latency data as no samples\n"); 594 } 595 val = 0; 596 switch (tpebs_mode) { 597 case TPEBS_MODE__MIN: 598 val = rint(evsel->retirement_latency.min); 599 break; 600 case TPEBS_MODE__MAX: 601 val = rint(evsel->retirement_latency.max); 602 break; 603 default: 604 case TPEBS_MODE__LAST: 605 case TPEBS_MODE__MEAN: 606 val = rint(evsel->retirement_latency.mean); 607 break; 608 } 609 } else { 610 switch (tpebs_mode) { 611 case TPEBS_MODE__MIN: 612 val = t->stats.min; 613 break; 614 case TPEBS_MODE__MAX: 615 val = t->stats.max; 616 break; 617 case TPEBS_MODE__LAST: 618 val = t->last; 619 break; 620 default: 621 case TPEBS_MODE__MEAN: 622 val = rint(t->stats.mean); 623 break; 624 } 625 } 626 mutex_unlock(tpebs_mtx_get()); 627 628 if (old_count) { 629 count->val = old_count->val + val; 630 count->run = old_count->run + 1; 631 count->ena = old_count->ena + 1; 632 } else { 633 count->val = val; 634 count->run++; 635 count->ena++; 636 } 637 return 0; 638 } 639 640 /** 641 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the 642 * created thread and process by calling tpebs_stop(). 643 * 644 * This function is called in evsel__close() to be symmetric with 645 * evsel__tpebs_open() being called in evsel__open(). 646 */ 647 void evsel__tpebs_close(struct evsel *evsel) 648 { 649 struct tpebs_retire_lat *t; 650 651 mutex_lock(tpebs_mtx_get()); 652 t = tpebs_retire_lat__find(evsel); 653 if (t) { 654 list_del_init(&t->nd); 655 tpebs_retire_lat__delete(t); 656 657 if (list_empty(&tpebs_results)) 658 tpebs_stop(); 659 } 660 mutex_unlock(tpebs_mtx_get()); 661 } 662