// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_tpebs.c: Intel TPEBS support
 */

#include <api/fs/fs.h>
#include <sys/param.h>
#include <subcmd/run-command.h>
#include <thread.h>
#include "intel-tpebs.h"
#include <linux/list.h>
#include <linux/zalloc.h>
#include <linux/err.h>
#include "sample.h"
#include "counts.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "mutex.h"
#include "session.h"
#include "stat.h"
#include "tool.h"
#include "cpumap.h"
#include "metricgroup.h"
#include <sys/stat.h>
#include <sys/file.h>
#include <poll.h>
#include <math.h>

#define PERF_DATA "-"

bool tpebs_recording;
enum tpebs_mode tpebs_mode;
static LIST_HEAD(tpebs_results);
static pthread_t tpebs_reader_thread;
static struct child_process tpebs_cmd;
static int control_fd[2], ack_fd[2];
static struct mutex tpebs_mtx;

struct tpebs_retire_lat {
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/** @started: Has the event been sent to perf record? */
	bool started;
};

static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}

static struct mutex *tpebs_mtx_get(void)
{
	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;

	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
	return &tpebs_mtx;
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());

static int evsel__tpebs_start_perf_record(struct evsel *evsel)
{
	const char **record_argv;
	int tpebs_event_size = 0, i = 0, ret;
	char control_fd_buf[32];
	char cpumap_buf[50];
	struct tpebs_retire_lat *t;

	list_for_each_entry(t, &tpebs_results, nd)
		tpebs_event_size++;

	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
	if (!record_argv)
		return -ENOMEM;

	record_argv[i++] = "perf";
	record_argv[i++] = "record";
	record_argv[i++] = "-W";
	record_argv[i++] = "--synth=no";

	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
		  control_fd[0], ack_fd[1]);
	record_argv[i++] = control_fd_buf;

	record_argv[i++] = "-o";
	record_argv[i++] = PERF_DATA;

	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
				 sizeof(cpumap_buf));
		record_argv[i++] = "-C";
		record_argv[i++] = cpumap_buf;
	}

	list_for_each_entry(t, &tpebs_results, nd) {
		record_argv[i++] = "-e";
		record_argv[i++] = t->event;
	}
	record_argv[i++] = NULL;
	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
	/* Note, no workload given so system wide is implied. */

	assert(tpebs_cmd.pid == 0);
	tpebs_cmd.argv = record_argv;
	tpebs_cmd.out = -1;
	ret = start_command(&tpebs_cmd);
	zfree(&tpebs_cmd.argv);
	list_for_each_entry(t, &tpebs_results, nd)
		t->started = true;

	return ret;
}
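
/*
 * For illustration only (fd numbers and event names are assumed, not taken
 * from a real run): with two retire_lat events and CPUs 0-3 requested, the
 * argv built above would spawn a child roughly equivalent to:
 *
 *   perf record -W --synth=no --control=fd:3,6 -o - -C 0-3 \
 *       -e event1/name=tpebs_event_0x55.../p \
 *       -e event2/name=tpebs_event_0x56.../p
 *
 * perf.data is written to "-" (stdout), which the parent reads back through
 * the tpebs_cmd.out pipe in __sample_reader().
 */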

static bool is_child_pid(pid_t parent, pid_t child)
{
	if (parent < 0 || child < 0)
		return false;

	while (true) {
		char path[PATH_MAX];
		char line[256];
		FILE *fp;

new_child:
		if (parent == child)
			return true;

		if (child <= 0)
			return false;

		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
		fp = fopen(path, "r");
		if (!fp) {
			/* Presumably the process went away. Assume not a child. */
			return false;
		}
		while (fgets(line, sizeof(line), fp) != NULL) {
			if (strncmp(line, "PPid:", 5) == 0) {
				fclose(fp);
				if (sscanf(line + 5, "%d", &child) != 1) {
					/* Unexpected error parsing. */
					return false;
				}
				goto new_child;
			}
		}
		/* Unexpected EOF. */
		fclose(fp);
		return false;
	}
}
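
/*
 * Illustrative sketch of the procfs walk above (pids assumed): a
 * /proc/<pid>/status file contains, among other lines,
 *
 *   Name:  perf
 *   Pid:   4242
 *   PPid:  4200
 *
 * so starting from the sample's pid, the loop repeatedly replaces child with
 * its parent until it reaches the workload pid (a descendant, return true)
 * or walks off the top of the tree at pid <= 0 (return false).
 */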

static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
{
	pid_t workload_pid, sample_pid = sample->pid;

	/*
	 * During evlist__purge the evlist will be removed prior to the
	 * evsel__exit calling evsel__tpebs_close and taking the
	 * tpebs_mtx. Avoid a segfault by ignoring samples in this case.
	 */
	if (t->evsel->evlist == NULL)
		return true;

	workload_pid = t->evsel->evlist->workload.pid;
	if (workload_pid < 0 || workload_pid == sample_pid)
		return false;

	if (!t->evsel->core.attr.inherit)
		return true;

	return !is_child_pid(workload_pid, sample_pid);
}

static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	if (tpebs_cmd.pid == 0) {
		/* Record has terminated. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		mutex_unlock(tpebs_mtx_get());
		return -EINVAL;
	}
	if (should_ignore_sample(sample, t)) {
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	/*
	 * Need to handle per core results? We are assuming average retire
	 * latency value will be used. Save the number of samples and the sum of
	 * retire latency value for each event.
	 */
	t->last = sample->weight3;
	update_stats(&t->stats, sample->weight3);
	mutex_unlock(tpebs_mtx_get());
	return 0;
}

static int process_feature_event(struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id < HEADER_LAST_FEATURE)
		return perf_event__process_feature(session, event);
	return 0;
}

static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = tpebs_cmd.out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);

	return NULL;
}
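
/*
 * A note on the reader setup above: perf_data.path is PERF_DATA ("-"), so
 * the session treats its input as a pipe, with file.fd redirected to
 * tpebs_cmd.out, the read end of the child's stdout. Every
 * PERF_RECORD_SAMPLE parsed from that stream lands in
 * process_sample_event(), where the retirement latency is taken from
 * sample->weight3.
 */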

static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send the perf record command. */
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control: write of control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * We need this poll to ensure the ack_fd PIPE will not hang
	 * when perf record failed for any reason. The 500ms timeout
	 * with up to 6 retries (roughly 3.5s in total) is an empirical
	 * selection.
	 */
again:
	if (!poll(&pollfd, 1, 500)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}

		pr_err("tpebs failed: did not receive an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}
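
/*
 * Sketch of the control handshake (the tag contents are assumed to be the
 * plain strings defined alongside EVLIST_CTL_CMD_ACK_TAG in evlist.h):
 *
 *   parent                          perf record child
 *   ------                          -----------------
 *   write("enable")  -------------> enables the events
 *                    <------------- writes "ack\n" on ack_fd
 *   write("ping")    -------------> liveness check, also acked
 *   write("stop")    -------------> drains and exits; no ack follows
 *
 * which is why tpebs_send_record_cmd() returns early for
 * EVLIST_CTL_CMD_STOP_TAG instead of polling ack_fd.
 */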

/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 */
static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	int ret = 0;

	/* Like evsel__tpebs_open(), tpebs_stop() should only run once. */
	if (tpebs_cmd.pid != 0) {
		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
		tpebs_cmd.pid = 0;
		mutex_unlock(tpebs_mtx_get());
		pthread_join(tpebs_reader_thread, NULL);
		mutex_lock(tpebs_mtx_get());
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
		close(tpebs_cmd.out);
		ret = finish_command(&tpebs_cmd);
		tpebs_cmd.pid = 0;
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}

/**
 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
 * @evsel: The evsel to encode.
 * @event: Out argument receiving the allocated event string.
 *
 * The retirement latency "R" modifier is rewritten to precise ("p") and the
 * event is renamed to "tpebs_event_<pointer>" so that samples read back from
 * `perf record` can be matched to @evsel.
 */
static int evsel__tpebs_event(struct evsel *evsel, char **event)
{
	char *name, *modifier;
	int ret;

	name = strdup(evsel->name);
	if (!name)
		return -ENOMEM;

	modifier = strrchr(name, 'R');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = 'p';
	modifier = strchr(name, ':');
	if (!modifier)
		modifier = strrchr(name, '/');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = '\0';
	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
		ret = 0;
	else
		ret = -ENOMEM;
out:
	if (ret)
		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
	free(name);
	return ret;
}
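
/*
 * Worked example (event names hypothetical): an evsel named "EVENT.NAME:R"
 * yields "EVENT.NAME/name=tpebs_event_0x55..8/p", and "cpu/event=0x1/R"
 * yields "cpu/event=0x1/name=tpebs_event_0x55..8/p", where 0x55..8 stands in
 * for the evsel pointer printed by %p.
 */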

static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
{
	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
	int ret;

	if (!result)
		return NULL;

	ret = evsel__tpebs_event(evsel, &result->event);
	if (ret) {
		free(result);
		return NULL;
	}
	result->evsel = evsel;
	return result;
}

static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels from the session's own evlist match directly on the
	 * retirement latency event. Evsels being read back from `perf record`
	 * instead carry the "tpebs_event_" prefix in their name.
	 */
	if (evsel__is_retire_lat(evsel)) {
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* Unexpected that the perf record should have other events. */
		return NULL;
	}
	errno = 0;
	/* Parse the evsel pointer after the strlen("tpebs_event_") == 12 prefix. */
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}

/**
 * evsel__tpebs_prepare - create tpebs data structures ready for opening.
 * @evsel: retire_latency evsel, all evsels on its list will be prepared.
 */
static int evsel__tpebs_prepare(struct evsel *evsel)
{
	struct evsel *pos;
	struct tpebs_retire_lat *tpebs_event;

	mutex_lock(tpebs_mtx_get());
	tpebs_event = tpebs_retire_lat__find(evsel);
	if (tpebs_event) {
		/* evsel, or an identically named one, was already prepared. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	tpebs_event = tpebs_retire_lat__new(evsel);
	if (!tpebs_event) {
		mutex_unlock(tpebs_mtx_get());
		return -ENOMEM;
	}
	list_add_tail(&tpebs_event->nd, &tpebs_results);
	mutex_unlock(tpebs_mtx_get());

	/*
	 * Eagerly prepare all other evsels on the list to try to ensure that by
	 * open they are all known.
	 */
	evlist__for_each_entry(evsel->evlist, pos) {
		int ret;

		if (pos == evsel || !pos->retire_lat)
			continue;

		ret = evsel__tpebs_prepare(pos);
		if (ret)
			return ret;
	}
	return 0;
}

/**
 * evsel__tpebs_open - starts tpebs execution.
 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
 *         evsel is sampled to get the average retire_latency value.
 */
int evsel__tpebs_open(struct evsel *evsel)
{
	int ret;
	bool tpebs_empty;

	/* Only start recording when tpebs_recording is enabled. */
	if (!tpebs_recording)
		return 0;
	/* Only start the events once. */
	if (tpebs_cmd.pid != 0) {
		struct tpebs_retire_lat *t;
		bool valid;

		mutex_lock(tpebs_mtx_get());
		t = tpebs_retire_lat__find(evsel);
		valid = t && t->started;
		mutex_unlock(tpebs_mtx_get());
		/* May fail as the event wasn't started. */
		return valid ? 0 : -EBUSY;
	}

	ret = evsel__tpebs_prepare(evsel);
	if (ret)
		return ret;

	mutex_lock(tpebs_mtx_get());
	tpebs_empty = list_empty(&tpebs_results);
	if (!tpebs_empty) {
		/* Create control and ack fds for --control. */
		if (pipe(control_fd) < 0) {
			pr_err("tpebs: Failed to create control fifo\n");
			ret = -1;
			goto out;
		}
		if (pipe(ack_fd) < 0) {
			pr_err("tpebs: Failed to create ack fifo\n");
			ret = -1;
			goto out;
		}

		ret = evsel__tpebs_start_perf_record(evsel);
		if (ret)
			goto out;

		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
				   /*arg=*/NULL)) {
			kill(tpebs_cmd.pid, SIGTERM);
			close(tpebs_cmd.out);
			pr_err("Could not create thread to process sample data.\n");
			ret = -1;
			goto out;
		}
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
	}
out:
	if (ret) {
		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);

		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);
	}
	mutex_unlock(tpebs_mtx_get());
	return ret;
}
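
/*
 * Typical lifecycle, a sketch of the call order rather than an API
 * guarantee: evsel__open() calls evsel__tpebs_open(), which prepares the
 * event list, forks `perf record` and starts __sample_reader(). Each read of
 * the evsel then goes through evsel__tpebs_read() to fold the sampled
 * retirement latency into its counts, and evsel__close() finally calls
 * evsel__tpebs_close(), which tears the child down via tpebs_stop() once the
 * last event is gone.
 */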

int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set the retire_latency value for the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use the default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as no samples were recorded\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}
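
/*
 * A note on the accumulation above (an assumption about how the stat code
 * consumes these counts, not verified here): count->val carries the prior
 * raw count plus this read's latency, so an interval delta computed as
 * val - prev_raw_counts->val recovers exactly the latency chosen for this
 * read, while run/ena advance by one to keep the counter reported as active.
 */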

/**
 * evsel__tpebs_close() - delete tpebs-related data. If it is the last event,
 * stop the created thread and process by calling tpebs_stop().
 *
 * This function is called in evsel__close() to be symmetric with
 * evsel__tpebs_open() being called in evsel__open().
 */
void evsel__tpebs_close(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	if (t) {
		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);

		if (list_empty(&tpebs_results))
			tpebs_stop();
	}
	mutex_unlock(tpebs_mtx_get());
}