1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Arm Statistical Profiling Extensions (SPE) support
4  * Copyright (c) 2017-2018, Arm Ltd.
5  */
6 
7 #include <byteswap.h>
8 #include <endian.h>
9 #include <errno.h>
10 #include <inttypes.h>
11 #include <linux/bitops.h>
12 #include <linux/kernel.h>
13 #include <linux/log2.h>
14 #include <linux/types.h>
15 #include <linux/zalloc.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18 
19 #include "auxtrace.h"
20 #include "color.h"
21 #include "debug.h"
22 #include "evlist.h"
23 #include "evsel.h"
24 #include "machine.h"
25 #include "session.h"
26 #include "symbol.h"
27 #include "thread.h"
28 #include "thread-stack.h"
29 #include "tsc.h"
30 #include "tool.h"
31 #include "util/synthetic-events.h"
32 
33 #include "arm-spe.h"
34 #include "arm-spe-decoder/arm-spe-decoder.h"
35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
36 
37 #include "../../arch/arm64/include/asm/cputype.h"
38 #define MAX_TIMESTAMP (~0ULL)
39 
40 #define is_ldst_op(op)		(!!((op) & ARM_SPE_OP_LDST))
41 
42 struct arm_spe {
43 	struct auxtrace			auxtrace;
44 	struct auxtrace_queues		queues;
45 	struct auxtrace_heap		heap;
46 	struct itrace_synth_opts        synth_opts;
47 	u32				auxtrace_type;
48 	struct perf_session		*session;
49 	struct machine			*machine;
50 	u32				pmu_type;
51 
52 	struct perf_tsc_conversion	tc;
53 
54 	u8				timeless_decoding;
55 	u8				data_queued;
56 
57 	u64				sample_type;
58 	u8				sample_flc;
59 	u8				sample_llc;
60 	u8				sample_tlb;
61 	u8				sample_branch;
62 	u8				sample_remote_access;
63 	u8				sample_memory;
64 	u8				sample_instructions;
65 	u64				instructions_sample_period;
66 
67 	u64				l1d_miss_id;
68 	u64				l1d_access_id;
69 	u64				llc_miss_id;
70 	u64				llc_access_id;
71 	u64				tlb_miss_id;
72 	u64				tlb_access_id;
73 	u64				branch_id;
74 	u64				remote_access_id;
75 	u64				memory_id;
76 	u64				instructions_id;
77 
78 	u64				kernel_start;
79 
80 	unsigned long			num_events;
81 	u8				use_ctx_pkt_for_pid;
82 
83 	u64				**metadata;
84 	u64				metadata_ver;
85 	u64				metadata_nr_cpu;
86 	bool				is_homogeneous;
87 };
88 
89 struct arm_spe_queue {
90 	struct arm_spe			*spe;
91 	unsigned int			queue_nr;
92 	struct auxtrace_buffer		*buffer;
93 	struct auxtrace_buffer		*old_buffer;
94 	union perf_event		*event_buf;
95 	bool				on_heap;
96 	bool				done;
97 	pid_t				pid;
98 	pid_t				tid;
99 	int				cpu;
100 	struct arm_spe_decoder		*decoder;
101 	u64				time;
102 	u64				timestamp;
103 	struct thread			*thread;
104 	u64				period_instructions;
105 	u32				flags;
106 	struct branch_stack		*last_branch;
107 };
108 
109 struct data_source_handle {
110 	const struct midr_range *midr_ranges;
111 	void (*ds_synth)(const struct arm_spe_record *record,
112 			 union perf_mem_data_src *data_src);
113 };
114 
115 #define DS(range, func)					\
116 	{						\
117 		.midr_ranges = range,			\
118 		.ds_synth = arm_spe__synth_##func,	\
119 	}
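
/*
 * A DS() entry binds a list of MIDR ranges to the data source synthesis
 * helper for those cores. For example, in the data_source_handles table
 * later in this file, DS(common_ds_encoding_cpus, data_source_common)
 * expands to:
 *
 *	{
 *		.midr_ranges = common_ds_encoding_cpus,
 *		.ds_synth = arm_spe__synth_data_source_common,
 *	}
 *
 * arm_spe__synth_ds() walks that table and calls ->ds_synth() for the first
 * entry whose MIDR range list matches the recording CPU.
 */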
120 
121 static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
122 			 unsigned char *buf, size_t len)
123 {
124 	struct arm_spe_pkt packet;
125 	size_t pos = 0;
126 	int ret, pkt_len, i;
127 	char desc[ARM_SPE_PKT_DESC_MAX];
128 	const char *color = PERF_COLOR_BLUE;
129 
130 	color_fprintf(stdout, color,
131 		      ". ... ARM SPE data: size %#zx bytes\n",
132 		      len);
133 
134 	while (len) {
135 		ret = arm_spe_get_packet(buf, len, &packet);
136 		if (ret > 0)
137 			pkt_len = ret;
138 		else
139 			pkt_len = 1;
140 		printf(".");
141 		color_fprintf(stdout, color, "  %08zx: ", pos);
142 		for (i = 0; i < pkt_len; i++)
143 			color_fprintf(stdout, color, " %02x", buf[i]);
144 		for (; i < 16; i++)
145 			color_fprintf(stdout, color, "   ");
146 		if (ret > 0) {
147 			ret = arm_spe_pkt_desc(&packet, desc,
148 					       ARM_SPE_PKT_DESC_MAX);
149 			if (!ret)
150 				color_fprintf(stdout, color, " %s\n", desc);
151 		} else {
152 			color_fprintf(stdout, color, " Bad packet!\n");
153 		}
154 		pos += pkt_len;
155 		buf += pkt_len;
156 		len -= pkt_len;
157 	}
158 }
159 
160 static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
161 			       size_t len)
162 {
163 	printf(".\n");
164 	arm_spe_dump(spe, buf, len);
165 }
166 
167 static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
168 {
169 	struct arm_spe_queue *speq = data;
170 	struct auxtrace_buffer *buffer = speq->buffer;
171 	struct auxtrace_buffer *old_buffer = speq->old_buffer;
172 	struct auxtrace_queue *queue;
173 
174 	queue = &speq->spe->queues.queue_array[speq->queue_nr];
175 
176 	buffer = auxtrace_buffer__next(queue, buffer);
177 	/* If no more data, drop the previous auxtrace_buffer and return */
178 	if (!buffer) {
179 		if (old_buffer)
180 			auxtrace_buffer__drop_data(old_buffer);
181 		b->len = 0;
182 		return 0;
183 	}
184 
185 	speq->buffer = buffer;
186 
187 	/* If the aux_buffer doesn't have data associated, try to load it */
188 	if (!buffer->data) {
189 		/* get the file desc associated with the perf data file */
190 		int fd = perf_data__fd(speq->spe->session->data);
191 
192 		buffer->data = auxtrace_buffer__get_data(buffer, fd);
193 		if (!buffer->data)
194 			return -ENOMEM;
195 	}
196 
197 	b->len = buffer->size;
198 	b->buf = buffer->data;
199 
200 	if (b->len) {
201 		if (old_buffer)
202 			auxtrace_buffer__drop_data(old_buffer);
203 		speq->old_buffer = buffer;
204 	} else {
205 		auxtrace_buffer__drop_data(buffer);
206 		return arm_spe_get_trace(b, data);
207 	}
208 
209 	return 0;
210 }
211 
212 static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
213 		unsigned int queue_nr)
214 {
215 	struct arm_spe_params params = { .get_trace = 0, };
216 	struct arm_spe_queue *speq;
217 
218 	speq = zalloc(sizeof(*speq));
219 	if (!speq)
220 		return NULL;
221 
222 	speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
223 	if (!speq->event_buf)
224 		goto out_free;
225 
226 	speq->spe = spe;
227 	speq->queue_nr = queue_nr;
228 	speq->pid = -1;
229 	speq->tid = -1;
230 	speq->cpu = -1;
231 	speq->period_instructions = 0;
232 
233 	/* params set */
234 	params.get_trace = arm_spe_get_trace;
235 	params.data = speq;
236 
237 	if (spe->synth_opts.last_branch) {
238 		size_t sz = sizeof(struct branch_stack);
239 
240 		/* Allocate up to two entries for PBT + TGT */
241 		sz += sizeof(struct branch_entry) *
242 			min(spe->synth_opts.last_branch_sz, 2U);
243 		speq->last_branch = zalloc(sz);
244 		if (!speq->last_branch)
245 			goto out_free;
246 	}
247 
248 	/* create new decoder */
249 	speq->decoder = arm_spe_decoder_new(&params);
250 	if (!speq->decoder)
251 		goto out_free;
252 
253 	return speq;
254 
255 out_free:
256 	zfree(&speq->event_buf);
257 	zfree(&speq->last_branch);
258 	free(speq);
259 
260 	return NULL;
261 }
262 
263 static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
264 {
265 	return ip >= spe->kernel_start ?
266 		PERF_RECORD_MISC_KERNEL :
267 		PERF_RECORD_MISC_USER;
268 }
269 
270 static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
271 				    struct auxtrace_queue *queue)
272 {
273 	struct arm_spe_queue *speq = queue->priv;
274 	pid_t tid;
275 
276 	tid = machine__get_current_tid(spe->machine, speq->cpu);
277 	if (tid != -1) {
278 		speq->tid = tid;
279 		thread__zput(speq->thread);
280 	} else
281 		speq->tid = queue->tid;
282 
283 	if ((!speq->thread) && (speq->tid != -1)) {
284 		speq->thread = machine__find_thread(spe->machine, -1,
285 						    speq->tid);
286 	}
287 
288 	if (speq->thread) {
289 		speq->pid = thread__pid(speq->thread);
290 		if (queue->cpu == -1)
291 			speq->cpu = thread__cpu(speq->thread);
292 	}
293 }
294 
295 static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
296 {
297 	struct arm_spe *spe = speq->spe;
298 	int err = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
299 
300 	if (err)
301 		return err;
302 
303 	arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
304 
305 	return 0;
306 }
307 
308 static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu)
309 {
310 	u64 i;
311 
312 	if (!spe->metadata)
313 		return NULL;
314 
315 	for (i = 0; i < spe->metadata_nr_cpu; i++)
316 		if (spe->metadata[i][ARM_SPE_CPU] == cpu)
317 			return spe->metadata[i];
318 
319 	return NULL;
320 }
321 
322 static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
323 {
324 	struct simd_flags simd_flags = {};
325 
326 	if ((record->op & ARM_SPE_OP_LDST) && (record->op & ARM_SPE_OP_SVE_LDST))
327 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
328 
329 	if ((record->op & ARM_SPE_OP_OTHER) && (record->op & ARM_SPE_OP_SVE_OTHER))
330 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
331 
332 	if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
333 		simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;
334 
335 	if (record->type & ARM_SPE_SVE_EMPTY_PRED)
336 		simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;
337 
338 	return simd_flags;
339 }
340 
341 static void arm_spe_prep_sample(struct arm_spe *spe,
342 				struct arm_spe_queue *speq,
343 				union perf_event *event,
344 				struct perf_sample *sample)
345 {
346 	struct arm_spe_record *record = &speq->decoder->record;
347 
348 	if (!spe->timeless_decoding)
349 		sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);
350 
351 	sample->ip = record->from_ip;
352 	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
353 	sample->pid = speq->pid;
354 	sample->tid = speq->tid;
355 	sample->period = 1;
356 	sample->cpu = speq->cpu;
357 	sample->simd_flags = arm_spe__synth_simd_flags(record);
358 
359 	event->sample.header.type = PERF_RECORD_SAMPLE;
360 	event->sample.header.misc = sample->cpumode;
361 	event->sample.header.size = sizeof(struct perf_event_header);
362 }
363 
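/*
 * Fill the synthetic branch stack for the current record (a descriptive
 * summary of the helper below): at most two entries are produced, one for
 * the sampled branch itself (from_ip -> to_ip, only when the record is a
 * branch) and one carrying the previous branch target (PBT) when the
 * decoder provides it and the configured last_branch_sz allows it.
 */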
364 static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
365 {
366 	struct arm_spe *spe = speq->spe;
367 	struct arm_spe_record *record = &speq->decoder->record;
368 	struct branch_stack *bstack = speq->last_branch;
369 	struct branch_flags *bs_flags;
370 	unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
371 	bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
372 	bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
373 	size_t sz = sizeof(struct branch_stack) +
374 		    sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
375 	int i = 0;
376 
377 	/* Clean up branch stack */
378 	memset(bstack, 0x0, sz);
379 
380 	if (!have_tgt && !have_pbt)
381 		return;
382 
383 	if (have_tgt) {
384 		bstack->entries[i].from = record->from_ip;
385 		bstack->entries[i].to = record->to_ip;
386 
387 		bs_flags = &bstack->entries[i].flags;
388 		bs_flags->value = 0;
389 
390 		if (record->op & ARM_SPE_OP_BR_CR_BL) {
391 			if (record->op & ARM_SPE_OP_BR_COND)
392 				bs_flags->type |= PERF_BR_COND_CALL;
393 			else
394 				bs_flags->type |= PERF_BR_CALL;
395 		/*
396 		 * Indirect branch instruction without link (e.g. BR),
397 		 * take this case as a function return.
398 		 */
399 		} else if (record->op & ARM_SPE_OP_BR_CR_RET ||
400 			   record->op & ARM_SPE_OP_BR_INDIRECT) {
401 			if (record->op & ARM_SPE_OP_BR_COND)
402 				bs_flags->type |= PERF_BR_COND_RET;
403 			else
404 				bs_flags->type |= PERF_BR_RET;
405 		} else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
406 			if (record->op & ARM_SPE_OP_BR_COND)
407 				bs_flags->type |= PERF_BR_COND;
408 			else
409 				bs_flags->type |= PERF_BR_UNCOND;
410 		} else {
411 			if (record->op & ARM_SPE_OP_BR_COND)
412 				bs_flags->type |= PERF_BR_COND;
413 			else
414 				bs_flags->type |= PERF_BR_UNKNOWN;
415 		}
416 
417 		if (record->type & ARM_SPE_BRANCH_MISS) {
418 			bs_flags->mispred = 1;
419 			bs_flags->predicted = 0;
420 		} else {
421 			bs_flags->mispred = 0;
422 			bs_flags->predicted = 1;
423 		}
424 
425 		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
426 			bs_flags->not_taken = 1;
427 
428 		if (record->type & ARM_SPE_IN_TXN)
429 			bs_flags->in_tx = 1;
430 
431 		bs_flags->cycles = min(record->latency, 0xFFFFU);
432 		i++;
433 	}
434 
435 	if (have_pbt) {
436 		bs_flags = &bstack->entries[i].flags;
437 		bs_flags->type |= PERF_BR_UNKNOWN;
438 		bstack->entries[i].to = record->prev_br_tgt;
439 		i++;
440 	}
441 
442 	bstack->nr = i;
443 	bstack->hw_idx = -1ULL;
444 }
445 
446 static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
447 {
448 	event->header.size = perf_event__sample_event_size(sample, type, 0);
449 	return perf_event__synthesize_sample(event, type, 0, sample);
450 }
451 
452 static inline int
453 arm_spe_deliver_synth_event(struct arm_spe *spe,
454 			    struct arm_spe_queue *speq __maybe_unused,
455 			    union perf_event *event,
456 			    struct perf_sample *sample)
457 {
458 	int ret;
459 
460 	if (spe->synth_opts.inject) {
461 		ret = arm_spe__inject_event(event, sample, spe->sample_type);
462 		if (ret)
463 			return ret;
464 	}
465 
466 	ret = perf_session__deliver_synth_event(spe->session, event, sample);
467 	if (ret)
468 		pr_err("ARM SPE: failed to deliver event, error %d\n", ret);
469 
470 	return ret;
471 }
472 
473 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
474 				     u64 spe_events_id, u64 data_src)
475 {
476 	struct arm_spe *spe = speq->spe;
477 	struct arm_spe_record *record = &speq->decoder->record;
478 	union perf_event *event = speq->event_buf;
479 	struct perf_sample sample;
480 	int ret;
481 
482 	perf_sample__init(&sample, /*all=*/true);
483 	arm_spe_prep_sample(spe, speq, event, &sample);
484 
485 	sample.id = spe_events_id;
486 	sample.stream_id = spe_events_id;
487 	sample.addr = record->virt_addr;
488 	sample.phys_addr = record->phys_addr;
489 	sample.data_src = data_src;
490 	sample.weight = record->latency;
491 
492 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
493 	perf_sample__exit(&sample);
494 	return ret;
495 }
496 
497 static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
498 					u64 spe_events_id)
499 {
500 	struct arm_spe *spe = speq->spe;
501 	struct arm_spe_record *record = &speq->decoder->record;
502 	union perf_event *event = speq->event_buf;
503 	struct perf_sample sample;
504 	int ret;
505 
506 	perf_sample__init(&sample, /*all=*/true);
507 	arm_spe_prep_sample(spe, speq, event, &sample);
508 
509 	sample.id = spe_events_id;
510 	sample.stream_id = spe_events_id;
511 	sample.addr = record->to_ip;
512 	sample.weight = record->latency;
513 	sample.flags = speq->flags;
514 	sample.branch_stack = speq->last_branch;
515 
516 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
517 	perf_sample__exit(&sample);
518 	return ret;
519 }
520 
521 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
522 					     u64 spe_events_id, u64 data_src)
523 {
524 	struct arm_spe *spe = speq->spe;
525 	struct arm_spe_record *record = &speq->decoder->record;
526 	union perf_event *event = speq->event_buf;
527 	struct perf_sample sample;
528 	int ret;
529 
530 	/*
531 	 * Handle the perf instruction sampling period by down-sampling: emit
532 	 * only one synthesized sample per 'instructions_sample_period' records.
532 	 */
533 	speq->period_instructions++;
534 	if (speq->period_instructions < spe->instructions_sample_period)
535 		return 0;
536 	speq->period_instructions = 0;
537 
538 	perf_sample__init(&sample, /*all=*/true);
539 	arm_spe_prep_sample(spe, speq, event, &sample);
540 
541 	sample.id = spe_events_id;
542 	sample.stream_id = spe_events_id;
543 	sample.addr = record->to_ip;
544 	sample.phys_addr = record->phys_addr;
545 	sample.data_src = data_src;
546 	sample.period = spe->instructions_sample_period;
547 	sample.weight = record->latency;
548 	sample.flags = speq->flags;
549 	sample.branch_stack = speq->last_branch;
550 
551 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
552 	perf_sample__exit(&sample);
553 	return ret;
554 }
555 
556 static const struct midr_range common_ds_encoding_cpus[] = {
557 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
558 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
559 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
560 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
561 	MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
562 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
563 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
564 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
565 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
566 	{},
567 };
568 
569 static const struct midr_range ampereone_ds_encoding_cpus[] = {
570 	MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
571 	{},
572 };
573 
574 static void arm_spe__sample_flags(struct arm_spe_queue *speq)
575 {
576 	const struct arm_spe_record *record = &speq->decoder->record;
577 
578 	speq->flags = 0;
579 	if (record->op & ARM_SPE_OP_BRANCH_ERET) {
580 		speq->flags = PERF_IP_FLAG_BRANCH;
581 
582 		if (record->type & ARM_SPE_BRANCH_MISS)
583 			speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
584 
585 		if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
586 			speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
587 
588 		if (record->type & ARM_SPE_IN_TXN)
589 			speq->flags |= PERF_IP_FLAG_IN_TX;
590 
591 		if (record->op & ARM_SPE_OP_BR_COND)
592 			speq->flags |= PERF_IP_FLAG_CONDITIONAL;
593 
594 		if (record->op & ARM_SPE_OP_BR_CR_BL)
595 			speq->flags |= PERF_IP_FLAG_CALL;
596 		else if (record->op & ARM_SPE_OP_BR_CR_RET)
597 			speq->flags |= PERF_IP_FLAG_RETURN;
598 		/*
599 		 * Indirect branch instruction without link (e.g. BR),
600 		 * take it as a function return.
601 		 */
602 		else if (record->op & ARM_SPE_OP_BR_INDIRECT)
603 			speq->flags |= PERF_IP_FLAG_RETURN;
604 	}
605 }
606 
607 static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
608 					      union perf_mem_data_src *data_src)
609 {
610 	/*
611 	 * Even though four levels of cache hierarchy are possible, no known
612 	 * production Neoverse systems currently include more than three levels
613 	 * so for the time being we assume three exist. If a production system
614 	 * is built with four then this function would have to be changed to
615 	 * detect the number of levels for reporting.
616 	 */
617 
618 	/*
619 	 * We have no data on the hit level or data source for stores in the
620 	 * Neoverse SPE records.
621 	 */
622 	if (record->op & ARM_SPE_OP_ST) {
623 		data_src->mem_lvl = PERF_MEM_LVL_NA;
624 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
625 		data_src->mem_snoop = PERF_MEM_SNOOP_NA;
626 		return;
627 	}
628 
629 	switch (record->source) {
630 	case ARM_SPE_COMMON_DS_L1D:
631 		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
632 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
633 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
634 		break;
635 	case ARM_SPE_COMMON_DS_L2:
636 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
637 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
638 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
639 		break;
640 	case ARM_SPE_COMMON_DS_PEER_CORE:
641 		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
642 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
643 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
644 		break;
645 	/*
646 	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
647 	 * transfer, so set SNOOPX_PEER
648 	 */
649 	case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
650 	case ARM_SPE_COMMON_DS_PEER_CLUSTER:
651 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
652 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
653 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
654 		break;
655 	/*
656 	 * System cache is assumed to be L3
657 	 */
658 	case ARM_SPE_COMMON_DS_SYS_CACHE:
659 		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
660 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
661 		data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
662 		break;
663 	/*
664 	 * We don't know what level it hit in, except it came from the other
665 	 * socket
666 	 */
667 	case ARM_SPE_COMMON_DS_REMOTE:
668 		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
669 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
670 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
671 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
672 		break;
673 	case ARM_SPE_COMMON_DS_DRAM:
674 		data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
675 		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
676 		data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
677 		break;
678 	default:
679 		break;
680 	}
681 }
682 
683 /*
684  * Source is IMPDEF. Here we convert the data source encoding used on AmpereOne
685  * cores to the common (Neoverse, Cortex) encoding to avoid duplicating the decoding code.
686  */
687 static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
688 						 union perf_mem_data_src *data_src)
689 {
690 	struct arm_spe_record common_record;
691 
692 	switch (record->source) {
693 	case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
694 		common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
695 		break;
696 	case ARM_SPE_AMPEREONE_SLC:
697 		common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
698 		break;
699 	case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
700 		common_record.source = ARM_SPE_COMMON_DS_REMOTE;
701 		break;
702 	case ARM_SPE_AMPEREONE_DDR:
703 		common_record.source = ARM_SPE_COMMON_DS_DRAM;
704 		break;
705 	case ARM_SPE_AMPEREONE_L1D:
706 		common_record.source = ARM_SPE_COMMON_DS_L1D;
707 		break;
708 	case ARM_SPE_AMPEREONE_L2D:
709 		common_record.source = ARM_SPE_COMMON_DS_L2;
710 		break;
711 	default:
712 		pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
713 				record->source);
714 		return;
715 	}
716 
717 	common_record.op = record->op;
718 	arm_spe__synth_data_source_common(&common_record, data_src);
719 }
720 
721 static const struct data_source_handle data_source_handles[] = {
722 	DS(common_ds_encoding_cpus, data_source_common),
723 	DS(ampereone_ds_encoding_cpus, data_source_ampereone),
724 };
725 
726 static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
727 					union perf_mem_data_src *data_src)
728 {
729 	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
730 		data_src->mem_lvl = PERF_MEM_LVL_L3;
731 
732 		if (record->type & ARM_SPE_LLC_MISS)
733 			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
734 		else
735 			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
736 	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
737 		data_src->mem_lvl = PERF_MEM_LVL_L1;
738 
739 		if (record->type & ARM_SPE_L1D_MISS)
740 			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
741 		else
742 			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
743 	}
744 
745 	if (record->type & ARM_SPE_REMOTE_ACCESS)
746 		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
747 }
748 
749 static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
750 			      const struct arm_spe_record *record,
751 			      union perf_mem_data_src *data_src)
752 {
753 	struct arm_spe *spe = speq->spe;
754 	u64 *metadata = NULL;
755 	u64 midr;
756 	unsigned int i;
757 
758 	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
759 	if (spe->metadata_ver == 1) {
760 		const char *cpuid;
761 
762 		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
763 		cpuid = perf_env__cpuid(spe->session->evlist->env);
764 		midr = strtol(cpuid, NULL, 16);
765 	} else {
766 		/* CPU ID is -1 for per-thread mode */
767 		if (speq->cpu < 0) {
768 			/*
769 			 * On a heterogeneous system, the CPU ID is -1, so we
770 			 * cannot confirm that the data source packet is supported.
771 			 */
772 			if (!spe->is_homogeneous)
773 				return false;
774 
775 			/* In a homogeneous system, simply use CPU0's metadata */
776 			if (spe->metadata)
777 				metadata = spe->metadata[0];
778 		} else {
779 			metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
780 		}
781 
782 		if (!metadata)
783 			return false;
784 
785 		midr = metadata[ARM_SPE_CPU_MIDR];
786 	}
787 
788 	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
789 		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
790 			data_source_handles[i].ds_synth(record, data_src);
791 			return true;
792 		}
793 	}
794 
795 	return false;
796 }
797 
798 static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
799 				      const struct arm_spe_record *record)
800 {
801 	union perf_mem_data_src	data_src = { .mem_op = PERF_MEM_OP_NA };
802 
803 	/* Only synthesize data source for LDST operations */
804 	if (!is_ldst_op(record->op))
805 		return 0;
806 
807 	if (record->op & ARM_SPE_OP_LD)
808 		data_src.mem_op = PERF_MEM_OP_LOAD;
809 	else if (record->op & ARM_SPE_OP_ST)
810 		data_src.mem_op = PERF_MEM_OP_STORE;
811 	else
812 		return 0;
813 
814 	if (!arm_spe__synth_ds(speq, record, &data_src))
815 		arm_spe__synth_memory_level(record, &data_src);
816 
817 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
818 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
819 
820 		if (record->type & ARM_SPE_TLB_MISS)
821 			data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
822 		else
823 			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
824 	}
825 
826 	return data_src.val;
827 }
828 
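/*
 * Synthesize all samples requested for the current record (a descriptive
 * summary of the function below): per-level cache and TLB hit/miss events,
 * an optional branch sample, remote access and memory samples, and a
 * rate-limited instruction sample. The branch stack is prepared once here
 * and shared by the branch and instruction samples.
 */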
829 static int arm_spe_sample(struct arm_spe_queue *speq)
830 {
831 	const struct arm_spe_record *record = &speq->decoder->record;
832 	struct arm_spe *spe = speq->spe;
833 	u64 data_src;
834 	int err;
835 
836 	arm_spe__sample_flags(speq);
837 	data_src = arm_spe__synth_data_source(speq, record);
838 
839 	if (spe->sample_flc) {
840 		if (record->type & ARM_SPE_L1D_MISS) {
841 			err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
842 							data_src);
843 			if (err)
844 				return err;
845 		}
846 
847 		if (record->type & ARM_SPE_L1D_ACCESS) {
848 			err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
849 							data_src);
850 			if (err)
851 				return err;
852 		}
853 	}
854 
855 	if (spe->sample_llc) {
856 		if (record->type & ARM_SPE_LLC_MISS) {
857 			err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
858 							data_src);
859 			if (err)
860 				return err;
861 		}
862 
863 		if (record->type & ARM_SPE_LLC_ACCESS) {
864 			err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
865 							data_src);
866 			if (err)
867 				return err;
868 		}
869 	}
870 
871 	if (spe->sample_tlb) {
872 		if (record->type & ARM_SPE_TLB_MISS) {
873 			err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
874 							data_src);
875 			if (err)
876 				return err;
877 		}
878 
879 		if (record->type & ARM_SPE_TLB_ACCESS) {
880 			err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
881 							data_src);
882 			if (err)
883 				return err;
884 		}
885 	}
886 
887 	if (spe->synth_opts.last_branch &&
888 	    (spe->sample_branch || spe->sample_instructions))
889 		arm_spe__prep_branch_stack(speq);
890 
891 	if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
892 		err = arm_spe__synth_branch_sample(speq, spe->branch_id);
893 		if (err)
894 			return err;
895 	}
896 
897 	if (spe->sample_remote_access &&
898 	    (record->type & ARM_SPE_REMOTE_ACCESS)) {
899 		err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
900 						data_src);
901 		if (err)
902 			return err;
903 	}
904 
905 	/*
906 	 * Only synthesize a memory sample when the record is a memory (load
907 	 * or store) operation; skip it otherwise.
908 	 */
909 	if (spe->sample_memory && is_ldst_op(record->op)) {
910 		err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
911 		if (err)
912 			return err;
913 	}
914 
915 	if (spe->sample_instructions) {
916 		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
917 		if (err)
918 			return err;
919 	}
920 
921 	return 0;
922 }
923 
924 static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
925 {
926 	struct arm_spe *spe = speq->spe;
927 	struct arm_spe_record *record;
928 	int ret;
929 
930 	if (!spe->kernel_start)
931 		spe->kernel_start = machine__kernel_start(spe->machine);
932 
933 	while (1) {
934 		/*
935 		 * The usual logic is to first decode the packets and then
936 		 * synthesize a sample based on the record; but here the flow is
937 		 * reversed: it calls arm_spe_sample() to synthesize samples
938 		 * prior to arm_spe_decode().
939 		 *
940 		 * Two reasons for this code logic:
941 		 * 1. When the queue is set up in arm_spe__setup_queue(), the
942 		 * trace data has already been decoded and a record generated,
943 		 * but no sample has been synthesized for it until the flow
944 		 * reaches here, so it's correct to synthesize a sample for that
945 		 * leftover record first.
946 		 * 2. After decoding trace data, the record timestamp needs to be
947 		 * compared with that of the coming perf event. If the record
948 		 * timestamp is later, bail out and push the record onto the
949 		 * auxtrace heap, so synthesizing its sample is deferred until
950 		 * this point is reached again; this correlates samples between
951 		 * Arm SPE trace data and other perf events with correct time ordering.
952 		 */
953 
954 		/*
955 		 * Update pid/tid info.
956 		 */
957 		record = &speq->decoder->record;
958 		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
959 			ret = arm_spe_set_tid(speq, record->context_id);
960 			if (ret)
961 				return ret;
962 
963 			spe->use_ctx_pkt_for_pid = true;
964 		}
965 
966 		ret = arm_spe_sample(speq);
967 		if (ret)
968 			return ret;
969 
970 		ret = arm_spe_decode(speq->decoder);
971 		if (!ret) {
972 			pr_debug("No data or all data has been processed.\n");
973 			return 1;
974 		}
975 
976 		/*
977 		 * An error was detected while decoding the SPE trace data;
978 		 * continue with the next trace data to find more records.
979 		 */
980 		if (ret < 0)
981 			continue;
982 
983 		record = &speq->decoder->record;
984 
985 		/* Update timestamp for the last record */
986 		if (record->timestamp > speq->timestamp)
987 			speq->timestamp = record->timestamp;
988 
989 		/*
990 		 * If the timestamp of the queue is later than the timestamp of
991 		 * the coming perf event, bail out so that the perf event can be
992 		 * processed first.
993 		 */
994 		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
995 			*timestamp = speq->timestamp;
996 			return 0;
997 		}
998 	}
999 
1000 	return 0;
1001 }
1002 
1003 static int arm_spe__setup_queue(struct arm_spe *spe,
1004 			       struct auxtrace_queue *queue,
1005 			       unsigned int queue_nr)
1006 {
1007 	struct arm_spe_queue *speq = queue->priv;
1008 	struct arm_spe_record *record;
1009 
1010 	if (list_empty(&queue->head) || speq)
1011 		return 0;
1012 
1013 	speq = arm_spe__alloc_queue(spe, queue_nr);
1014 
1015 	if (!speq)
1016 		return -ENOMEM;
1017 
1018 	queue->priv = speq;
1019 
1020 	if (queue->cpu != -1)
1021 		speq->cpu = queue->cpu;
1022 
1023 	if (!speq->on_heap) {
1024 		int ret;
1025 
1026 		if (spe->timeless_decoding)
1027 			return 0;
1028 
1029 retry:
1030 		ret = arm_spe_decode(speq->decoder);
1031 
1032 		if (!ret)
1033 			return 0;
1034 
1035 		if (ret < 0)
1036 			goto retry;
1037 
1038 		record = &speq->decoder->record;
1039 
1040 		speq->timestamp = record->timestamp;
1041 		ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
1042 		if (ret)
1043 			return ret;
1044 		speq->on_heap = true;
1045 	}
1046 
1047 	return 0;
1048 }
1049 
1050 static int arm_spe__setup_queues(struct arm_spe *spe)
1051 {
1052 	unsigned int i;
1053 	int ret;
1054 
1055 	for (i = 0; i < spe->queues.nr_queues; i++) {
1056 		ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
1057 		if (ret)
1058 			return ret;
1059 	}
1060 
1061 	return 0;
1062 }
1063 
1064 static int arm_spe__update_queues(struct arm_spe *spe)
1065 {
1066 	if (spe->queues.new_data) {
1067 		spe->queues.new_data = false;
1068 		return arm_spe__setup_queues(spe);
1069 	}
1070 
1071 	return 0;
1072 }
1073 
1074 static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
1075 {
1076 	struct evsel *evsel;
1077 	struct evlist *evlist = spe->session->evlist;
1078 	bool timeless_decoding = true;
1079 
1080 	/*
1081 	 * Cycle through the list of events and clear timeless decoding if we
1082 	 * find one with the time bit set.
1083 	 */
1084 	evlist__for_each_entry(evlist, evsel) {
1085 		if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
1086 			timeless_decoding = false;
1087 	}
1088 
1089 	return timeless_decoding;
1090 }
1091 
1092 static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
1093 {
1094 	unsigned int queue_nr;
1095 	u64 ts;
1096 	int ret;
1097 
1098 	while (1) {
1099 		struct auxtrace_queue *queue;
1100 		struct arm_spe_queue *speq;
1101 
1102 		if (!spe->heap.heap_cnt)
1103 			return 0;
1104 
1105 		if (spe->heap.heap_array[0].ordinal >= timestamp)
1106 			return 0;
1107 
1108 		queue_nr = spe->heap.heap_array[0].queue_nr;
1109 		queue = &spe->queues.queue_array[queue_nr];
1110 		speq = queue->priv;
1111 
1112 		auxtrace_heap__pop(&spe->heap);
1113 
1114 		if (spe->heap.heap_cnt) {
1115 			ts = spe->heap.heap_array[0].ordinal + 1;
1116 			if (ts > timestamp)
1117 				ts = timestamp;
1118 		} else {
1119 			ts = timestamp;
1120 		}
1121 
1122 		/*
1123 		 * A previous context-switch event has set pid/tid in the machine's context, so
1124 		 * here we need to update the pid/tid in the thread and SPE queue.
1125 		 */
1126 		if (!spe->use_ctx_pkt_for_pid)
1127 			arm_spe_set_pid_tid_cpu(spe, queue);
1128 
1129 		ret = arm_spe_run_decoder(speq, &ts);
1130 		if (ret < 0) {
1131 			auxtrace_heap__add(&spe->heap, queue_nr, ts);
1132 			return ret;
1133 		}
1134 
1135 		if (!ret) {
1136 			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
1137 			if (ret < 0)
1138 				return ret;
1139 		} else {
1140 			speq->on_heap = false;
1141 		}
1142 	}
1143 
1144 	return 0;
1145 }
1146 
1147 static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
1148 					    u64 time_)
1149 {
1150 	struct auxtrace_queues *queues = &spe->queues;
1151 	unsigned int i;
1152 	u64 ts = 0;
1153 
1154 	for (i = 0; i < queues->nr_queues; i++) {
1155 		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
1156 		struct arm_spe_queue *speq = queue->priv;
1157 
1158 		if (speq && (tid == -1 || speq->tid == tid)) {
1159 			speq->time = time_;
1160 			arm_spe_set_pid_tid_cpu(spe, queue);
1161 			arm_spe_run_decoder(speq, &ts);
1162 		}
1163 	}
1164 	return 0;
1165 }
1166 
1167 static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
1168 				  struct perf_sample *sample)
1169 {
1170 	pid_t pid, tid;
1171 	int cpu;
1172 
1173 	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
1174 		return 0;
1175 
1176 	pid = event->context_switch.next_prev_pid;
1177 	tid = event->context_switch.next_prev_tid;
1178 	cpu = sample->cpu;
1179 
1180 	if (tid == -1)
1181 		pr_warning("context_switch event has no tid\n");
1182 
1183 	return machine__set_current_tid(spe->machine, cpu, pid, tid);
1184 }
1185 
1186 static int arm_spe_process_event(struct perf_session *session,
1187 				 union perf_event *event,
1188 				 struct perf_sample *sample,
1189 				 const struct perf_tool *tool)
1190 {
1191 	int err = 0;
1192 	u64 timestamp;
1193 	struct arm_spe *spe = container_of(session->auxtrace,
1194 			struct arm_spe, auxtrace);
1195 
1196 	if (dump_trace)
1197 		return 0;
1198 
1199 	if (!tool->ordered_events) {
1200 		pr_err("SPE trace requires ordered events\n");
1201 		return -EINVAL;
1202 	}
1203 
1204 	if (sample->time && (sample->time != (u64) -1))
1205 		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
1206 	else
1207 		timestamp = 0;
1208 
1209 	if (timestamp || spe->timeless_decoding) {
1210 		err = arm_spe__update_queues(spe);
1211 		if (err)
1212 			return err;
1213 	}
1214 
1215 	if (spe->timeless_decoding) {
1216 		if (event->header.type == PERF_RECORD_EXIT) {
1217 			err = arm_spe_process_timeless_queues(spe,
1218 					event->fork.tid,
1219 					sample->time);
1220 		}
1221 	} else if (timestamp) {
1222 		err = arm_spe_process_queues(spe, timestamp);
1223 		if (err)
1224 			return err;
1225 
1226 		if (!spe->use_ctx_pkt_for_pid &&
1227 		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
1228 		    event->header.type == PERF_RECORD_SWITCH))
1229 			err = arm_spe_context_switch(spe, event, sample);
1230 	}
1231 
1232 	return err;
1233 }
1234 
1235 static int arm_spe_process_auxtrace_event(struct perf_session *session,
1236 					  union perf_event *event,
1237 					  const struct perf_tool *tool __maybe_unused)
1238 {
1239 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1240 					     auxtrace);
1241 
1242 	if (!spe->data_queued) {
1243 		struct auxtrace_buffer *buffer;
1244 		off_t data_offset;
1245 		int fd = perf_data__fd(session->data);
1246 		int err;
1247 
1248 		if (perf_data__is_pipe(session->data)) {
1249 			data_offset = 0;
1250 		} else {
1251 			data_offset = lseek(fd, 0, SEEK_CUR);
1252 			if (data_offset == -1)
1253 				return -errno;
1254 		}
1255 
1256 		err = auxtrace_queues__add_event(&spe->queues, session, event,
1257 				data_offset, &buffer);
1258 		if (err)
1259 			return err;
1260 
1261 		/* Dump here now that we have copied a piped trace out of the pipe */
1262 		if (dump_trace) {
1263 			if (auxtrace_buffer__get_data(buffer, fd)) {
1264 				arm_spe_dump_event(spe, buffer->data,
1265 						buffer->size);
1266 				auxtrace_buffer__put_data(buffer);
1267 			}
1268 		}
1269 	}
1270 
1271 	return 0;
1272 }
1273 
1274 static int arm_spe_flush(struct perf_session *session __maybe_unused,
1275 			 const struct perf_tool *tool __maybe_unused)
1276 {
1277 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1278 			auxtrace);
1279 	int ret;
1280 
1281 	if (dump_trace)
1282 		return 0;
1283 
1284 	if (!tool->ordered_events)
1285 		return -EINVAL;
1286 
1287 	ret = arm_spe__update_queues(spe);
1288 	if (ret < 0)
1289 		return ret;
1290 
1291 	if (spe->timeless_decoding)
1292 		return arm_spe_process_timeless_queues(spe, -1,
1293 				MAX_TIMESTAMP - 1);
1294 
1295 	ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
1296 	if (ret)
1297 		return ret;
1298 
1299 	if (!spe->use_ctx_pkt_for_pid)
1300 		ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
1301 			    "Matching of TIDs to SPE events could be inaccurate.\n");
1302 
1303 	return 0;
1304 }
1305 
1306 static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
1307 {
1308 	u64 *metadata;
1309 
1310 	metadata = zalloc(per_cpu_size);
1311 	if (!metadata)
1312 		return NULL;
1313 
1314 	memcpy(metadata, buf, per_cpu_size);
1315 	return metadata;
1316 }
1317 
1318 static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
1319 {
1320 	int i;
1321 
1322 	for (i = 0; i < nr_cpu; i++)
1323 		zfree(&metadata[i]);
1324 	free(metadata);
1325 }
1326 
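/*
 * Layout of the auxtrace info private area for metadata version 2 and later,
 * as parsed below (a sketch inferred from this parsing code, not a normative
 * description of the format):
 *
 *	header:  [ARM_SPE_HEADER_VERSION][ARM_SPE_HEADER_SIZE]
 *	         [ARM_SPE_PMU_TYPE_V2][ARM_SPE_CPUS_NUM]...
 *	per CPU: [ARM_SPE_MAGIC][ARM_SPE_CPU][ARM_SPE_CPU_NR_PARAMS]
 *	         followed by ARM_SPE_CPU_NR_PARAMS further values
 *	         (MIDR, PMU type, min interval, ...)
 *
 * The header size and the per-CPU block count are read from the header
 * itself, which lets newer producers append fields without breaking this
 * parser.
 */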
1327 static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
1328 				     u64 *ver, int *nr_cpu)
1329 {
1330 	u64 *ptr = (u64 *)info->priv;
1331 	u64 metadata_size;
1332 	u64 **metadata = NULL;
1333 	int hdr_sz, per_cpu_sz, i;
1334 
1335 	metadata_size = info->header.size -
1336 		sizeof(struct perf_record_auxtrace_info);
1337 
1338 	/* Metadata version 1 */
1339 	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
1340 		*ver = 1;
1341 		*nr_cpu = 0;
1342 		/* No per CPU metadata */
1343 		return NULL;
1344 	}
1345 
1346 	*ver = ptr[ARM_SPE_HEADER_VERSION];
1347 	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
1348 	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];
1349 
1350 	metadata = calloc(*nr_cpu, sizeof(*metadata));
1351 	if (!metadata)
1352 		return NULL;
1353 
1354 	/* Locate the start address of per CPU metadata */
1355 	ptr += hdr_sz;
1356 	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);
1357 
1358 	for (i = 0; i < *nr_cpu; i++) {
1359 		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
1360 		if (!metadata[i])
1361 			goto err_per_cpu_metadata;
1362 
1363 		ptr += per_cpu_sz / sizeof(u64);
1364 	}
1365 
1366 	return metadata;
1367 
1368 err_per_cpu_metadata:
1369 	arm_spe__free_metadata(metadata, *nr_cpu);
1370 	return NULL;
1371 }
1372 
1373 static void arm_spe_free_queue(void *priv)
1374 {
1375 	struct arm_spe_queue *speq = priv;
1376 
1377 	if (!speq)
1378 		return;
1379 	thread__zput(speq->thread);
1380 	arm_spe_decoder_free(speq->decoder);
1381 	zfree(&speq->event_buf);
1382 	zfree(&speq->last_branch);
1383 	free(speq);
1384 }
1385 
1386 static void arm_spe_free_events(struct perf_session *session)
1387 {
1388 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1389 					     auxtrace);
1390 	struct auxtrace_queues *queues = &spe->queues;
1391 	unsigned int i;
1392 
1393 	for (i = 0; i < queues->nr_queues; i++) {
1394 		arm_spe_free_queue(queues->queue_array[i].priv);
1395 		queues->queue_array[i].priv = NULL;
1396 	}
1397 	auxtrace_queues__free(queues);
1398 }
1399 
1400 static void arm_spe_free(struct perf_session *session)
1401 {
1402 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
1403 					     auxtrace);
1404 
1405 	auxtrace_heap__free(&spe->heap);
1406 	arm_spe_free_events(session);
1407 	session->auxtrace = NULL;
1408 	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
1409 	free(spe);
1410 }
1411 
1412 static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
1413 				      struct evsel *evsel)
1414 {
1415 	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
1416 
1417 	return evsel->core.attr.type == spe->pmu_type;
1418 }
1419 
1420 static const char * const metadata_hdr_v1_fmts[] = {
1421 	[ARM_SPE_PMU_TYPE]		= "  PMU Type           :%"PRId64"\n",
1422 	[ARM_SPE_PER_CPU_MMAPS]		= "  Per CPU mmaps      :%"PRId64"\n",
1423 };
1424 
1425 static const char * const metadata_hdr_fmts[] = {
1426 	[ARM_SPE_HEADER_VERSION]	= "  Header version     :%"PRId64"\n",
1427 	[ARM_SPE_HEADER_SIZE]		= "  Header size        :%"PRId64"\n",
1428 	[ARM_SPE_PMU_TYPE_V2]		= "  PMU type v2        :%"PRId64"\n",
1429 	[ARM_SPE_CPUS_NUM]		= "  CPU number         :%"PRId64"\n",
1430 };
1431 
1432 static const char * const metadata_per_cpu_fmts[] = {
1433 	[ARM_SPE_MAGIC]			= "    Magic            :0x%"PRIx64"\n",
1434 	[ARM_SPE_CPU]			= "    CPU #            :%"PRId64"\n",
1435 	[ARM_SPE_CPU_NR_PARAMS]		= "    Num of params    :%"PRId64"\n",
1436 	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
1437 	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
1438 	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
1439 };
1440 
1441 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
1442 {
1443 	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
1444 	const char * const *hdr_fmts;
1445 
1446 	if (!dump_trace)
1447 		return;
1448 
1449 	if (spe->metadata_ver == 1) {
1450 		cpu_num = 0;
1451 		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
1452 		hdr_fmts = metadata_hdr_v1_fmts;
1453 	} else {
1454 		cpu_num = arr[ARM_SPE_CPUS_NUM];
1455 		hdr_size = arr[ARM_SPE_HEADER_SIZE];
1456 		hdr_fmts = metadata_hdr_fmts;
1457 	}
1458 
1459 	for (i = 0; i < hdr_size; i++)
1460 		fprintf(stdout, hdr_fmts[i], arr[i]);
1461 
1462 	arr += hdr_size;
1463 	for (cpu = 0; cpu < cpu_num; cpu++) {
1464 		/*
1465 		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
1466 		 * are fixed. The number of subsequent parameters is given by
1467 		 * the field 'ARM_SPE_CPU_NR_PARAMS'.
1468 		 */
1469 		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
1470 		for (i = 0; i < cpu_size; i++)
1471 			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
1472 		arr += cpu_size;
1473 	}
1474 }
1475 
1476 static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
1477 				    const char *name)
1478 {
1479 	struct evsel *evsel;
1480 
1481 	evlist__for_each_entry(evlist, evsel) {
1482 		if (evsel->core.id && evsel->core.id[0] == id) {
1483 			if (evsel->name)
1484 				zfree(&evsel->name);
1485 			evsel->name = strdup(name);
1486 			break;
1487 		}
1488 	}
1489 }
1490 
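/*
 * For every class of sample enabled in the itrace options (first and last
 * level cache, TLB, branch, remote access, memory, instructions) the
 * function below synthesizes a dedicated perf event attribute and remembers
 * its sample id. The ids are allocated at a fixed offset (1000000000) above
 * the SPE evsel's first id, presumably so they do not clash with existing
 * ids in the session.
 */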
1491 static int
1492 arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
1493 {
1494 	struct evlist *evlist = session->evlist;
1495 	struct evsel *evsel;
1496 	struct perf_event_attr attr;
1497 	bool found = false;
1498 	u64 id;
1499 	int err;
1500 
1501 	evlist__for_each_entry(evlist, evsel) {
1502 		if (evsel->core.attr.type == spe->pmu_type) {
1503 			found = true;
1504 			break;
1505 		}
1506 	}
1507 
1508 	if (!found) {
1509 		pr_debug("No selected events with SPE trace data\n");
1510 		return 0;
1511 	}
1512 
1513 	memset(&attr, 0, sizeof(struct perf_event_attr));
1514 	attr.size = sizeof(struct perf_event_attr);
1515 	attr.type = PERF_TYPE_HARDWARE;
1516 	attr.sample_type = evsel->core.attr.sample_type &
1517 				(PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
1518 	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
1519 			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
1520 			    PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
1521 	if (spe->timeless_decoding)
1522 		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
1523 	else
1524 		attr.sample_type |= PERF_SAMPLE_TIME;
1525 
1526 	spe->sample_type = attr.sample_type;
1527 
1528 	attr.exclude_user = evsel->core.attr.exclude_user;
1529 	attr.exclude_kernel = evsel->core.attr.exclude_kernel;
1530 	attr.exclude_hv = evsel->core.attr.exclude_hv;
1531 	attr.exclude_host = evsel->core.attr.exclude_host;
1532 	attr.exclude_guest = evsel->core.attr.exclude_guest;
1533 	attr.sample_id_all = evsel->core.attr.sample_id_all;
1534 	attr.read_format = evsel->core.attr.read_format;
1535 
1536 	/* create new id val to be a fixed offset from evsel id */
1537 	id = evsel->core.id[0] + 1000000000;
1538 
1539 	if (!id)
1540 		id = 1;
1541 
1542 	if (spe->synth_opts.flc) {
1543 		spe->sample_flc = true;
1544 
1545 		/* Level 1 data cache miss */
1546 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1547 		if (err)
1548 			return err;
1549 		spe->l1d_miss_id = id;
1550 		arm_spe_set_event_name(evlist, id, "l1d-miss");
1551 		id += 1;
1552 
1553 		/* Level 1 data cache access */
1554 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1555 		if (err)
1556 			return err;
1557 		spe->l1d_access_id = id;
1558 		arm_spe_set_event_name(evlist, id, "l1d-access");
1559 		id += 1;
1560 	}
1561 
1562 	if (spe->synth_opts.llc) {
1563 		spe->sample_llc = true;
1564 
1565 		/* Last level cache miss */
1566 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1567 		if (err)
1568 			return err;
1569 		spe->llc_miss_id = id;
1570 		arm_spe_set_event_name(evlist, id, "llc-miss");
1571 		id += 1;
1572 
1573 		/* Last level cache access */
1574 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1575 		if (err)
1576 			return err;
1577 		spe->llc_access_id = id;
1578 		arm_spe_set_event_name(evlist, id, "llc-access");
1579 		id += 1;
1580 	}
1581 
1582 	if (spe->synth_opts.tlb) {
1583 		spe->sample_tlb = true;
1584 
1585 		/* TLB miss */
1586 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1587 		if (err)
1588 			return err;
1589 		spe->tlb_miss_id = id;
1590 		arm_spe_set_event_name(evlist, id, "tlb-miss");
1591 		id += 1;
1592 
1593 		/* TLB access */
1594 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1595 		if (err)
1596 			return err;
1597 		spe->tlb_access_id = id;
1598 		arm_spe_set_event_name(evlist, id, "tlb-access");
1599 		id += 1;
1600 	}
1601 
1602 	if (spe->synth_opts.last_branch) {
1603 		if (spe->synth_opts.last_branch_sz > 2)
1604 			pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
1605 
1606 		attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
1607 		/*
1608 		 * We don't use the hardware index, but the sample generation
1609 		 * code uses the new format branch_stack with this field,
1610 		 * so the event attributes must indicate that it's present.
1611 		 */
1612 		attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
1613 	}
1614 
1615 	if (spe->synth_opts.branches) {
1616 		spe->sample_branch = true;
1617 
1618 		/* Branch */
1619 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1620 		if (err)
1621 			return err;
1622 		spe->branch_id = id;
1623 		arm_spe_set_event_name(evlist, id, "branch");
1624 		id += 1;
1625 	}
1626 
1627 	if (spe->synth_opts.remote_access) {
1628 		spe->sample_remote_access = true;
1629 
1630 		/* Remote access */
1631 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1632 		if (err)
1633 			return err;
1634 		spe->remote_access_id = id;
1635 		arm_spe_set_event_name(evlist, id, "remote-access");
1636 		id += 1;
1637 	}
1638 
1639 	if (spe->synth_opts.mem) {
1640 		spe->sample_memory = true;
1641 
1642 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1643 		if (err)
1644 			return err;
1645 		spe->memory_id = id;
1646 		arm_spe_set_event_name(evlist, id, "memory");
1647 		id += 1;
1648 	}
1649 
1650 	if (spe->synth_opts.instructions) {
1651 		if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
1652 			pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n");
1653 			goto synth_instructions_out;
1654 		}
1655 		if (spe->synth_opts.period > 1)
1656 			pr_warning("Arm SPE has a hardware-based sample period.\n"
1657 				   "Additional instruction events will be discarded by --itrace\n");
1658 
1659 		spe->sample_instructions = true;
1660 		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
1661 		attr.sample_period = spe->synth_opts.period;
1662 		spe->instructions_sample_period = attr.sample_period;
1663 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
1664 		if (err)
1665 			return err;
1666 		spe->instructions_id = id;
1667 		arm_spe_set_event_name(evlist, id, "instructions");
1668 	}
1669 synth_instructions_out:
1670 
1671 	return 0;
1672 }
1673 
1674 static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
1675 {
1676 	u64 midr;
1677 	int i;
1678 
1679 	if (!nr_cpu)
1680 		return false;
1681 
1682 	for (i = 0; i < nr_cpu; i++) {
1683 		if (!metadata[i])
1684 			return false;
1685 
1686 		if (i == 0) {
1687 			midr = metadata[i][ARM_SPE_CPU_MIDR];
1688 			continue;
1689 		}
1690 
1691 		if (midr != metadata[i][ARM_SPE_CPU_MIDR])
1692 			return false;
1693 	}
1694 
1695 	return true;
1696 }
1697 
1698 int arm_spe_process_auxtrace_info(union perf_event *event,
1699 				  struct perf_session *session)
1700 {
1701 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1702 	size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
1703 	struct perf_record_time_conv *tc = &session->time_conv;
1704 	struct arm_spe *spe;
1705 	u64 **metadata = NULL;
1706 	u64 metadata_ver;
1707 	int nr_cpu, err;
1708 
1709 	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
1710 					min_sz)
1711 		return -EINVAL;
1712 
1713 	metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
1714 					   &nr_cpu);
1715 	if (!metadata && metadata_ver != 1) {
1716 		pr_err("Failed to parse Arm SPE metadata.\n");
1717 		return -EINVAL;
1718 	}
1719 
1720 	spe = zalloc(sizeof(struct arm_spe));
1721 	if (!spe) {
1722 		err = -ENOMEM;
1723 		goto err_free_metadata;
1724 	}
1725 
1726 	err = auxtrace_queues__init(&spe->queues);
1727 	if (err)
1728 		goto err_free;
1729 
1730 	spe->session = session;
1731 	spe->machine = &session->machines.host; /* No kvm support */
1732 	spe->auxtrace_type = auxtrace_info->type;
1733 	if (metadata_ver == 1)
1734 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
1735 	else
1736 		spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
1737 	spe->metadata = metadata;
1738 	spe->metadata_ver = metadata_ver;
1739 	spe->metadata_nr_cpu = nr_cpu;
1740 	spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);
1741 
1742 	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
1743 
1744 	/*
1745 	 * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead
1746 	 * and the parameters for hardware clock are stored in the session
1747 	 * context.  Passes these parameters to the struct perf_tsc_conversion
1748 	 * in "spe->tc", which is used for later conversion between clock
1749 	 * counter and timestamp.
1750 	 *
1751 	 * For backward compatibility, copies the fields starting from
1752 	 * "time_cycles" only if they are contained in the event.
1753 	 */
1754 	spe->tc.time_shift = tc->time_shift;
1755 	spe->tc.time_mult = tc->time_mult;
1756 	spe->tc.time_zero = tc->time_zero;
1757 
1758 	if (event_contains(*tc, time_cycles)) {
1759 		spe->tc.time_cycles = tc->time_cycles;
1760 		spe->tc.time_mask = tc->time_mask;
1761 		spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
1762 		spe->tc.cap_user_time_short = tc->cap_user_time_short;
1763 	}
1764 
1765 	spe->auxtrace.process_event = arm_spe_process_event;
1766 	spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
1767 	spe->auxtrace.flush_events = arm_spe_flush;
1768 	spe->auxtrace.free_events = arm_spe_free_events;
1769 	spe->auxtrace.free = arm_spe_free;
1770 	spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
1771 	session->auxtrace = &spe->auxtrace;
1772 
1773 	arm_spe_print_info(spe, &auxtrace_info->priv[0]);
1774 
1775 	if (dump_trace)
1776 		return 0;
1777 
1778 	if (session->itrace_synth_opts && session->itrace_synth_opts->set)
1779 		spe->synth_opts = *session->itrace_synth_opts;
1780 	else
1781 		itrace_synth_opts__set_default(&spe->synth_opts, false);
1782 
1783 	err = arm_spe_synth_events(spe, session);
1784 	if (err)
1785 		goto err_free_queues;
1786 
1787 	err = auxtrace_queues__process_index(&spe->queues, session);
1788 	if (err)
1789 		goto err_free_queues;
1790 
1791 	if (spe->queues.populated)
1792 		spe->data_queued = true;
1793 
1794 	return 0;
1795 
1796 err_free_queues:
1797 	auxtrace_queues__free(&spe->queues);
1798 	session->auxtrace = NULL;
1799 err_free:
1800 	free(spe);
1801 err_free_metadata:
1802 	arm_spe__free_metadata(metadata, nr_cpu);
1803 	return err;
1804 }
1805