/* kvm-unit-tests: x86/pmu.c (revision 699264f5ef8129c60e9db7c281e572016ad41a45) */

#include "x86/msr.h"
#include "x86/processor.h"
#include "x86/pmu.h"
#include "x86/apic-defs.h"
#include "x86/apic.h"
#include "x86/desc.h"
#include "x86/isr.h"
#include "vmalloc.h"
#include "alloc.h"

#include "libcflat.h"
#include <stdint.h>

#define N 1000000

#define IBPB_JMP_INSNS		9
#define IBPB_JMP_BRANCHES	2

#if defined(__i386__) || defined(_M_IX86) /* i386 */
#define IBPB_JMP_ASM(_wrmsr)				\
	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
	"mov $73, %%ecx;\n\t"				\
	_wrmsr "\n\t"					\
	"call 1f\n\t"					\
	"1: pop %%eax\n\t"				\
	"add $(2f-1b), %%eax\n\t"			\
	"jmp *%%eax;\n\t"				\
	"nop;\n\t"					\
	"2: nop;\n\t"
#else /* x86_64 */
#define IBPB_JMP_ASM(_wrmsr)				\
	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
	"mov $73, %%ecx;\n\t"				\
	_wrmsr "\n\t"					\
	"call 1f\n\t"					\
	"1: pop %%rax\n\t"				\
	"add $(2f-1b), %%rax\n\t"			\
	"jmp *%%rax;\n\t"				\
	"nop;\n\t"					\
	"2: nop;\n\t"
#endif

/* GLOBAL_CTRL enable + disable + clflush/mfence + IBPB_JMP */
#define EXTRA_INSNS  (3 + 3 + 2 + IBPB_JMP_INSNS)
#define LOOP_INSNS   (N * 10 + EXTRA_INSNS)
#define LOOP_BRANCHES  (N + IBPB_JMP_BRANCHES)
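
/*
 * The measured loop blob: _wrmsr1 toggles GLOBAL_CTRL around the loop (or is
 * a nop), _clflush optionally flushes the buffer line to force LLC misses,
 * and IBPB_JMP_ASM(_wrmsr2) optionally issues an IBPB so the indirect jump
 * takes a branch miss.  The "loop 1b" body is 10 instructions per iteration
 * over N iterations, which is what LOOP_INSNS and LOOP_BRANCHES account for.
 */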
#define LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)				\
	_wrmsr1 "\n\t"							\
	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t"			\
	_clflush "\n\t"							\
	"mfence;\n\t"							\
	"1: mov (%1), %2; add $64, %1;\n\t"				\
	"nop; nop; nop; nop; nop; nop; nop;\n\t"			\
	"loop 1b;\n\t"							\
	IBPB_JMP_ASM(_wrmsr2)						\
	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t"	\
	_wrmsr1 "\n\t"

#define _loop_asm(_wrmsr1, _clflush, _wrmsr2)			\
do {								\
	asm volatile(LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)	\
		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)	\
		     : "a"(eax), "d"(edx), "c"(global_ctl),	\
		       "0"(N), "1"(buf)				\
		     : "edi");					\
} while (0)

/* the number of instructions and branches of the kvm_fep_asm() blob */
#define KVM_FEP_INSNS		22
#define KVM_FEP_BRANCHES	5

/*
 * KVM_FEP is a magic prefix that forces emulation so
 * 'KVM_FEP "jne label\n"' just counts as a single instruction.
 */
#define kvm_fep_asm(_wrmsr)			\
do {						\
	asm volatile(				\
		_wrmsr "\n\t"			\
		"mov %%ecx, %%edi;\n\t"		\
		"mov $0x0, %%eax;\n\t"		\
		"cmp $0x0, %%eax;\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"1: mov %%edi, %%ecx; \n\t"	\
		"xor %%eax, %%eax; \n\t"	\
		"xor %%edx, %%edx;\n\t"		\
		_wrmsr "\n\t"			\
		:				\
		: "a"(eax), "d"(edx), "c"(ecx)	\
		: "ebx", "edi");		\
} while (0)

typedef struct {
	uint32_t ctr;
	uint32_t idx;
	uint64_t config;
	uint64_t count;
} pmu_counter_t;

struct pmu_event {
	const char *name;
	uint32_t unit_sel;
	int min;
	int max;
} intel_gp_events[] = {
	{"core cycles", 0x003c, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"ref cycles", 0x013c, 1*N, 30*N},
	{"llc references", 0x4f2e, 1, 2*N},
	{"llc misses", 0x412e, 1, 1*N},
	{"branches", 0x00c4, 1*N, 1.1*N},
	{"branch misses", 0x00c5, 1, 0.1*N},
}, amd_gp_events[] = {
	{"core cycles", 0x0076, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"branches", 0x00c2, 1*N, 1.1*N},
	{"branch misses", 0x00c3, 1, 0.1*N},
}, fixed_events[] = {
	{"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
};

/*
 * Event indices into intel_gp_events[]; keep these consistent with the
 * order of entries in intel_gp_events[].
 */
enum {
	INTEL_INSTRUCTIONS_IDX  = 1,
	INTEL_REF_CYCLES_IDX	= 2,
	INTEL_LLC_MISSES_IDX	= 4,
	INTEL_BRANCHES_IDX	= 5,
	INTEL_BRANCH_MISS_IDX	= 6,
};

/*
 * Event indices into amd_gp_events[]; keep these consistent with the
 * order of entries in amd_gp_events[].
 */
enum {
	AMD_INSTRUCTIONS_IDX    = 1,
	AMD_BRANCHES_IDX	= 2,
	AMD_BRANCH_MISS_IDX	= 3,
};

char *buf;

static struct pmu_event *gp_events;
static unsigned int gp_events_size;
static unsigned int fixed_counters_num;

static int has_ibpb(void)
{
	return this_cpu_has(X86_FEATURE_SPEC_CTRL) ||
	       this_cpu_has(X86_FEATURE_AMD_IBPB);
}

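/*
 * Run the measurement loop without touching GLOBAL_CTRL; the counters are
 * assumed to have been enabled (or left disabled) by the caller.
 */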
static inline void __loop(void)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = 0;
	u32 eax = 0;
	u32 edx = 0;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("nop", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("nop", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("nop", "nop", "wrmsr");
	else
		_loop_asm("nop", "nop", "nop");
}

/*
 * Enable and disable the counters inside a single asm blob to ensure that no
 * other instructions are counted in the window between enabling the counters
 * and actually executing the LOOP_ASM code.  This lets the instructions and
 * branches events be verified against precise counts instead of a rough
 * valid range.
 */
static inline void __precise_loop(u64 cntrs)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = pmu.msr_global_ctl;
	u32 eax = cntrs & (BIT_ULL(32) - 1);
	u32 edx = cntrs >> 32;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("wrmsr", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("wrmsr", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("wrmsr", "nop", "wrmsr");
	else
		_loop_asm("wrmsr", "nop", "nop");
}

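/*
 * Prefer the precise variant when GLOBAL_CTRL is available so that exactly
 * the requested counters (the cntrs bitmask) are enabled around the loop.
 */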
static inline void loop(u64 cntrs)
{
	if (!this_cpu_has_perf_global_ctrl())
		__loop();
	else
		__precise_loop(cntrs);
}

static void adjust_events_range(struct pmu_event *gp_events,
				int instruction_idx, int branch_idx,
				int branch_miss_idx)
{
	/*
	 * If the HW supports the GLOBAL_CTRL MSR, enabling and disabling the
	 * PMCs is done inside __precise_loop(), so the instructions and
	 * branches events can be verified against a precise count instead of
	 * a rough range.
	 *
	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
	 * instruction in guest context, which leads to intermittent failures
	 * as the counts will vary depending on how many asynchronous VM-Exits
	 * occur while running the measured code, e.g. if the host takes IRQs.
	 */
	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
		gp_events[instruction_idx].min = LOOP_INSNS;
		gp_events[instruction_idx].max = LOOP_INSNS;
		gp_events[branch_idx].min = LOOP_BRANCHES;
		gp_events[branch_idx].max = LOOP_BRANCHES;
	}

	/*
	 * For CPUs without IBPB support there is no way to force a branch
	 * miss, so the measured branch misses may be 0.  Overwrite the lower
	 * boundary of the branch misses event with 0 to avoid false
	 * positives.
	 */
	if (!has_ibpb())
		gp_events[branch_miss_idx].min = 0;
}

volatile uint64_t irq_received;

static void cnt_overflow(isr_regs_t *regs)
{
	irq_received++;
	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
	apic_write(APIC_EOI, 0);
}

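/*
 * Briefly enable interrupts and poll for a PMI; returns true if the overflow
 * interrupt handler ran.
 */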
static bool check_irq(void)
{
	int i;
	irq_received = 0;
	sti();
	for (i = 0; i < 100000 && !irq_received; i++)
		asm volatile("pause");
	cli();
	return irq_received;
}

static bool is_gp(pmu_counter_t *evt)
{
	if (!pmu.is_intel)
		return true;

	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
		evt->ctr >= MSR_IA32_PMC0;
}

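/*
 * Translate a counter MSR into its bit index in GLOBAL_CTRL/GLOBAL_STATUS.
 * AMD's core-extension counter MSRs (MSR_F15H_PERF_CTR0 et al.) interleave
 * event-select and counter registers, hence the divide by two.
 */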
static int event_to_global_idx(pmu_counter_t *cnt)
{
	if (pmu.is_intel)
		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));

	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
	else
		return cnt->ctr - pmu.msr_gp_counter_base;
}

static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
{
	if (is_gp(cnt)) {
		int i;

		for (i = 0; i < gp_events_size; i++)
			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
				return &gp_events[i];
	} else {
		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;

		if (idx < ARRAY_SIZE(fixed_events))
			return &fixed_events[idx];
	}

	return (void*)0;
}

static void global_enable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	cnt->idx = event_to_global_idx(cnt);
	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
}

static void global_disable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
}

static void __start_event(pmu_counter_t *evt, uint64_t count)
{
	evt->count = count;
	wrmsr(evt->ctr, evt->count);
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config | EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		uint32_t usrospmi = 0;

		if (evt->config & EVNTSEL_OS)
			usrospmi |= (1 << 0);
		if (evt->config & EVNTSEL_USR)
			usrospmi |= (1 << 1);
		if (evt->config & EVNTSEL_INT)
			usrospmi |= (1 << 3); // PMI on overflow
		ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
	}
	apic_write(APIC_LVTPC, PMI_VECTOR);
}

static void start_event(pmu_counter_t *evt)
{
	__start_event(evt, 0);
	global_enable(evt);
}

static void __stop_event(pmu_counter_t *evt)
{
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config & ~EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
	}
	evt->count = rdmsr(evt->ctr);
}

static void stop_event(pmu_counter_t *evt)
{
	global_disable(evt);
	__stop_event(evt);
}

static noinline void measure_many(pmu_counter_t *evt, int count)
{
	int i;
	u64 cntrs = 0;

	for (i = 0; i < count; i++) {
		__start_event(&evt[i], 0);
		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
	}
	loop(cntrs);
	for (i = 0; i < count; i++)
		__stop_event(&evt[i]);
}

static void measure_one(pmu_counter_t *evt)
{
	measure_many(evt, 1);
}

static noinline void __measure(pmu_counter_t *evt, uint64_t count)
{
	u64 cntrs = BIT_ULL(event_to_global_idx(evt));

	__start_event(evt, count);
	loop(cntrs);
	__stop_event(evt);
}

static bool verify_event(uint64_t count, struct pmu_event *e)
{
	bool pass;

	if (!e)
		return false;

	pass = count >= e->min && count <= e->max;
	if (!pass)
		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);

	return pass;
}

static bool verify_counter(pmu_counter_t *cnt)
{
	return verify_event(cnt->count, get_counter_event(cnt));
}

static void check_gp_counter(struct pmu_event *evt)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
	};
	int i;

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);
		measure_one(&cnt);
		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
	}
}

static void check_gp_counters(void)
{
	int i;

	for (i = 0; i < gp_events_size; i++)
		if (pmu_gp_counter_is_available(i))
			check_gp_counter(&gp_events[i]);
		else
			printf("GP event '%s' is disabled\n",
					gp_events[i].name);
}

static void check_fixed_counters(void)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR,
	};
	int i;

	for (i = 0; i < fixed_counters_num; i++) {
		cnt.ctr = fixed_events[i].unit_sel;
		measure_one(&cnt);
		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
	}
}

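/*
 * Program every available GP counter plus all supported fixed counters,
 * measure them all in a single run, then verify each final count.
 */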
static void check_counters_many(void)
{
	pmu_counter_t cnt[48];
	int i, n;

	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
		if (!pmu_gp_counter_is_available(i))
			continue;

		cnt[n].ctr = MSR_GP_COUNTERx(n);
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR |
			gp_events[i % gp_events_size].unit_sel;
		n++;
	}
	for (i = 0; i < fixed_counters_num; i++) {
		cnt[n].ctr = fixed_events[i].unit_sel;
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
		n++;
	}

	assert(n <= ARRAY_SIZE(cnt));
	measure_many(cnt, n);

	for (i = 0; i < n; i++)
		if (!verify_counter(&cnt[i]))
			break;

	report(i == n, "all counters");
}

static uint64_t measure_for_overflow(pmu_counter_t *cnt)
{
	__measure(cnt, 0);
	/*
	 * To generate overflow, i.e. roll over to '0', the initial count just
	 * needs to be preset to the negative expected count.  However, as per
	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
	 * the overflow interrupt is generated immediately instead of possibly
	 * waiting for the overflow to propagate through the counter.
	 */
	assert(cnt->count > 1);
	return 1 - cnt->count;
}

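/*
 * Preset each counter so that it overflows during the measured loop, then
 * verify the final count, the GLOBAL_STATUS overflow bit, and (on odd
 * iterations, which set EVNTSEL_INT) PMI delivery.  The extra iteration
 * beyond nr_gp_counters repeats the check on fixed counter 0 on Intel.
 */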
static void check_counter_overflow(void)
{
	int i;
	uint64_t overflow_preset;
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	overflow_preset = measure_for_overflow(&cnt);

	/* clear status before test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	report_prefix_push("overflow");

	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
		uint64_t status;
		int idx;

		cnt.count = overflow_preset;
		if (pmu_use_full_writes())
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;

		if (i == pmu.nr_gp_counters) {
			if (!pmu.is_intel)
				break;

			cnt.ctr = fixed_events[0].unit_sel;
			cnt.count = measure_for_overflow(&cnt);
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
		} else {
			cnt.ctr = MSR_GP_COUNTERx(i);
		}

		if (i % 2)
			cnt.config |= EVNTSEL_INT;
		else
			cnt.config &= ~EVNTSEL_INT;
		idx = event_to_global_idx(&cnt);
		__measure(&cnt, cnt.count);
		if (pmu.is_intel)
			report(cnt.count == 1, "cntr-%d", i);
		else
			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);

		if (!this_cpu_has_perf_global_status())
			continue;

		status = rdmsr(pmu.msr_global_status);
		report(status & (1ull << idx), "status-%d", i);
		wrmsr(pmu.msr_global_status_clr, status);
		status = rdmsr(pmu.msr_global_status);
		report(!(status & (1ull << idx)), "status clear-%d", i);
		report(check_irq() == (i % 2), "irq-%d", i);
	}

	report_prefix_pop();
}

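/*
 * With a non-zero counter mask (CMASK), the counter increments at most once
 * per cycle, and only in cycles where at least CMASK units of the event
 * occur, so the result must land well below the raw instruction count.
 */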
static void check_gp_counter_cmask(void)
{
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
	measure_one(&cnt);
	report(cnt.count < gp_events[instruction_idx].min, "cmask");
}

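/*
 * Read a counter with RDPMC's "fast" flag (ECX bit 31), which returns only
 * the low 32 bits of the counter; bit 30 selects the fixed-counter space.
 * The read is wrapped in a helper so test_for_exception() can catch the #GP
 * raised by CPUs that don't support fast reads.
 */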
static void do_rdpmc_fast(void *ptr)
{
	pmu_counter_t *cnt = ptr;
	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);

	if (!is_gp(cnt))
		idx |= 1 << 30;

	cnt->count = rdpmc(idx);
}


static void check_rdpmc(void)
{
	uint64_t val = 0xff0123456789ull;
	bool exc;
	int i;

	report_prefix_push("rdpmc");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		uint64_t x;
		pmu_counter_t cnt = {
			.ctr = MSR_GP_COUNTERx(i),
			.idx = i
		};

		/*
		 * Without full-width writes, only the low 32 bits are writable,
		 * and the value is sign-extended.
		 */
		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
			x = (uint64_t)(int64_t)(int32_t)val;
		else
			x = (uint64_t)(int64_t)val;

		/* Mask according to the number of supported bits */
		x &= (1ull << pmu.gp_counter_width) - 1;

		wrmsr(MSR_GP_COUNTERx(i), val);
		report(rdpmc(i) == x, "cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fast-%d", i);
		else
			report(cnt.count == (u32)val, "fast-%d", i);
	}
	for (i = 0; i < fixed_counters_num; i++) {
		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
		pmu_counter_t cnt = {
			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
			.idx = i
		};

		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fixed fast-%d", i);
		else
			report(cnt.count == (u32)x, "fixed fast-%d", i);
	}

	report_prefix_pop();
}

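/*
 * Verify that writes to a running counter take effect: zeroing the counter
 * mid-run must leave the final count below the expected minimum, and writing
 * -1 must make the counter overflow and set its GLOBAL_STATUS bit.
 */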
static void check_running_counter_wrmsr(void)
{
	uint64_t status;
	uint64_t count;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;

	pmu_counter_t evt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel,
	};

	report_prefix_push("running counter wrmsr");

	start_event(&evt);
	__loop();
	wrmsr(MSR_GP_COUNTERx(0), 0);
	stop_event(&evt);
	report(evt.count < gp_events[instruction_idx].min, "cntr");

	/* clear status before overflow test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	start_event(&evt);

	count = -1;
	if (pmu_use_full_writes())
		count &= (1ull << pmu.gp_counter_width) - 1;

	wrmsr(MSR_GP_COUNTERx(0), count);

	__loop();
	stop_event(&evt);

	if (this_cpu_has_perf_global_status()) {
		status = rdmsr(pmu.msr_global_status);
		report(status & 1, "status msr bit");
	}

	report_prefix_pop();
}

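/*
 * Count instructions and branches retired by the forced-emulation
 * kvm_fep_asm() blob.  The counters are preset to -KVM_FEP_BRANCHES and
 * -KVM_FEP_INSNS so that the blob itself drives them through zero, which
 * also exercises the overflow bits in GLOBAL_STATUS.
 */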
static void check_emulated_instr(void)
{
	u32 eax, edx, ecx;
	uint64_t status, instr_start, brnch_start;
	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
	unsigned int branch_idx = pmu.is_intel ?
				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;

	pmu_counter_t brnch_cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		/* branch instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
	};
	pmu_counter_t instr_cnt = {
		.ctr = MSR_GP_COUNTERx(1),
		/* instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
	};
	report_prefix_push("emulated instruction");

	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	__start_event(&brnch_cnt, 0);
	__start_event(&instr_cnt, 0);

	brnch_start = -KVM_FEP_BRANCHES;
	instr_start = -KVM_FEP_INSNS;
	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);

	if (this_cpu_has_perf_global_ctrl()) {
		eax = BIT(0) | BIT(1);
		ecx = pmu.msr_global_ctl;
		edx = 0;
		kvm_fep_asm("wrmsr");
	} else {
		eax = ecx = edx = 0;
		kvm_fep_asm("nop");
	}

	__stop_event(&brnch_cnt);
	__stop_event(&instr_cnt);

	// Check that the delta between the end count and the start count
	// matches the expected number of instructions and branches exactly
	// when GLOBAL_CTRL gates the measurement, and is at least that many
	// otherwise.
	if (this_cpu_has_perf_global_ctrl()) {
		report(instr_cnt.count - instr_start == KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES,
		       "branch count");
	} else {
		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES,
		       "branch count");
	}

	if (this_cpu_has_perf_global_status()) {
		// Additionally check that those counters overflowed properly.
		status = rdmsr(pmu.msr_global_status);
		report(status & BIT_ULL(0), "branch counter overflow");
		report(status & BIT_ULL(1), "instruction counter overflow");
	}

	report_prefix_pop();
}

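/*
 * Count transactional cycles: start an RTM transaction with XBEGIN and
 * immediately abort it via a #GP on a non-canonical access, so each counter
 * should observe at least some in-transaction cycles.
 */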
#define XBEGIN_STARTED (~0u)
static void check_tsx_cycles(void)
{
	pmu_counter_t cnt;
	unsigned int i, ret = 0;

	if (!this_cpu_has(X86_FEATURE_RTM))
		return;

	report_prefix_push("TSX cycles");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);

		if (i == 2) {
			/* Transactional cycles committed only on gp counter 2 */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
		} else {
			/* Transactional cycles */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
		}

		start_event(&cnt);

		asm volatile("xbegin 1f\n\t"
				"1:\n\t"
				: "+a" (ret) :: "memory");

		/* Generate a non-canonical #GP to trigger ABORT. */
		if (ret == XBEGIN_STARTED)
			*(int *)NONCANONICAL = 0;

		stop_event(&cnt);

		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
	}

	report_prefix_pop();
}

static void warm_up(void)
{
	int i;

	/*
	 * The cycles event always runs as the first event, so its first
	 * measurement happens with cold caches, which can push the measured
	 * cycles value above the pre-defined upper boundary and cause a false
	 * positive.  To avoid this, run a few warm-up iterations before the
	 * real verification.
	 */
	for (i = 0; i < 10; i++)
		loop(0);
}

static void check_counters(void)
{
	if (is_fep_available())
		check_emulated_instr();

	warm_up();
	check_gp_counters();
	check_fixed_counters();
	check_rdpmc();
	check_counters_many();
	check_counter_overflow();
	check_gp_counter_cmask();
	check_running_counter_wrmsr();
	check_tsx_cycles();
}

static void do_unsupported_width_counter_write(void *index)
{
	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
}

static void check_gp_counters_write_width(void)
{
	u64 val_64 = 0xffffff0123456789ull;
	u64 val_32 = val_64 & ((1ull << 32) - 1);
	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
	int i;

	/*
	 * MSR_IA32_PERFCTRn supports 64-bit writes,
	 * but only the lowest 32 bits are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
	}

	/*
	 * MSR_IA32_PMCn supports writing values up to GP counter width,
	 * and only the lowest bits of GP counter width are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PMC0 + i, val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);

		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);

		report(test_for_exception(GP_VECTOR,
			do_unsupported_width_counter_write, &i),
		"writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
	}
}

/*
 * Per the SDM, reference cycles are currently implemented using the
 * core crystal clock, TSC, or bus clock. Calibrate to the TSC
 * frequency to set reasonable expectations.
 */
static void set_ref_cycle_expectations(void)
{
	pmu_counter_t cnt = {
		.ctr = MSR_IA32_PERFCTR0,
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
	};
	uint64_t tsc_delta;
	uint64_t t0, t1, t2, t3;

	/* Bit 2 enumerates the availability of reference cycles events. */
	if (!pmu.nr_gp_counters || !pmu_gp_counter_is_available(2))
		return;

	if (this_cpu_has_perf_global_ctrl())
		wrmsr(pmu.msr_global_ctl, 0);

	t0 = fenced_rdtsc();
	start_event(&cnt);
	t1 = fenced_rdtsc();

	/*
	 * This loop has to run long enough to dominate the VM-exit
	 * costs for playing with the PMU MSRs on start and stop.
	 *
	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
	 * the core crystal clock, this function calculated a guest
	 * TSC : ref cycles ratio of around 105 with ECX initialized
	 * to one billion.
	 */
	asm volatile("loop ." : "+c"((int){1000000000ull}));

	t2 = fenced_rdtsc();
	stop_event(&cnt);
	t3 = fenced_rdtsc();

	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;

	if (!tsc_delta)
		return;

	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
}

static void check_invalid_rdpmc_gp(void)
{
	uint64_t val;

	report(rdpmc_safe(64, &val) == GP_VECTOR,
	       "Expected #GP on RDPMC(64)");
}

int main(int ac, char **av)
{
	int instruction_idx;
	int branch_idx;
	int branch_miss_idx;

	setup_vm();
	handle_irq(PMI_VECTOR, cnt_overflow);
	buf = malloc(N*64);

	check_invalid_rdpmc_gp();

	if (pmu.is_intel) {
		if (!pmu.version) {
			report_skip("No Intel Arch PMU is detected!");
			return report_summary();
		}
		gp_events = (struct pmu_event *)intel_gp_events;
		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
		instruction_idx = INTEL_INSTRUCTIONS_IDX;
		branch_idx = INTEL_BRANCHES_IDX;
		branch_miss_idx = INTEL_BRANCH_MISS_IDX;

		/*
		 * For legacy Intel CPUs without clflush/clflushopt support,
		 * there is no way to force an LLC miss, so set the event's
		 * minimum value to 0 to avoid false positives.
		 */
		if (!this_cpu_has(X86_FEATURE_CLFLUSH))
			gp_events[INTEL_LLC_MISSES_IDX].min = 0;

		report_prefix_push("Intel");
		set_ref_cycle_expectations();
	} else {
		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
		gp_events = (struct pmu_event *)amd_gp_events;
		instruction_idx = AMD_INSTRUCTIONS_IDX;
		branch_idx = AMD_BRANCHES_IDX;
		branch_miss_idx = AMD_BRANCH_MISS_IDX;
		report_prefix_push("AMD");
	}
	adjust_events_range(gp_events, instruction_idx, branch_idx, branch_miss_idx);

	printf("PMU version:         %d\n", pmu.version);
	printf("GP counters:         %d\n", pmu.nr_gp_counters);
	printf("GP counter width:    %d\n", pmu.gp_counter_width);
	printf("Mask length:         %d\n", pmu.gp_counter_mask_length);
	printf("Fixed counters:      %d\n", pmu.nr_fixed_counters);
	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);

	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
		report_info("Fixed counters number %d > defined fixed events %u.  "
			    "Please update test case.", pmu.nr_fixed_counters,
			    (unsigned)ARRAY_SIZE(fixed_events));

	apic_write(APIC_LVTPC, PMI_VECTOR);

	check_counters();

	if (pmu_has_full_writes()) {
		pmu.msr_gp_counter_base = MSR_IA32_PMC0;

		report_prefix_push("full-width writes");
		check_counters();
		check_gp_counters_write_width();
		report_prefix_pop();
	}

	if (!pmu.is_intel) {
		report_prefix_push("K7");
		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
		check_counters();
		report_prefix_pop();
	}

	return report_summary();
}