
#include "x86/msr.h"
#include "x86/processor.h"
#include "x86/pmu.h"
#include "x86/apic-defs.h"
#include "x86/apic.h"
#include "x86/desc.h"
#include "x86/isr.h"
#include "vmalloc.h"
#include "alloc.h"

#include "libcflat.h"
#include <stdint.h>

#define N 1000000

#define IBPB_JMP_INSNS		9
#define IBPB_JMP_BRANCHES	2

#if defined(__i386__) || defined(_M_IX86) /* i386 */
#define IBPB_JMP_ASM(_wrmsr)				\
	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
	"mov $73, %%ecx;\n\t"				\
	_wrmsr "\n\t"					\
	"call 1f\n\t"					\
	"1: pop %%eax\n\t"				\
	"add $(2f-1b), %%eax\n\t"			\
	"jmp *%%eax;\n\t"				\
	"nop;\n\t"					\
	"2: nop;\n\t"
#else /* x86_64 */
#define IBPB_JMP_ASM(_wrmsr)				\
	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
	"mov $73, %%ecx;\n\t"				\
	_wrmsr "\n\t"					\
	"call 1f\n\t"					\
	"1: pop %%rax\n\t"				\
	"add $(2f-1b), %%rax\n\t"			\
	"jmp *%%rax;\n\t"				\
	"nop;\n\t"					\
	"2: nop;\n\t"
#endif
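
/*
 * When _wrmsr is "wrmsr", the IBPB_JMP blob writes MSR 73 (IA32_PRED_CMD)
 * with EAX=1 (the IBPB command) to flush the branch predictors, then does a
 * CALL and an indirect JMP over a NOP.  The blob retires IBPB_JMP_INSNS
 * instructions and IBPB_JMP_BRANCHES branch instructions, and the IBPB
 * should force the indirect JMP to be mispredicted.
 */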

/* GLOBAL_CTRL enable + disable + clflush/mfence + IBPB_JMP */
#define EXTRA_INSNS  (3 + 3 + 2 + IBPB_JMP_INSNS)
#define LOOP_INSNS   (N * 10 + EXTRA_INSNS)
#define LOOP_BRANCHES  (N + IBPB_JMP_BRANCHES)
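
/*
 * Each LOOP_ASM iteration executes 10 instructions (the mov + add pair,
 * seven NOPs, and the LOOP itself) and retires one branch (the LOOP),
 * hence N * 10 instructions and N branches for the inner loop, plus
 * EXTRA_INSNS and IBPB_JMP_BRANCHES from the surrounding code.
 */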
#define LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)				\
	_wrmsr1 "\n\t"							\
	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t"			\
	_clflush "\n\t"							\
	"mfence;\n\t"							\
	"1: mov (%1), %2; add $64, %1;\n\t"				\
	"nop; nop; nop; nop; nop; nop; nop;\n\t"			\
	"loop 1b;\n\t"							\
	IBPB_JMP_ASM(_wrmsr2)						\
	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t"	\
	_wrmsr1 "\n\t"

#define _loop_asm(_wrmsr1, _clflush, _wrmsr2)			\
do {								\
	asm volatile(LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)	\
		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)	\
		     : "a"(eax), "d"(edx), "c"(global_ctl),	\
		       "0"(N), "1"(buf)				\
		     : "edi");					\
} while (0)

/* the number of instructions and branches of the kvm_fep_asm() blob */
#define KVM_FEP_INSNS		22
#define KVM_FEP_BRANCHES	5

/*
 * KVM_FEP is a magic prefix that forces emulation so
 * 'KVM_FEP "jne label\n"' just counts as a single instruction.
 */
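/*
 * Instruction accounting: with EAX cleared, the five emulated JNEs fall
 * through but still retire as branch instructions, hence the 5 branches.
 * The enabling WRMSR is counted while the trailing, disabling WRMSR is not,
 * mirroring the accounting used for EXTRA_INSNS above.
 */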
#define kvm_fep_asm(_wrmsr)			\
do {						\
	asm volatile(				\
		_wrmsr "\n\t"			\
		"mov %%ecx, %%edi;\n\t"		\
		"mov $0x0, %%eax;\n\t"		\
		"cmp $0x0, %%eax;\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		KVM_FEP "jne 1f\n\t"		\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"mov $0xa, %%eax; cpuid;\n\t"	\
		"1: mov %%edi, %%ecx; \n\t"	\
		"xor %%eax, %%eax; \n\t"	\
		"xor %%edx, %%edx;\n\t"		\
		_wrmsr "\n\t"			\
		:				\
		: "a"(eax), "d"(edx), "c"(ecx)	\
		: "ebx", "edi");		\
} while (0)

typedef struct {
	uint32_t ctr;
	uint32_t idx;
	uint64_t config;
	uint64_t count;
} pmu_counter_t;

struct pmu_event {
	const char *name;
	uint32_t unit_sel;
	int min;
	int max;
} intel_gp_events[] = {
	{"core cycles", 0x003c, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"ref cycles", 0x013c, 1*N, 30*N},
	{"llc references", 0x4f2e, 1, 2*N},
	{"llc misses", 0x412e, 1, 1*N},
	{"branches", 0x00c4, 1*N, 1.1*N},
	{"branch misses", 0x00c5, 1, 0.1*N},
}, amd_gp_events[] = {
	{"core cycles", 0x0076, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"branches", 0x00c2, 1*N, 1.1*N},
	{"branch misses", 0x00c3, 1, 0.1*N},
}, fixed_events[] = {
	{"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
};

/*
 * Indices into intel_gp_events[]; must be kept consistent with the
 * order of the entries in intel_gp_events[].
 */
enum {
	INTEL_INSTRUCTIONS_IDX  = 1,
	INTEL_REF_CYCLES_IDX	= 2,
	INTEL_LLC_MISSES_IDX	= 4,
	INTEL_BRANCHES_IDX	= 5,
	INTEL_BRANCH_MISS_IDX	= 6,
};

/*
 * Indices into amd_gp_events[]; must be kept consistent with the order
 * of the entries in amd_gp_events[].
 */
enum {
	AMD_INSTRUCTIONS_IDX    = 1,
	AMD_BRANCHES_IDX	= 2,
	AMD_BRANCH_MISS_IDX	= 3,
};

char *buf;

static struct pmu_event *gp_events;
static unsigned int gp_events_size;
static unsigned int fixed_counters_num;

static int has_ibpb(void)
{
	return this_cpu_has(X86_FEATURE_SPEC_CTRL) ||
	       this_cpu_has(X86_FEATURE_AMD_IBPB);
}

static inline void __loop(void)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = 0;
	u32 eax = 0;
	u32 edx = 0;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("nop", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("nop", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("nop", "nop", "wrmsr");
	else
		_loop_asm("nop", "nop", "nop");
}

/*
 * Enable and disable the counters in a single asm blob so that no other
 * instructions are counted in the window between enabling the counters
 * and actually executing the LOOP_ASM code.  This lets the instructions
 * and branches events be verified against precise counts instead of a
 * rough valid range.
 */
static inline void __precise_loop(u64 cntrs)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = pmu.msr_global_ctl;
	u32 eax = cntrs & (BIT_ULL(32) - 1);
	u32 edx = cntrs >> 32;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("wrmsr", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("wrmsr", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("wrmsr", "nop", "wrmsr");
	else
		_loop_asm("wrmsr", "nop", "nop");
}

static inline void loop(u64 cntrs)
{
	if (!this_cpu_has_perf_global_ctrl())
		__loop();
	else
		__precise_loop(cntrs);
}

static void adjust_events_range(struct pmu_event *gp_events,
				int instruction_idx, int branch_idx,
				int branch_miss_idx)
{
	/*
	 * If HW supports the GLOBAL_CTRL MSR, enabling and disabling the PMCs
	 * is moved into __precise_loop().  Thus, the instructions and branches
	 * events can be verified against a precise count instead of a rough
	 * range.
	 *
	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
	 * instruction in guest context, which leads to intermittent failures
	 * as the counts will vary depending on how many asynchronous VM-Exits
	 * occur while running the measured code, e.g. if the host takes IRQs.
	 */
	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
		gp_events[instruction_idx].min = LOOP_INSNS;
		gp_events[instruction_idx].max = LOOP_INSNS;
		gp_events[branch_idx].min = LOOP_BRANCHES;
		gp_events[branch_idx].max = LOOP_BRANCHES;
	}

	/*
	 * For CPUs without IBPB support, there is no way to force a branch
	 * miss, so the measured branch-miss count may be 0.  Overwrite the
	 * lower boundary of the branch misses event with 0 to avoid false
	 * positives.
	 */
	if (!has_ibpb())
		gp_events[branch_miss_idx].min = 0;
}

volatile uint64_t irq_received;

static void cnt_overflow(isr_regs_t *regs)
{
	irq_received++;
	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
	apic_write(APIC_EOI, 0);
}

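/*
 * Spin with IRQs enabled for up to ~100k PAUSE iterations to give a
 * pending PMI a chance to be delivered; returns whether one arrived.
 */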
static bool check_irq(void)
{
	int i;
	irq_received = 0;
	sti();
	for (i = 0; i < 100000 && !irq_received; i++)
		asm volatile("pause");
	cli();
	return irq_received;
}

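/*
 * On Intel, fixed counters live at MSR_CORE_PERF_FIXED_CTR0 and above,
 * while the legacy GP counters (MSR_IA32_PERFCTR0) sit below them and the
 * full-width aliases (MSR_IA32_PMC0) sit above them.  AMD has no fixed
 * counters, so every counter is a GP counter.
 */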
static bool is_gp(pmu_counter_t *evt)
{
	if (!pmu.is_intel)
		return true;

	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
		evt->ctr >= MSR_IA32_PMC0;
}

static int event_to_global_idx(pmu_counter_t *cnt)
{
	if (pmu.is_intel)
		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));

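	/*
	 * The AMD F15H MSR space interleaves event-select and counter MSRs
	 * (PERF_CTL0, PERF_CTR0, PERF_CTL1, ...), so counters are spaced two
	 * MSRs apart; the legacy K7 counter MSRs are contiguous.
	 */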
	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
	else
		return cnt->ctr - pmu.msr_gp_counter_base;
}

static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
{
	if (is_gp(cnt)) {
		int i;

		for (i = 0; i < gp_events_size; i++)
			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
				return &gp_events[i];
	} else {
		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;

		if (idx < ARRAY_SIZE(fixed_events))
			return &fixed_events[idx];
	}

	return (void*)0;
}

static void global_enable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	cnt->idx = event_to_global_idx(cnt);
	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
}

static void global_disable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
}

static void __start_event(pmu_counter_t *evt, uint64_t count)
{
	evt->count = count;
	wrmsr(evt->ctr, evt->count);
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config | EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		uint32_t usrospmi = 0;

		if (evt->config & EVNTSEL_OS)
			usrospmi |= (1 << 0);
		if (evt->config & EVNTSEL_USR)
			usrospmi |= (1 << 1);
		if (evt->config & EVNTSEL_INT)
			usrospmi |= (1 << 3); // PMI on overflow
		ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
	}
	apic_write(APIC_LVTPC, PMI_VECTOR);
}

static void start_event(pmu_counter_t *evt)
{
	__start_event(evt, 0);
	global_enable(evt);
}

static void __stop_event(pmu_counter_t *evt)
{
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config & ~EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
	}
	evt->count = rdmsr(evt->ctr);
}

static void stop_event(pmu_counter_t *evt)
{
	global_disable(evt);
	__stop_event(evt);
}

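/*
 * Program and start all counters up front, then enable them as a group
 * inside loop() (via the cntrs mask and GLOBAL_CTRL, when available) so
 * the measured window is as tight as possible.
 */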
static noinline void measure_many(pmu_counter_t *evt, int count)
{
	int i;
	u64 cntrs = 0;

	for (i = 0; i < count; i++) {
		__start_event(&evt[i], 0);
		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
	}
	loop(cntrs);
	for (i = 0; i < count; i++)
		__stop_event(&evt[i]);
}

static void measure_one(pmu_counter_t *evt)
{
	measure_many(evt, 1);
}

static noinline void __measure(pmu_counter_t *evt, uint64_t count)
{
	u64 cntrs = BIT_ULL(event_to_global_idx(evt));

	__start_event(evt, count);
	loop(cntrs);
	__stop_event(evt);
}

static bool verify_event(uint64_t count, struct pmu_event *e)
{
	bool pass;

	if (!e)
		return false;

	pass = count >= e->min && count <= e->max;
	if (!pass)
		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);

	return pass;
}

static bool verify_counter(pmu_counter_t *cnt)
{
	return verify_event(cnt->count, get_counter_event(cnt));
}

static void check_gp_counter(struct pmu_event *evt)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
	};
	int i;

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);
		measure_one(&cnt);
		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
	}
}

static void check_gp_counters(void)
{
	int i;

	for (i = 0; i < gp_events_size; i++)
		if (pmu_arch_event_is_available(i))
			check_gp_counter(&gp_events[i]);
		else
			printf("GP event '%s' is disabled\n",
					gp_events[i].name);
}

static void check_fixed_counters(void)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR,
	};
	int i;

	for (i = 0; i < fixed_counters_num; i++) {
		cnt.ctr = fixed_events[i].unit_sel;
		measure_one(&cnt);
		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
	}
}

static struct pmu_event *get_one_event(int idx)
{
	int i;

	if (pmu_arch_event_is_available(idx))
		return &gp_events[idx % gp_events_size];

	for (i = 0; i < gp_events_size; i++) {
		if (pmu_arch_event_is_available(i))
			return &gp_events[i];
	}

	return NULL;
}

static void check_counters_many(void)
{
	struct pmu_event *evt;
	pmu_counter_t cnt[48];
	int i, n;

	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
		evt = get_one_event(i);
		if (!evt)
			continue;

		cnt[n].ctr = MSR_GP_COUNTERx(n);
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel;
		n++;
	}
	for (i = 0; i < fixed_counters_num; i++) {
		cnt[n].ctr = fixed_events[i].unit_sel;
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
		n++;
	}

	assert(n <= ARRAY_SIZE(cnt));
	measure_many(cnt, n);

	for (i = 0; i < n; i++)
		if (!verify_counter(&cnt[i]))
			break;

	report(i == n, "all counters");
}

static uint64_t measure_for_overflow(pmu_counter_t *cnt)
{
	__measure(cnt, 0);
	/*
	 * To generate overflow, i.e. roll over to '0', the initial count just
	 * needs to be preset to the negative expected count.  However, as per
	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
	 * the overflow interrupt is generated immediately instead of possibly
	 * waiting for the overflow to propagate through the counter.
	 */
	assert(cnt->count > 1);
	return 1 - cnt->count;
}

static void check_counter_overflow(void)
{
	int i;
	uint64_t overflow_preset;
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	overflow_preset = measure_for_overflow(&cnt);

	/* clear status before test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	report_prefix_push("overflow");

	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
		uint64_t status;
		int idx;

		cnt.count = overflow_preset;
		if (pmu_use_full_writes())
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;

		if (i == pmu.nr_gp_counters) {
			if (!pmu.is_intel)
				break;

			cnt.ctr = fixed_events[0].unit_sel;
			cnt.count = measure_for_overflow(&cnt);
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
		} else {
			cnt.ctr = MSR_GP_COUNTERx(i);
		}

		if (i % 2)
			cnt.config |= EVNTSEL_INT;
		else
			cnt.config &= ~EVNTSEL_INT;
		idx = event_to_global_idx(&cnt);
		__measure(&cnt, cnt.count);
		if (pmu.is_intel)
			report(cnt.count == 1, "cntr-%d", i);
		else
			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);

		if (!this_cpu_has_perf_global_status())
			continue;

		status = rdmsr(pmu.msr_global_status);
		report(status & (1ull << idx), "status-%d", i);
		wrmsr(pmu.msr_global_status_clr, status);
		status = rdmsr(pmu.msr_global_status);
		report(!(status & (1ull << idx)), "status clear-%d", i);
		report(check_irq() == (i % 2), "irq-%d", i);
	}

	report_prefix_pop();
}

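/*
 * With CMASK = 2, the counter increments only on cycles in which at least
 * two instructions retire, which must be strictly less than the total
 * number of retired instructions.
 */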
static void check_gp_counter_cmask(void)
{
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
	measure_one(&cnt);
	report(cnt.count < gp_events[instruction_idx].min, "cmask");
}

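/*
 * RDPMC "fast" read mode: ECX bit 31 requests only the low 32 bits of the
 * counter, and bit 30 selects the fixed-counter space.  Fast mode is not
 * guaranteed to be supported, so the caller treats a #GP as "unsupported"
 * rather than a failure.
 */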
static void do_rdpmc_fast(void *ptr)
{
	pmu_counter_t *cnt = ptr;
	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);

	if (!is_gp(cnt))
		idx |= 1 << 30;

	cnt->count = rdpmc(idx);
}


static void check_rdpmc(void)
{
	uint64_t val = 0xff0123456789ull;
	bool exc;
	int i;

	report_prefix_push("rdpmc");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		uint64_t x;
		pmu_counter_t cnt = {
			.ctr = MSR_GP_COUNTERx(i),
			.idx = i
		};

		/*
		 * Without full-width writes, only the low 32 bits are writable,
		 * and the value is sign-extended.
		 */
		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
			x = (uint64_t)(int64_t)(int32_t)val;
		else
			x = (uint64_t)(int64_t)val;

		/* Mask according to the number of supported bits */
		x &= (1ull << pmu.gp_counter_width) - 1;

		wrmsr(MSR_GP_COUNTERx(i), val);
		report(rdpmc(i) == x, "cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fast-%d", i);
		else
			report(cnt.count == (u32)val, "fast-%d", i);
	}
	for (i = 0; i < fixed_counters_num; i++) {
		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
		pmu_counter_t cnt = {
			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
			.idx = i
		};

		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fixed fast-%d", i);
		else
			report(cnt.count == (u32)x, "fixed fast-%d", i);
	}

	report_prefix_pop();
}

static void check_running_counter_wrmsr(void)
{
	uint64_t status;
	uint64_t count;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;

	pmu_counter_t evt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel,
	};

	report_prefix_push("running counter wrmsr");

	start_event(&evt);
	__loop();
	wrmsr(MSR_GP_COUNTERx(0), 0);
	stop_event(&evt);
	report(evt.count < gp_events[instruction_idx].min, "cntr");

	/* clear status before overflow test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	start_event(&evt);

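	/*
	 * Preload the running counter with -1 (clamped to the counter width
	 * for full-width writes) so the very next event overflows it and
	 * sets the corresponding GLOBAL_STATUS bit.
	 */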
	count = -1;
	if (pmu_use_full_writes())
		count &= (1ull << pmu.gp_counter_width) - 1;

	wrmsr(MSR_GP_COUNTERx(0), count);

	__loop();
	stop_event(&evt);

	if (this_cpu_has_perf_global_status()) {
		status = rdmsr(pmu.msr_global_status);
		report(status & 1, "status msr bit");
	}

	report_prefix_pop();
}

static void check_emulated_instr(void)
{
	u32 eax, edx, ecx;
	uint64_t status, instr_start, brnch_start;
	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
	unsigned int branch_idx = pmu.is_intel ?
				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;

	pmu_counter_t brnch_cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		/* branch instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
	};
	pmu_counter_t instr_cnt = {
		.ctr = MSR_GP_COUNTERx(1),
		/* instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
	};
	report_prefix_push("emulated instruction");

	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	__start_event(&brnch_cnt, 0);
	__start_event(&instr_cnt, 0);

	brnch_start = -KVM_FEP_BRANCHES;
	instr_start = -KVM_FEP_INSNS;
	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);

	if (this_cpu_has_perf_global_ctrl()) {
		eax = BIT(0) | BIT(1);
		ecx = pmu.msr_global_ctl;
		edx = 0;
		kvm_fep_asm("wrmsr");
	} else {
		eax = ecx = edx = 0;
		kvm_fep_asm("nop");
	}

	__stop_event(&brnch_cnt);
	__stop_event(&instr_cnt);

	// Check that the end count - start count is at least the expected
	// number of instructions and branches.
	if (this_cpu_has_perf_global_ctrl()) {
		report(instr_cnt.count - instr_start == KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES,
		       "branch count");
	} else {
		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES,
		       "branch count");
	}

	if (this_cpu_has_perf_global_status()) {
		// Additionally check that those counters overflowed properly.
		status = rdmsr(pmu.msr_global_status);
		report(status & BIT_ULL(0), "branch counter overflow");
		report(status & BIT_ULL(1), "instruction counter overflow");
	}

	report_prefix_pop();
}

#define XBEGIN_STARTED (~0u)
static void check_tsx_cycles(void)
{
	pmu_counter_t cnt;
	unsigned int i, ret = 0;

	if (!this_cpu_has(X86_FEATURE_RTM))
		return;

	report_prefix_push("TSX cycles");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);

		if (i == 2) {
			/* Transactional cycles committed only on gp counter 2 */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
		} else {
			/* Transactional cycles */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
		}

		start_event(&cnt);

		asm volatile("xbegin 1f\n\t"
				"1:\n\t"
				: "+a" (ret) :: "memory");

		/* Generate a non-canonical #GP to trigger ABORT. */
		if (ret == XBEGIN_STARTED)
			*(int *)NONCANONICAL = 0;

		stop_event(&cnt);

		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
	}

	report_prefix_pop();
}

static void warm_up(void)
{
	int i;

	/*
	 * The cycles event is always run as the first event, so the caches
	 * are still cold and the measured cycles value may exceed the
	 * pre-defined upper boundary, causing a false positive.  To avoid
	 * this, run a warm-up loop before the real verification.
	 */
	for (i = 0; i < 10; i++)
		loop(0);
}

static void check_counters(void)
{
	if (is_fep_available)
		check_emulated_instr();

	warm_up();
	check_gp_counters();
	check_fixed_counters();
	check_rdpmc();
	check_counters_many();
	check_counter_overflow();
	check_gp_counter_cmask();
	check_running_counter_wrmsr();
	check_tsx_cycles();
}

static void do_unsupported_width_counter_write(void *index)
{
	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
}

static void check_gp_counters_write_width(void)
{
	u64 val_64 = 0xffffff0123456789ull;
	u64 val_32 = val_64 & ((1ull << 32) - 1);
	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
	int i;

	/*
	 * MSR_IA32_PERFCTRn supports 64-bit writes,
	 * but only the lowest 32 bits are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
	}

	/*
	 * MSR_IA32_PMCn supports writing values up to GP counter width,
	 * and only the lowest bits of GP counter width are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PMC0 + i, val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);

		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);

		report(test_for_exception(GP_VECTOR,
			do_unsupported_width_counter_write, &i),
		"writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
	}
}

/*
 * Per the SDM, reference cycles are currently implemented using the
 * core crystal clock, TSC, or bus clock. Calibrate to the TSC
 * frequency to set reasonable expectations.
 */
static void set_ref_cycle_expectations(void)
{
	pmu_counter_t cnt = {
		.ctr = MSR_IA32_PERFCTR0,
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
	};
	uint64_t tsc_delta;
	uint64_t t0, t1, t2, t3;

	/* Bit 2 enumerates the availability of reference cycles events. */
	if (!pmu.nr_gp_counters || !pmu_arch_event_is_available(2))
		return;

	t0 = fenced_rdtsc();
	start_event(&cnt);
	t1 = fenced_rdtsc();

	/*
	 * This loop has to run long enough to dominate the VM-exit
	 * costs for playing with the PMU MSRs on start and stop.
	 *
	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
	 * the core crystal clock, this function calculated a guest
	 * TSC : ref cycles ratio of around 105 with ECX initialized
	 * to one billion.
	 */
	asm volatile("loop ." : "+c"((int){1000000000ull}));

	t2 = fenced_rdtsc();
	stop_event(&cnt);
	t3 = fenced_rdtsc();

	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;

	if (!tsc_delta)
		return;

	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
}

static void check_invalid_rdpmc_gp(void)
{
	uint64_t val;

	report(rdpmc_safe(64, &val) == GP_VECTOR,
	       "Expected #GP on RDPMC(64)");
}

int main(int ac, char **av)
{
	int instruction_idx;
	int branch_idx;
	int branch_miss_idx;

	setup_vm();
	handle_irq(PMI_VECTOR, cnt_overflow);
	buf = malloc(N*64);

	if (this_cpu_has_perf_global_ctrl())
		wrmsr(pmu.msr_global_ctl, 0);

	check_invalid_rdpmc_gp();

	if (pmu.is_intel) {
		if (!pmu.version) {
			report_skip("No Intel Arch PMU is detected!");
			return report_summary();
		}
		gp_events = (struct pmu_event *)intel_gp_events;
		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
		instruction_idx = INTEL_INSTRUCTIONS_IDX;
		branch_idx = INTEL_BRANCHES_IDX;
		branch_miss_idx = INTEL_BRANCH_MISS_IDX;

		/*
		 * For legacy Intel CPUs without clflush/clflushopt support,
		 * there is no way to force an LLC miss, so set the minimum
		 * value to 0 to avoid false positives.
		 */
		if (!this_cpu_has(X86_FEATURE_CLFLUSH))
			gp_events[INTEL_LLC_MISSES_IDX].min = 0;

		report_prefix_push("Intel");
		set_ref_cycle_expectations();
	} else {
		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
		gp_events = (struct pmu_event *)amd_gp_events;
		instruction_idx = AMD_INSTRUCTIONS_IDX;
		branch_idx = AMD_BRANCHES_IDX;
		branch_miss_idx = AMD_BRANCH_MISS_IDX;
		report_prefix_push("AMD");
	}
	adjust_events_range(gp_events, instruction_idx, branch_idx, branch_miss_idx);

	printf("PMU version:         %d\n", pmu.version);
	printf("GP counters:         %d\n", pmu.nr_gp_counters);
	printf("GP counter width:    %d\n", pmu.gp_counter_width);
	printf("Event Mask length:   %d\n", pmu.arch_event_mask_length);
	printf("Arch Events (mask):  0x%x\n", pmu.arch_event_available);
	printf("Fixed counters:      %d\n", pmu.nr_fixed_counters);
	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);

	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
		report_info("Fixed counters number %d > defined fixed events %u.  "
			    "Please update test case.", pmu.nr_fixed_counters,
			    (unsigned)ARRAY_SIZE(fixed_events));

	apic_write(APIC_LVTPC, PMI_VECTOR);

	check_counters();

	if (pmu_has_full_writes()) {
		pmu.msr_gp_counter_base = MSR_IA32_PMC0;

		report_prefix_push("full-width writes");
		check_counters();
		check_gp_counters_write_width();
		report_prefix_pop();
	}

	if (!pmu.is_intel) {
		report_prefix_push("K7");
		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
		check_counters();
		report_prefix_pop();
	}

	return report_summary();
}