xref: /kvm-unit-tests/x86/pmu.c (revision 28437cdbec8b64bd7b761d37da584fbd4378818e)
1 
2 #include "x86/msr.h"
3 #include "x86/processor.h"
4 #include "x86/pmu.h"
5 #include "x86/apic-defs.h"
6 #include "x86/apic.h"
7 #include "x86/desc.h"
8 #include "x86/isr.h"
9 #include "vmalloc.h"
10 #include "alloc.h"
11 
12 #include "libcflat.h"
13 #include <stdint.h>
14 
15 #define N 1000000
16 
17 // These values match the number of instructions and branches in the
18 // assembly block in check_emulated_instr().
19 #define EXPECTED_INSTR 17
20 #define EXPECTED_BRNCH 5
21 
22 #define IBPB_JMP_INSNS		9
23 #define IBPB_JMP_BRANCHES	2
24 
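/*
 * Note: when the _wrmsr slot is "wrmsr", this writes EDX:EAX = 1 to MSR 73
 * (0x49, MSR_IA32_PRED_CMD), issuing an indirect branch prediction barrier.
 * The call/pop/add computes the address of the "2:" label and the indirect
 * jmp lands there, skipping the intervening nop.  The executed path is 9
 * instructions and 2 branches (the call and the indirect jmp), which is what
 * IBPB_JMP_INSNS and IBPB_JMP_BRANCHES account for.
 */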
25 #if defined(__i386__) || defined(_M_IX86) /* i386 */
26 #define IBPB_JMP_ASM(_wrmsr)				\
27 	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
28 	"mov $73, %%ecx;\n\t"				\
29 	_wrmsr "\n\t"					\
30 	"call 1f\n\t"					\
31 	"1: pop %%eax\n\t"				\
32 	"add $(2f-1b), %%eax\n\t"			\
33 	"jmp *%%eax;\n\t"                               \
34 	"nop;\n\t"					\
35 	"2: nop;\n\t"
36 #else /* x86_64 */
37 #define IBPB_JMP_ASM(_wrmsr)				\
38 	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
39 	"mov $73, %%ecx;\n\t"				\
40 	_wrmsr "\n\t"					\
41 	"call 1f\n\t"					\
42 	"1: pop %%rax\n\t"				\
43 	"add $(2f-1b), %%rax\n\t"                       \
44 	"jmp *%%rax;\n\t"                               \
45 	"nop;\n\t"					\
46 	"2: nop;\n\t"
47 #endif
48 
49 /* GLOBAL_CTRL enable + disable + clflush/mfence + IBPB_JMP */
50 #define EXTRA_INSNS  (3 + 3 + 2 + IBPB_JMP_INSNS)
51 #define LOOP_INSNS   (N * 10 + EXTRA_INSNS)
52 #define LOOP_BRANCHES  (N + IBPB_JMP_BRANCHES)
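/*
 * Each of the N iterations of the loop body below executes 10 instructions
 * (mov, add, 7 nops, loop); EXTRA_INSNS covers the enable/disable sequences,
 * clflush/mfence, and the IBPB_JMP block.  LOOP_BRANCHES is the N executions
 * of the "loop" instruction plus the two branches in IBPB_JMP_ASM.
 */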
53 #define LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)				\
54 	_wrmsr1 "\n\t"							\
55 	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t"			\
56 	_clflush "\n\t"                                 		\
57 	"mfence;\n\t"                                   		\
58 	"1: mov (%1), %2; add $64, %1;\n\t"				\
59 	"nop; nop; nop; nop; nop; nop; nop;\n\t"			\
60 	"loop 1b;\n\t"							\
61 	IBPB_JMP_ASM(_wrmsr2) 						\
62 	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t"	\
63 	_wrmsr1 "\n\t"
64 
65 #define _loop_asm(_wrmsr1, _clflush, _wrmsr2)			\
66 do {								\
67 	asm volatile(LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)	\
68 		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)	\
69 		     : "a"(eax), "d"(edx), "c"(global_ctl),	\
70 		       "0"(N), "1"(buf)				\
71 		     : "edi");					\
72 } while (0)
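/*
 * Constraint map for the blob above: EAX/EDX hold the value written by the
 * wrmsr slots (when used), ECX is loaded with global_ctl (the GLOBAL_CTRL MSR
 * index in the precise case) and saved in EDI across the loop, EBX is
 * preloaded with N and copied into ECX as the "loop" counter, %1 walks the
 * buffer, and %2 is a scratch destination for the loads.
 */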
73 
74 typedef struct {
75 	uint32_t ctr;
76 	uint32_t idx;
77 	uint64_t config;
78 	uint64_t count;
79 } pmu_counter_t;
80 
81 struct pmu_event {
82 	const char *name;
83 	uint32_t unit_sel;
84 	int min;
85 	int max;
86 } intel_gp_events[] = {
87 	{"core cycles", 0x003c, 1*N, 50*N},
88 	{"instructions", 0x00c0, 10*N, 10.2*N},
89 	{"ref cycles", 0x013c, 1*N, 30*N},
90 	{"llc references", 0x4f2e, 1, 2*N},
91 	{"llc misses", 0x412e, 1, 1*N},
92 	{"branches", 0x00c4, 1*N, 1.1*N},
93 	{"branch misses", 0x00c5, 1, 0.1*N},
94 }, amd_gp_events[] = {
95 	{"core cycles", 0x0076, 1*N, 50*N},
96 	{"instructions", 0x00c0, 10*N, 10.2*N},
97 	{"branches", 0x00c2, 1*N, 1.1*N},
98 	{"branch misses", 0x00c3, 1, 0.1*N},
99 }, fixed_events[] = {
100 	{"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
101 	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
102 	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
103 };
104 
105 /*
106  * Indices into intel_gp_events[]; keep this enum consistent with the order
107  * of the entries in intel_gp_events[].
108  */
109 enum {
110 	INTEL_INSTRUCTIONS_IDX  = 1,
111 	INTEL_REF_CYCLES_IDX	= 2,
112 	INTEL_LLC_MISSES_IDX	= 4,
113 	INTEL_BRANCHES_IDX	= 5,
114 	INTEL_BRANCH_MISS_IDX	= 6,
115 };
116 
117 /*
118  * Indices into amd_gp_events[]; keep this enum consistent with the order
119  * of the entries in amd_gp_events[].
120  */
121 enum {
122 	AMD_INSTRUCTIONS_IDX    = 1,
123 	AMD_BRANCHES_IDX	= 2,
124 	AMD_BRANCH_MISS_IDX	= 3,
125 };
126 
127 char *buf;
128 
129 static struct pmu_event *gp_events;
130 static unsigned int gp_events_size;
131 static unsigned int fixed_counters_num;
132 
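/*
 * IBPB is usable if the CPU enumerates either IBRS/IBPB (SPEC_CTRL) or AMD's
 * standalone IBPB bit; either one implies MSR_IA32_PRED_CMD exists, which
 * IBPB_JMP_ASM writes to flush the indirect branch predictors and force a
 * branch miss.
 */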
133 static int has_ibpb(void)
134 {
135 	return this_cpu_has(X86_FEATURE_SPEC_CTRL) ||
136 	       this_cpu_has(X86_FEATURE_AMD_IBPB);
137 }
138 
139 static inline void __loop(void)
140 {
141 	unsigned long tmp, tmp2, tmp3;
142 	u32 global_ctl = 0;
143 	u32 eax = 0;
144 	u32 edx = 0;
145 
146 	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
147 		_loop_asm("nop", "clflush (%1)", "wrmsr");
148 	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
149 		_loop_asm("nop", "clflush (%1)", "nop");
150 	else if (has_ibpb())
151 		_loop_asm("nop", "nop", "wrmsr");
152 	else
153 		_loop_asm("nop", "nop", "nop");
154 }
155 
156 /*
157  * Enable and disable the counters within a single asm blob to ensure that
158  * no other instructions are counted in the window between enabling the
159  * counters and actually executing the LOOP_ASM code.  This lets the
160  * instructions and branches events be verified against precise counts
161  * instead of a rough valid count range.
162  */
163 static inline void __precise_loop(u64 cntrs)
164 {
165 	unsigned long tmp, tmp2, tmp3;
166 	u32 global_ctl = pmu.msr_global_ctl;
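	/*
	 * WRMSR takes the MSR index in ECX and the 64-bit value in EDX:EAX, so
	 * split the counter-enable bitmask for the GLOBAL_CTRL writes in
	 * LOOP_ASM accordingly.
	 */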
167 	u32 eax = cntrs & (BIT_ULL(32) - 1);
168 	u32 edx = cntrs >> 32;
169 
170 	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
171 		_loop_asm("wrmsr", "clflush (%1)", "wrmsr");
172 	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
173 		_loop_asm("wrmsr", "clflush (%1)", "nop");
174 	else if (has_ibpb())
175 		_loop_asm("wrmsr", "nop", "wrmsr");
176 	else
177 		_loop_asm("wrmsr", "nop", "nop");
178 }
179 
180 static inline void loop(u64 cntrs)
181 {
182 	if (!this_cpu_has_perf_global_ctrl())
183 		__loop();
184 	else
185 		__precise_loop(cntrs);
186 }
187 
188 static void adjust_events_range(struct pmu_event *gp_events,
189 				int instruction_idx, int branch_idx,
190 				int branch_miss_idx)
191 {
192 	/*
193 	 * If HW supports the GLOBAL_CTRL MSR, enabling and disabling of the PMCs
194 	 * is moved into __precise_loop(). Thus, the instructions and branches events
195 	 * can be verified against a precise count instead of a rough range.
196 	 *
197 	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
198 	 * instruction in guest context, which leads to intermittent failures
199 	 * as the counts will vary depending on how many asynchronous VM-Exits
200 	 * occur while running the measured code, e.g. if the host takes IRQs.
201 	 */
202 	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
203 		gp_events[instruction_idx].min = LOOP_INSNS;
204 		gp_events[instruction_idx].max = LOOP_INSNS;
205 		gp_events[branch_idx].min = LOOP_BRANCHES;
206 		gp_events[branch_idx].max = LOOP_BRANCHES;
207 	}
208 
209 	/*
210 	 * For CPUs without IBPB support, there is no way to force a branch miss,
211 	 * and the measured branch miss count may legitimately be 0.  Thus,
212 	 * overwrite the lower bound of the branch misses event with 0 to avoid
213 	 * a false positive.
214 	 */
215 	if (!has_ibpb())
216 		gp_events[branch_miss_idx].min = 0;
217 }
218 
219 volatile uint64_t irq_received;
220 
221 static void cnt_overflow(isr_regs_t *regs)
222 {
223 	irq_received++;
224 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
225 	apic_write(APIC_EOI, 0);
226 }
227 
228 static bool check_irq(void)
229 {
230 	int i;
231 	irq_received = 0;
232 	sti();
233 	for (i = 0; i < 100000 && !irq_received; i++)
234 		asm volatile("pause");
235 	cli();
236 	return irq_received;
237 }
238 
239 static bool is_gp(pmu_counter_t *evt)
240 {
241 	if (!pmu.is_intel)
242 		return true;
243 
244 	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
245 		evt->ctr >= MSR_IA32_PMC0;
246 }
247 
248 static int event_to_global_idx(pmu_counter_t *cnt)
249 {
250 	if (pmu.is_intel)
251 		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
252 			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
253 
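	/*
	 * In the AMD F15H (PerfCtrExtCore) MSR space, event select and counter
	 * MSRs are interleaved (PERF_CTL0, PERF_CTR0, PERF_CTL1, PERF_CTR1, ...),
	 * so counter MSRs are two apart and the global index is the offset / 2.
	 */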
254 	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
255 		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
256 	else
257 		return cnt->ctr - pmu.msr_gp_counter_base;
258 }
259 
260 static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
261 {
262 	if (is_gp(cnt)) {
263 		int i;
264 
265 		for (i = 0; i < gp_events_size; i++)
266 			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
267 				return &gp_events[i];
268 	} else {
269 		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;
270 
271 		if (idx < ARRAY_SIZE(fixed_events))
272 			return &fixed_events[idx];
273 	}
274 
275 	return (void*)0;
276 }
277 
278 static void global_enable(pmu_counter_t *cnt)
279 {
280 	if (!this_cpu_has_perf_global_ctrl())
281 		return;
282 
283 	cnt->idx = event_to_global_idx(cnt);
284 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
285 }
286 
287 static void global_disable(pmu_counter_t *cnt)
288 {
289 	if (!this_cpu_has_perf_global_ctrl())
290 		return;
291 
292 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
293 }
294 
295 static void __start_event(pmu_counter_t *evt, uint64_t count)
296 {
297     evt->count = count;
298     wrmsr(evt->ctr, evt->count);
299     if (is_gp(evt)) {
300 	    wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
301 		  evt->config | EVNTSEL_EN);
302     } else {
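	    /*
	     * Each fixed counter has a 4-bit field in FIXED_CTR_CTRL: bit 0
	     * enables counting in ring 0 (OS), bit 1 in rings > 0 (USR), and
	     * bit 3 enables a PMI on overflow.
	     */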
303 	    uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
304 	    int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
305 	    uint32_t usrospmi = 0;
306 
307 	    if (evt->config & EVNTSEL_OS)
308 		    usrospmi |= (1 << 0);
309 	    if (evt->config & EVNTSEL_USR)
310 		    usrospmi |= (1 << 1);
311 	    if (evt->config & EVNTSEL_INT)
312 		    usrospmi |= (1 << 3); // PMI on overflow
313 	    ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
314 	    wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
315     }
316     apic_write(APIC_LVTPC, PMI_VECTOR);
317 }
318 
319 static void start_event(pmu_counter_t *evt)
320 {
321 	__start_event(evt, 0);
322 	global_enable(evt);
323 }
324 
325 static void __stop_event(pmu_counter_t *evt)
326 {
327 	if (is_gp(evt)) {
328 		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
329 		      evt->config & ~EVNTSEL_EN);
330 	} else {
331 		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
332 		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
333 		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
334 	}
335 	evt->count = rdmsr(evt->ctr);
336 }
337 
338 static void stop_event(pmu_counter_t *evt)
339 {
340 	global_disable(evt);
341 	__stop_event(evt);
342 }
343 
344 static noinline void measure_many(pmu_counter_t *evt, int count)
345 {
346 	int i;
347 	u64 cntrs = 0;
348 
349 	for (i = 0; i < count; i++) {
350 		__start_event(&evt[i], 0);
351 		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
352 	}
353 	loop(cntrs);
354 	for (i = 0; i < count; i++)
355 		__stop_event(&evt[i]);
356 }
357 
358 static void measure_one(pmu_counter_t *evt)
359 {
360 	measure_many(evt, 1);
361 }
362 
363 static noinline void __measure(pmu_counter_t *evt, uint64_t count)
364 {
365 	u64 cntrs = BIT_ULL(event_to_global_idx(evt));
366 
367 	__start_event(evt, count);
368 	loop(cntrs);
369 	__stop_event(evt);
370 }
371 
372 static bool verify_event(uint64_t count, struct pmu_event *e)
373 {
374 	bool pass;
375 
376 	if (!e)
377 		return false;
378 
379 	pass = count >= e->min && count <= e->max;
380 	if (!pass)
381 		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);
382 
383 	return pass;
384 }
385 
386 static bool verify_counter(pmu_counter_t *cnt)
387 {
388 	return verify_event(cnt->count, get_counter_event(cnt));
389 }
390 
391 static void check_gp_counter(struct pmu_event *evt)
392 {
393 	pmu_counter_t cnt = {
394 		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
395 	};
396 	int i;
397 
398 	for (i = 0; i < pmu.nr_gp_counters; i++) {
399 		cnt.ctr = MSR_GP_COUNTERx(i);
400 		measure_one(&cnt);
401 		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
402 	}
403 }
404 
405 static void check_gp_counters(void)
406 {
407 	int i;
408 
409 	for (i = 0; i < gp_events_size; i++)
410 		if (pmu_gp_counter_is_available(i))
411 			check_gp_counter(&gp_events[i]);
412 		else
413 			printf("GP event '%s' is disabled\n",
414 					gp_events[i].name);
415 }
416 
417 static void check_fixed_counters(void)
418 {
419 	pmu_counter_t cnt = {
420 		.config = EVNTSEL_OS | EVNTSEL_USR,
421 	};
422 	int i;
423 
424 	for (i = 0; i < fixed_counters_num; i++) {
425 		cnt.ctr = fixed_events[i].unit_sel;
426 		measure_one(&cnt);
427 		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
428 	}
429 }
430 
431 static void check_counters_many(void)
432 {
433 	pmu_counter_t cnt[48];
434 	int i, n;
435 
436 	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
437 		if (!pmu_gp_counter_is_available(i))
438 			continue;
439 
440 		cnt[n].ctr = MSR_GP_COUNTERx(n);
441 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR |
442 			gp_events[i % gp_events_size].unit_sel;
443 		n++;
444 	}
445 	for (i = 0; i < fixed_counters_num; i++) {
446 		cnt[n].ctr = fixed_events[i].unit_sel;
447 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
448 		n++;
449 	}
450 
451 	assert(n <= ARRAY_SIZE(cnt));
452 	measure_many(cnt, n);
453 
454 	for (i = 0; i < n; i++)
455 		if (!verify_counter(&cnt[i]))
456 			break;
457 
458 	report(i == n, "all counters");
459 }
460 
461 static uint64_t measure_for_overflow(pmu_counter_t *cnt)
462 {
463 	__measure(cnt, 0);
464 	/*
465 	 * To generate overflow, i.e. roll over to '0', the initial count just
466 	 * needs to be preset to the negative expected count.  However, as per
467 	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
468 	 * the overflow interrupt is generated immediately instead of possibly
469 	 * waiting for the overflow to propagate through the counter.
470 	 */
471 	assert(cnt->count > 1);
472 	return 1 - cnt->count;
473 }
474 
475 static void check_counter_overflow(void)
476 {
477 	int i;
478 	uint64_t overflow_preset;
479 	int instruction_idx = pmu.is_intel ?
480 			      INTEL_INSTRUCTIONS_IDX :
481 			      AMD_INSTRUCTIONS_IDX;
482 
483 	pmu_counter_t cnt = {
484 		.ctr = MSR_GP_COUNTERx(0),
485 		.config = EVNTSEL_OS | EVNTSEL_USR |
486 			  gp_events[instruction_idx].unit_sel /* instructions */,
487 	};
488 	overflow_preset = measure_for_overflow(&cnt);
489 
490 	/* clear status before test */
491 	if (this_cpu_has_perf_global_status())
492 		pmu_clear_global_status();
493 
494 	report_prefix_push("overflow");
495 
496 	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
497 		uint64_t status;
498 		int idx;
499 
500 		cnt.count = overflow_preset;
501 		if (pmu_use_full_writes())
502 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
503 
504 		if (i == pmu.nr_gp_counters) {
505 			if (!pmu.is_intel)
506 				break;
507 
508 			cnt.ctr = fixed_events[0].unit_sel;
509 			cnt.count = measure_for_overflow(&cnt);
510 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
511 		} else {
512 			cnt.ctr = MSR_GP_COUNTERx(i);
513 		}
514 
515 		if (i % 2)
516 			cnt.config |= EVNTSEL_INT;
517 		else
518 			cnt.config &= ~EVNTSEL_INT;
519 		idx = event_to_global_idx(&cnt);
520 		__measure(&cnt, cnt.count);
521 		if (pmu.is_intel)
522 			report(cnt.count == 1, "cntr-%d", i);
523 		else
524 			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);
525 
526 		if (!this_cpu_has_perf_global_status())
527 			continue;
528 
529 		status = rdmsr(pmu.msr_global_status);
530 		report(status & (1ull << idx), "status-%d", i);
531 		wrmsr(pmu.msr_global_status_clr, status);
532 		status = rdmsr(pmu.msr_global_status);
533 		report(!(status & (1ull << idx)), "status clear-%d", i);
534 		report(check_irq() == (i % 2), "irq-%d", i);
535 	}
536 
537 	report_prefix_pop();
538 }
539 
540 static void check_gp_counter_cmask(void)
541 {
542 	int instruction_idx = pmu.is_intel ?
543 			      INTEL_INSTRUCTIONS_IDX :
544 			      AMD_INSTRUCTIONS_IDX;
545 
546 	pmu_counter_t cnt = {
547 		.ctr = MSR_GP_COUNTERx(0),
548 		.config = EVNTSEL_OS | EVNTSEL_USR |
549 			  gp_events[instruction_idx].unit_sel /* instructions */,
550 	};
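	/*
	 * With a non-zero counter mask (CMASK), the PMC increments by one on
	 * each cycle in which at least CMASK events occurred, so with CMASK=2
	 * the result is expected to be far below the raw instruction count.
	 */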
551 	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
552 	measure_one(&cnt);
553 	report(cnt.count < gp_events[instruction_idx].min, "cmask");
554 }
555 
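/*
 * Setting bit 31 of the RDPMC index requests the "fast" read mode, which
 * returns only the low 32 bits of the counter on CPUs that support it and
 * raises #GP otherwise (hence the test_for_exception() wrapper in callers).
 * Bit 30 selects the fixed-function counters.
 */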
556 static void do_rdpmc_fast(void *ptr)
557 {
558 	pmu_counter_t *cnt = ptr;
559 	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);
560 
561 	if (!is_gp(cnt))
562 		idx |= 1 << 30;
563 
564 	cnt->count = rdpmc(idx);
565 }
566 
567 
568 static void check_rdpmc(void)
569 {
570 	uint64_t val = 0xff0123456789ull;
571 	bool exc;
572 	int i;
573 
574 	report_prefix_push("rdpmc");
575 
576 	for (i = 0; i < pmu.nr_gp_counters; i++) {
577 		uint64_t x;
578 		pmu_counter_t cnt = {
579 			.ctr = MSR_GP_COUNTERx(i),
580 			.idx = i
581 		};
582 
583 	        /*
584 	         * Without full-width writes, only the low 32 bits are writable,
585 	         * and the value is sign-extended.
586 	         */
587 		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
588 			x = (uint64_t)(int64_t)(int32_t)val;
589 		else
590 			x = (uint64_t)(int64_t)val;
591 
592 		/* Mask according to the number of supported bits */
593 		x &= (1ull << pmu.gp_counter_width) - 1;
594 
595 		wrmsr(MSR_GP_COUNTERx(i), val);
596 		report(rdpmc(i) == x, "cntr-%d", i);
597 
598 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
599 		if (exc)
600 			report_skip("fast-%d", i);
601 		else
602 			report(cnt.count == (u32)val, "fast-%d", i);
603 	}
604 	for (i = 0; i < fixed_counters_num; i++) {
605 		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
606 		pmu_counter_t cnt = {
607 			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
608 			.idx = i
609 		};
610 
611 		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
612 		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);
613 
614 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
615 		if (exc)
616 			report_skip("fixed fast-%d", i);
617 		else
618 			report(cnt.count == (u32)x, "fixed fast-%d", i);
619 	}
620 
621 	report_prefix_pop();
622 }
623 
624 static void check_running_counter_wrmsr(void)
625 {
626 	uint64_t status;
627 	uint64_t count;
628 	unsigned int instruction_idx = pmu.is_intel ?
629 				       INTEL_INSTRUCTIONS_IDX :
630 				       AMD_INSTRUCTIONS_IDX;
631 
632 	pmu_counter_t evt = {
633 		.ctr = MSR_GP_COUNTERx(0),
634 		.config = EVNTSEL_OS | EVNTSEL_USR |
635 			  gp_events[instruction_idx].unit_sel,
636 	};
637 
638 	report_prefix_push("running counter wrmsr");
639 
640 	start_event(&evt);
641 	__loop();
642 	wrmsr(MSR_GP_COUNTERx(0), 0);
643 	stop_event(&evt);
644 	report(evt.count < gp_events[instruction_idx].min, "cntr");
645 
646 	/* clear status before overflow test */
647 	if (this_cpu_has_perf_global_status())
648 		pmu_clear_global_status();
649 
650 	start_event(&evt);
651 
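	/*
	 * Write an all-ones value (clamped to the writable width) so that the
	 * very next event overflows the running counter and sets bit 0 of
	 * GLOBAL_STATUS, which is checked below.
	 */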
652 	count = -1;
653 	if (pmu_use_full_writes())
654 		count &= (1ull << pmu.gp_counter_width) - 1;
655 
656 	wrmsr(MSR_GP_COUNTERx(0), count);
657 
658 	__loop();
659 	stop_event(&evt);
660 
661 	if (this_cpu_has_perf_global_status()) {
662 		status = rdmsr(pmu.msr_global_status);
663 		report(status & 1, "status msr bit");
664 	}
665 
666 	report_prefix_pop();
667 }
668 
669 static void check_emulated_instr(void)
670 {
671 	uint64_t status, instr_start, brnch_start;
672 	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
673 	unsigned int branch_idx = pmu.is_intel ?
674 				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
675 	unsigned int instruction_idx = pmu.is_intel ?
676 				       INTEL_INSTRUCTIONS_IDX :
677 				       AMD_INSTRUCTIONS_IDX;
678 	pmu_counter_t brnch_cnt = {
679 		.ctr = MSR_GP_COUNTERx(0),
680 		/* branch instructions */
681 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
682 	};
683 	pmu_counter_t instr_cnt = {
684 		.ctr = MSR_GP_COUNTERx(1),
685 		/* instructions */
686 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
687 	};
688 	report_prefix_push("emulated instruction");
689 
690 	if (this_cpu_has_perf_global_status())
691 		pmu_clear_global_status();
692 
693 	start_event(&brnch_cnt);
694 	start_event(&instr_cnt);
695 
696 	brnch_start = -EXPECTED_BRNCH;
697 	instr_start = -EXPECTED_INSTR;
698 	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
699 	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);
700 	// KVM_FEP is a magic prefix that forces emulation so
701 	// 'KVM_FEP "jne label\n"' just counts as a single instruction.
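	// The block is EXPECTED_INSTR (17) instructions: mov + cmp, the five
	// emulated jne's, and five mov+cpuid pairs; the five emulated jne's are
	// the EXPECTED_BRNCH (5) branches.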
702 	asm volatile(
703 		"mov $0x0, %%eax\n"
704 		"cmp $0x0, %%eax\n"
705 		KVM_FEP "jne label\n"
706 		KVM_FEP "jne label\n"
707 		KVM_FEP "jne label\n"
708 		KVM_FEP "jne label\n"
709 		KVM_FEP "jne label\n"
710 		"mov $0xa, %%eax\n"
711 		"cpuid\n"
712 		"mov $0xa, %%eax\n"
713 		"cpuid\n"
714 		"mov $0xa, %%eax\n"
715 		"cpuid\n"
716 		"mov $0xa, %%eax\n"
717 		"cpuid\n"
718 		"mov $0xa, %%eax\n"
719 		"cpuid\n"
720 		"label:\n"
721 		:
722 		:
723 		: "eax", "ebx", "ecx", "edx");
724 
725 	if (this_cpu_has_perf_global_ctrl())
726 		wrmsr(pmu.msr_global_ctl, 0);
727 
728 	stop_event(&brnch_cnt);
729 	stop_event(&instr_cnt);
730 
731 	// Check that the end count - start count is at least the expected
732 	// number of instructions and branches.
733 	report(instr_cnt.count - instr_start >= EXPECTED_INSTR,
734 	       "instruction count");
735 	report(brnch_cnt.count - brnch_start >= EXPECTED_BRNCH,
736 	       "branch count");
737 	if (this_cpu_has_perf_global_status()) {
738 		// Additionally check that those counters overflowed properly.
739 		status = rdmsr(pmu.msr_global_status);
740 		report(status & 1, "branch counter overflow");
741 		report(status & 2, "instruction counter overflow");
742 	}
743 
744 	report_prefix_pop();
745 }
746 
747 #define XBEGIN_STARTED (~0u)
748 static void check_tsx_cycles(void)
749 {
750 	pmu_counter_t cnt;
751 	unsigned int i, ret = 0;
752 
753 	if (!this_cpu_has(X86_FEATURE_RTM))
754 		return;
755 
756 	report_prefix_push("TSX cycles");
757 
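	/*
	 * The raw configs use the architectural cycles event (0x3c) with the
	 * IN_TX bit (bit 32) set to count cycles inside transactional regions;
	 * the counter-2 case additionally sets IN_TXCP (bit 33), which is only
	 * supported on general-purpose counter 2 and filters out cycles spent
	 * in aborted transactions.
	 */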
758 	for (i = 0; i < pmu.nr_gp_counters; i++) {
759 		cnt.ctr = MSR_GP_COUNTERx(i);
760 
761 		if (i == 2) {
762 			/* Transactional cycles committed only on gp counter 2 */
763 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
764 		} else {
765 			/* Transactional cycles */
766 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
767 		}
768 
769 		start_event(&cnt);
770 
771 		asm volatile("xbegin 1f\n\t"
772 				"1:\n\t"
773 				: "+a" (ret) :: "memory");
774 
775 		/* Generate a non-canonical #GP to trigger ABORT. */
776 		if (ret == XBEGIN_STARTED)
777 			*(int *)NONCANONICAL = 0;
778 
779 		stop_event(&cnt);
780 
781 		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
782 	}
783 
784 	report_prefix_pop();
785 }
786 
787 static void warm_up(void)
788 {
789 	int i;
790 
791 	/*
792 	 * Since the cycles event is always run as the first event, the caches are
793 	 * cold on the first measurement, which can cause the measured cycles value
794 	 * to exceed the pre-defined upper boundary and yield a false positive.
795 	 * To avoid this, run a few warm-up iterations before the real
796 	 * verification.
797 	 */
798 	for (i = 0; i < 10; i++)
799 		loop(0);
800 }
801 
802 static void check_counters(void)
803 {
804 	if (is_fep_available())
805 		check_emulated_instr();
806 
807 	warm_up();
808 	check_gp_counters();
809 	check_fixed_counters();
810 	check_rdpmc();
811 	check_counters_many();
812 	check_counter_overflow();
813 	check_gp_counter_cmask();
814 	check_running_counter_wrmsr();
815 	check_tsx_cycles();
816 }
817 
818 static void do_unsupported_width_counter_write(void *index)
819 {
820 	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
821 }
822 
823 static void check_gp_counters_write_width(void)
824 {
825 	u64 val_64 = 0xffffff0123456789ull;
826 	u64 val_32 = val_64 & ((1ull << 32) - 1);
827 	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
828 	int i;
829 
830 	/*
831 	 * MSR_IA32_PERFCTRn supports 64-bit writes,
832 	 * but only the lowest 32 bits are valid.
833 	 */
834 	for (i = 0; i < pmu.nr_gp_counters; i++) {
835 		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
836 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
837 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
838 
839 		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
840 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
841 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
842 
843 		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
844 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
845 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
846 	}
847 
848 	/*
849 	 * MSR_IA32_PMCn supports full-width writes, but only values that fit in
850 	 * the GP counter width are accepted; wider values raise #GP.
851 	 */
852 	for (i = 0; i < pmu.nr_gp_counters; i++) {
853 		wrmsr(MSR_IA32_PMC0 + i, val_32);
854 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
855 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
856 
857 		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
858 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
859 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);
860 
861 		report(test_for_exception(GP_VECTOR,
862 			do_unsupported_width_counter_write, &i),
863 		"writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
864 	}
865 }
866 
867 /*
868  * Per the SDM, reference cycles are currently implemented using the
869  * core crystal clock, TSC, or bus clock. Calibrate to the TSC
870  * frequency to set reasonable expectations.
871  */
872 static void set_ref_cycle_expectations(void)
873 {
874 	pmu_counter_t cnt = {
875 		.ctr = MSR_IA32_PERFCTR0,
876 		.config = EVNTSEL_OS | EVNTSEL_USR |
877 			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
878 	};
879 	uint64_t tsc_delta;
880 	uint64_t t0, t1, t2, t3;
881 
882 	/* Bit 2 enumerates the availability of the reference cycles event. */
883 	if (!pmu.nr_gp_counters || !pmu_gp_counter_is_available(2))
884 		return;
885 
886 	if (this_cpu_has_perf_global_ctrl())
887 		wrmsr(pmu.msr_global_ctl, 0);
888 
889 	t0 = fenced_rdtsc();
890 	start_event(&cnt);
891 	t1 = fenced_rdtsc();
892 
893 	/*
894 	 * This loop has to run long enough to dominate the VM-exit
895 	 * costs for playing with the PMU MSRs on start and stop.
896 	 *
897 	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
898 	 * the core crystal clock, this function calculated a guest
899 	 * TSC : ref cycles ratio of around 105 with ECX initialized
900 	 * to one billion.
901 	 */
902 	asm volatile("loop ." : "+c"((int){1000000000ull}));
903 
904 	t2 = fenced_rdtsc();
905 	stop_event(&cnt);
906 	t3 = fenced_rdtsc();
907 
908 	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;
909 
910 	if (!tsc_delta)
911 		return;
912 
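	/*
	 * Scale the default min/max for the ref cycles event by the measured
	 * ratio of reference cycles (cnt.count) to TSC cycles (tsc_delta).
	 */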
913 	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
914 		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
915 	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
916 		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
917 }
918 
919 static void check_invalid_rdpmc_gp(void)
920 {
921 	uint64_t val;
922 
923 	report(rdpmc_safe(64, &val) == GP_VECTOR,
924 	       "Expected #GP on RDPMC(64)");
925 }
926 
927 int main(int ac, char **av)
928 {
929 	int instruction_idx;
930 	int branch_idx;
931 	int branch_miss_idx;
932 
933 	setup_vm();
934 	handle_irq(PMI_VECTOR, cnt_overflow);
935 	buf = malloc(N*64);
936 
937 	check_invalid_rdpmc_gp();
938 
939 	if (pmu.is_intel) {
940 		if (!pmu.version) {
941 			report_skip("No Intel Arch PMU is detected!");
942 			return report_summary();
943 		}
944 		gp_events = (struct pmu_event *)intel_gp_events;
945 		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
946 		instruction_idx = INTEL_INSTRUCTIONS_IDX;
947 		branch_idx = INTEL_BRANCHES_IDX;
948 		branch_miss_idx = INTEL_BRANCH_MISS_IDX;
949 
950 		/*
951 		 * For legacy Intel CPUs without clflush/clflushopt support,
952 		 * there is no way to force an LLC miss, thus set
953 		 * the minimum value to 0 to avoid false positives.
954 		 */
955 		if (!this_cpu_has(X86_FEATURE_CLFLUSH))
956 			gp_events[INTEL_LLC_MISSES_IDX].min = 0;
957 
958 		report_prefix_push("Intel");
959 		set_ref_cycle_expectations();
960 	} else {
961 		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
962 		gp_events = (struct pmu_event *)amd_gp_events;
963 		instruction_idx = AMD_INSTRUCTIONS_IDX;
964 		branch_idx = AMD_BRANCHES_IDX;
965 		branch_miss_idx = AMD_BRANCH_MISS_IDX;
966 		report_prefix_push("AMD");
967 	}
968 	adjust_events_range(gp_events, instruction_idx, branch_idx, branch_miss_idx);
969 
970 	printf("PMU version:         %d\n", pmu.version);
971 	printf("GP counters:         %d\n", pmu.nr_gp_counters);
972 	printf("GP counter width:    %d\n", pmu.gp_counter_width);
973 	printf("Mask length:         %d\n", pmu.gp_counter_mask_length);
974 	printf("Fixed counters:      %d\n", pmu.nr_fixed_counters);
975 	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);
976 
977 	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
978 	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
979 		report_info("Fixed counters number %d > defined fixed events %u.  "
980 			    "Please update test case.", pmu.nr_fixed_counters,
981 			    (uint32_t)ARRAY_SIZE(fixed_events));
982 
983 	apic_write(APIC_LVTPC, PMI_VECTOR);
984 
985 	check_counters();
986 
987 	if (pmu_has_full_writes()) {
988 		pmu.msr_gp_counter_base = MSR_IA32_PMC0;
989 
990 		report_prefix_push("full-width writes");
991 		check_counters();
992 		check_gp_counters_write_width();
993 		report_prefix_pop();
994 	}
995 
996 	if (!pmu.is_intel) {
997 		report_prefix_push("K7");
998 		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
999 		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
1000 		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
1001 		check_counters();
1002 		report_prefix_pop();
1003 	}
1004 
1005 	return report_summary();
1006 }
1007