xref: /kvm-unit-tests/x86/pmu.c (revision 8dbfe326bec80a779a8bf6c58310514d306f26df)
1 
2 #include "x86/msr.h"
3 #include "x86/processor.h"
4 #include "x86/pmu.h"
5 #include "x86/apic-defs.h"
6 #include "x86/apic.h"
7 #include "x86/desc.h"
8 #include "x86/isr.h"
9 #include "vmalloc.h"
10 #include "alloc.h"
11 
12 #include "libcflat.h"
13 #include <stdint.h>
14 
/*
 * Iteration count of the measured loop in LOOP_ASM; also scales the size of
 * the buffer the loop walks and the expected event count ranges.
 */
#define N 1000000

// These values match the number of instructions and branches in the
// assembly block in check_emulated_instr().
#define EXPECTED_INSTR 17
#define EXPECTED_BRNCH 5

/*
 * Instructions and branches that IBPB_JMP_ASM contributes to EXTRA_INSNS
 * and LOOP_BRANCHES.
 */
#define IBPB_JMP_INSNS		9
#define IBPB_JMP_BRANCHES	2
24 
/*
 * Register used to compute the indirect jump target in IBPB_JMP_ASM: the
 * 32-bit accumulator on i386, the 64-bit accumulator otherwise.
 */
#if defined(__i386__) || defined(_M_IX86) /* i386 */
#define IBPB_JMP_REG	"%%eax"
#else /* x86_64 */
#define IBPB_JMP_REG	"%%rax"
#endif

/*
 * Asm fragment: load EAX = 1, EDX = 0 and ECX = 73 (MSR 0x49), execute
 * @_wrmsr (either "wrmsr" or a placeholder "nop"), then use call/pop to get
 * the address of local label 1, add the distance to label 2 and jump there
 * indirectly, skipping one nop.
 */
#define IBPB_JMP_ASM(_wrmsr)				\
	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
	"mov $73, %%ecx;\n\t"				\
	_wrmsr "\n\t"					\
	"call 1f\n\t"					\
	"1: pop " IBPB_JMP_REG "\n\t"			\
	"add $(2f-1b), " IBPB_JMP_REG "\n\t"		\
	"jmp *" IBPB_JMP_REG ";\n\t"			\
	"nop;\n\t"					\
	"2: nop;\n\t"
48 
/*
 * Instructions LOOP_ASM executes in addition to the N loop iterations:
 * GLOBAL_CTRL enable + disable + clflush/mfence + IBPB_JMP
 */
#define EXTRA_INSNS  (3 + 3 + 2 + IBPB_JMP_INSNS)
/* Each loop iteration is 10 instructions: mov, add, seven nops and loop. */
#define LOOP_INSNS   (N * 10 + EXTRA_INSNS)
/* One "loop" branch per iteration plus the branches in IBPB_JMP_ASM. */
#define LOOP_BRANCHES  (N + IBPB_JMP_BRANCHES)
/*
 * The measured asm blob, built by string concatenation:
 *  - @_wrmsr1 is emitted first and last ("wrmsr" to write the MSR selected
 *    by ECX with EDX:EAX, or "nop"); before the final one EAX and EDX are
 *    zeroed and ECX is restored from EDI.
 *  - ECX is saved in EDI and reloaded from EBX to serve as the counter of
 *    the "loop" instruction.
 *  - @_clflush ("clflush (%1)" or "nop") and mfence run once before the loop.
 *  - Each iteration loads from (%1) into %2 and advances %1 by 64 bytes.
 *  - @_wrmsr2 is forwarded to IBPB_JMP_ASM, which runs after the loop.
 * Operand numbers refer to the constraint list in _loop_asm().
 */
#define LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)				\
	_wrmsr1 "\n\t"							\
	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t"			\
	_clflush "\n\t"                                 		\
	"mfence;\n\t"                                   		\
	"1: mov (%1), %2; add $64, %1;\n\t"				\
	"nop; nop; nop; nop; nop; nop; nop;\n\t"			\
	"loop 1b;\n\t"							\
	IBPB_JMP_ASM(_wrmsr2) 						\
	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t"	\
	_wrmsr1 "\n\t"
64 
/*
 * Emit LOOP_ASM as one volatile asm statement.
 *
 * The expansion refers to locals of the calling function by name: tmp, tmp2,
 * tmp3 (outputs), eax, edx and global_ctl (inputs), plus the file-scope buf.
 * Operand 0 is EBX and starts as N, operand 1 starts as buf, operand 2 is a
 * scratch register; EAX, EDX and ECX are loaded from eax, edx and
 * global_ctl. EDI is declared clobbered.
 *
 * NOTE(review): the asm body also writes EAX, EDX and ECX, which are listed
 * as inputs only, and operand 2 is not marked early-clobber - confirm the
 * constraints are sufficient for the compilers in use.
 */
#define _loop_asm(_wrmsr1, _clflush, _wrmsr2)			\
do {								\
	asm volatile(LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)	\
		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)	\
		     : "a"(eax), "d"(edx), "c"(global_ctl),	\
		       "0"(N), "1"(buf)				\
		     : "edi");					\
} while (0)
73 
/* State of one counter (general-purpose or fixed) used by a test. */
typedef struct {
	uint32_t ctr;		/* MSR address of the counter register */
	uint32_t idx;		/* bit index in the global control MSR, set by global_enable() */
	uint64_t config;	/* EVNTSEL_* flags OR'ed with the event's unit/select code */
	uint64_t count;		/* preset written on start, value read back on stop */
} pmu_counter_t;
80 
81 struct pmu_event {
82 	const char *name;
83 	uint32_t unit_sel;
84 	int min;
85 	int max;
86 } intel_gp_events[] = {
87 	{"core cycles", 0x003c, 1*N, 50*N},
88 	{"instructions", 0x00c0, 10*N, 10.2*N},
89 	{"ref cycles", 0x013c, 1*N, 30*N},
90 	{"llc references", 0x4f2e, 1, 2*N},
91 	{"llc misses", 0x412e, 1, 1*N},
92 	{"branches", 0x00c4, 1*N, 1.1*N},
93 	{"branch misses", 0x00c5, 0, 0.1*N},
94 }, amd_gp_events[] = {
95 	{"core cycles", 0x0076, 1*N, 50*N},
96 	{"instructions", 0x00c0, 10*N, 10.2*N},
97 	{"branches", 0x00c2, 1*N, 1.1*N},
98 	{"branch misses", 0x00c3, 0, 0.1*N},
99 }, fixed_events[] = {
100 	{"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
101 	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
102 	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
103 };
104 
/*
 * Indices of selected events in intel_gp_events[]. These must be kept
 * consistent with the order of the entries in intel_gp_events[].
 */
enum {
	INTEL_INSTRUCTIONS_IDX  = 1,
	INTEL_REF_CYCLES_IDX	= 2,
	INTEL_LLC_MISSES_IDX	= 4,
	INTEL_BRANCHES_IDX	= 5,
};

/*
 * Indices of selected events in amd_gp_events[]. These must be kept
 * consistent with the order of the entries in amd_gp_events[].
 */
enum {
	AMD_INSTRUCTIONS_IDX    = 1,
	AMD_BRANCHES_IDX	= 2,
};
124 
/* Buffer of N * 64 bytes allocated in main() and walked by LOOP_ASM. */
char *buf;

/* Event table for the detected vendor (intel_gp_events or amd_gp_events). */
static struct pmu_event *gp_events;
/* Number of entries in gp_events. */
static unsigned int gp_events_size;
/* Fixed counters under test: min(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events)). */
static unsigned int fixed_counters_num;
130 
131 static int has_ibpb(void)
132 {
133 	return this_cpu_has(X86_FEATURE_SPEC_CTRL) ||
134 	       this_cpu_has(X86_FEATURE_AMD_IBPB);
135 }
136 
/*
 * Run the measured loop without touching the global control MSR: the
 * leading and trailing "wrmsr" slots of LOOP_ASM are filled with "nop".
 * clflush and the IBPB wrmsr are only emitted when the CPU reports the
 * corresponding features; otherwise "nop" placeholders are used.
 *
 * The local names below are referenced by the _loop_asm() macro and must
 * not be renamed.
 */
static inline void __loop(void)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = 0;
	u32 eax = 0;
	u32 edx = 0;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("nop", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("nop", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("nop", "nop", "wrmsr");
	else
		_loop_asm("nop", "nop", "nop");
}
153 
/*
 * Enable and disable counters in a whole asm blob to ensure
 * no other instructions are counted in the window between
 * counters enabling and really LOOP_ASM code executing.
 * Thus counters can verify instructions and branches events
 * against precise counts instead of a rough valid count range.
 *
 * @cntrs is the bitmask written to pmu.msr_global_ctl by the leading wrmsr
 * (low half in EAX, high half in EDX); the trailing wrmsr writes zero.
 *
 * The local names below are referenced by the _loop_asm() macro and must
 * not be renamed.
 */
static inline void __precise_loop(u64 cntrs)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = pmu.msr_global_ctl;
	u32 eax = cntrs & (BIT_ULL(32) - 1);
	u32 edx = cntrs >> 32;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("wrmsr", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("wrmsr", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("wrmsr", "nop", "wrmsr");
	else
		_loop_asm("wrmsr", "nop", "nop");
}
177 
178 static inline void loop(u64 cntrs)
179 {
180 	if (!this_cpu_has_perf_global_ctrl())
181 		__loop();
182 	else
183 		__precise_loop(cntrs);
184 }
185 
186 static void adjust_events_range(struct pmu_event *gp_events,
187 				int instruction_idx, int branch_idx)
188 {
189 	/*
190 	 * If HW supports GLOBAL_CTRL MSR, enabling and disabling PMCs are
191 	 * moved in __precise_loop(). Thus, instructions and branches events
192 	 * can be verified against a precise count instead of a rough range.
193 	 *
194 	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
195 	 * instruction in guest context, which* leads to intermittent failures
196 	 * as the counts will vary depending on how many asynchronous VM-Exits
197 	 * occur while running the measured code, e.g. if the host takes IRQs.
198 	 */
199 	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
200 		gp_events[instruction_idx].min = LOOP_INSNS;
201 		gp_events[instruction_idx].max = LOOP_INSNS;
202 		gp_events[branch_idx].min = LOOP_BRANCHES;
203 		gp_events[branch_idx].max = LOOP_BRANCHES;
204 	}
205 }
206 
207 volatile uint64_t irq_received;
208 
209 static void cnt_overflow(isr_regs_t *regs)
210 {
211 	irq_received++;
212 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
213 	apic_write(APIC_EOI, 0);
214 }
215 
216 static bool check_irq(void)
217 {
218 	int i;
219 	irq_received = 0;
220 	sti();
221 	for (i = 0; i < 100000 && !irq_received; i++)
222 		asm volatile("pause");
223 	cli();
224 	return irq_received;
225 }
226 
227 static bool is_gp(pmu_counter_t *evt)
228 {
229 	if (!pmu.is_intel)
230 		return true;
231 
232 	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
233 		evt->ctr >= MSR_IA32_PMC0;
234 }
235 
236 static int event_to_global_idx(pmu_counter_t *cnt)
237 {
238 	if (pmu.is_intel)
239 		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
240 			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
241 
242 	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
243 		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
244 	else
245 		return cnt->ctr - pmu.msr_gp_counter_base;
246 }
247 
248 static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
249 {
250 	if (is_gp(cnt)) {
251 		int i;
252 
253 		for (i = 0; i < gp_events_size; i++)
254 			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
255 				return &gp_events[i];
256 	} else {
257 		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;
258 
259 		if (idx < ARRAY_SIZE(fixed_events))
260 			return &fixed_events[idx];
261 	}
262 
263 	return (void*)0;
264 }
265 
266 static void global_enable(pmu_counter_t *cnt)
267 {
268 	if (!this_cpu_has_perf_global_ctrl())
269 		return;
270 
271 	cnt->idx = event_to_global_idx(cnt);
272 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
273 }
274 
275 static void global_disable(pmu_counter_t *cnt)
276 {
277 	if (!this_cpu_has_perf_global_ctrl())
278 		return;
279 
280 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
281 }
282 
283 static void __start_event(pmu_counter_t *evt, uint64_t count)
284 {
285     evt->count = count;
286     wrmsr(evt->ctr, evt->count);
287     if (is_gp(evt)) {
288 	    wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
289 		  evt->config | EVNTSEL_EN);
290     } else {
291 	    uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
292 	    int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
293 	    uint32_t usrospmi = 0;
294 
295 	    if (evt->config & EVNTSEL_OS)
296 		    usrospmi |= (1 << 0);
297 	    if (evt->config & EVNTSEL_USR)
298 		    usrospmi |= (1 << 1);
299 	    if (evt->config & EVNTSEL_INT)
300 		    usrospmi |= (1 << 3); // PMI on overflow
301 	    ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
302 	    wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
303     }
304     apic_write(APIC_LVTPC, PMI_VECTOR);
305 }
306 
/*
 * Start @evt from a count of zero: program and locally enable the counter,
 * then set its bit in the global control MSR (when that MSR exists).
 */
static void start_event(pmu_counter_t *evt)
{
	__start_event(evt, 0);
	global_enable(evt);
}
312 
313 static void __stop_event(pmu_counter_t *evt)
314 {
315 	if (is_gp(evt)) {
316 		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
317 		      evt->config & ~EVNTSEL_EN);
318 	} else {
319 		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
320 		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
321 		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
322 	}
323 	evt->count = rdmsr(evt->ctr);
324 }
325 
/*
 * Stop @evt: clear its bit in the global control MSR (when that MSR
 * exists), then disable the counter locally and read back its value.
 */
static void stop_event(pmu_counter_t *evt)
{
	global_disable(evt);
	__stop_event(evt);
}
331 
332 static noinline void measure_many(pmu_counter_t *evt, int count)
333 {
334 	int i;
335 	u64 cntrs = 0;
336 
337 	for (i = 0; i < count; i++) {
338 		__start_event(&evt[i], 0);
339 		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
340 	}
341 	loop(cntrs);
342 	for (i = 0; i < count; i++)
343 		__stop_event(&evt[i]);
344 }
345 
/* Measure the loop with the single counter @evt; see measure_many(). */
static void measure_one(pmu_counter_t *evt)
{
	measure_many(evt, 1);
}
350 
351 static noinline void __measure(pmu_counter_t *evt, uint64_t count)
352 {
353 	u64 cntrs = BIT_ULL(event_to_global_idx(evt));
354 
355 	__start_event(evt, count);
356 	loop(cntrs);
357 	__stop_event(evt);
358 }
359 
360 static bool verify_event(uint64_t count, struct pmu_event *e)
361 {
362 	bool pass;
363 
364 	if (!e)
365 		return false;
366 
367 	pass = count >= e->min && count <= e->max;
368 	if (!pass)
369 		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);
370 
371 	return pass;
372 }
373 
/*
 * Verify cnt->count against the range of the event that get_counter_event()
 * finds for @cnt; fails when no event matches.
 */
static bool verify_counter(pmu_counter_t *cnt)
{
	return verify_event(cnt->count, get_counter_event(cnt));
}
378 
379 static void check_gp_counter(struct pmu_event *evt)
380 {
381 	pmu_counter_t cnt = {
382 		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
383 	};
384 	int i;
385 
386 	for (i = 0; i < pmu.nr_gp_counters; i++) {
387 		cnt.ctr = MSR_GP_COUNTERx(i);
388 		measure_one(&cnt);
389 		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
390 	}
391 }
392 
393 static void check_gp_counters(void)
394 {
395 	int i;
396 
397 	for (i = 0; i < gp_events_size; i++)
398 		if (pmu_gp_counter_is_available(i))
399 			check_gp_counter(&gp_events[i]);
400 		else
401 			printf("GP event '%s' is disabled\n",
402 					gp_events[i].name);
403 }
404 
405 static void check_fixed_counters(void)
406 {
407 	pmu_counter_t cnt = {
408 		.config = EVNTSEL_OS | EVNTSEL_USR,
409 	};
410 	int i;
411 
412 	for (i = 0; i < fixed_counters_num; i++) {
413 		cnt.ctr = fixed_events[i].unit_sel;
414 		measure_one(&cnt);
415 		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
416 	}
417 }
418 
419 static void check_counters_many(void)
420 {
421 	pmu_counter_t cnt[48];
422 	int i, n;
423 
424 	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
425 		if (!pmu_gp_counter_is_available(i))
426 			continue;
427 
428 		cnt[n].ctr = MSR_GP_COUNTERx(n);
429 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR |
430 			gp_events[i % gp_events_size].unit_sel;
431 		n++;
432 	}
433 	for (i = 0; i < fixed_counters_num; i++) {
434 		cnt[n].ctr = fixed_events[i].unit_sel;
435 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
436 		n++;
437 	}
438 
439 	assert(n <= ARRAY_SIZE(cnt));
440 	measure_many(cnt, n);
441 
442 	for (i = 0; i < n; i++)
443 		if (!verify_counter(&cnt[i]))
444 			break;
445 
446 	report(i == n, "all counters");
447 }
448 
449 static uint64_t measure_for_overflow(pmu_counter_t *cnt)
450 {
451 	__measure(cnt, 0);
452 	/*
453 	 * To generate overflow, i.e. roll over to '0', the initial count just
454 	 * needs to be preset to the negative expected count.  However, as per
455 	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
456 	 * the overflow interrupt is generated immediately instead of possibly
457 	 * waiting for the overflow to propagate through the counter.
458 	 */
459 	assert(cnt->count > 1);
460 	return 1 - cnt->count;
461 }
462 
463 static void check_counter_overflow(void)
464 {
465 	int i;
466 	uint64_t overflow_preset;
467 	int instruction_idx = pmu.is_intel ?
468 			      INTEL_INSTRUCTIONS_IDX :
469 			      AMD_INSTRUCTIONS_IDX;
470 
471 	pmu_counter_t cnt = {
472 		.ctr = MSR_GP_COUNTERx(0),
473 		.config = EVNTSEL_OS | EVNTSEL_USR |
474 			  gp_events[instruction_idx].unit_sel /* instructions */,
475 	};
476 	overflow_preset = measure_for_overflow(&cnt);
477 
478 	/* clear status before test */
479 	if (this_cpu_has_perf_global_status())
480 		pmu_clear_global_status();
481 
482 	report_prefix_push("overflow");
483 
484 	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
485 		uint64_t status;
486 		int idx;
487 
488 		cnt.count = overflow_preset;
489 		if (pmu_use_full_writes())
490 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
491 
492 		if (i == pmu.nr_gp_counters) {
493 			if (!pmu.is_intel)
494 				break;
495 
496 			cnt.ctr = fixed_events[0].unit_sel;
497 			cnt.count = measure_for_overflow(&cnt);
498 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
499 		} else {
500 			cnt.ctr = MSR_GP_COUNTERx(i);
501 		}
502 
503 		if (i % 2)
504 			cnt.config |= EVNTSEL_INT;
505 		else
506 			cnt.config &= ~EVNTSEL_INT;
507 		idx = event_to_global_idx(&cnt);
508 		__measure(&cnt, cnt.count);
509 		if (pmu.is_intel)
510 			report(cnt.count == 1, "cntr-%d", i);
511 		else
512 			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);
513 
514 		if (!this_cpu_has_perf_global_status())
515 			continue;
516 
517 		status = rdmsr(pmu.msr_global_status);
518 		report(status & (1ull << idx), "status-%d", i);
519 		wrmsr(pmu.msr_global_status_clr, status);
520 		status = rdmsr(pmu.msr_global_status);
521 		report(!(status & (1ull << idx)), "status clear-%d", i);
522 		report(check_irq() == (i % 2), "irq-%d", i);
523 	}
524 
525 	report_prefix_pop();
526 }
527 
528 static void check_gp_counter_cmask(void)
529 {
530 	int instruction_idx = pmu.is_intel ?
531 			      INTEL_INSTRUCTIONS_IDX :
532 			      AMD_INSTRUCTIONS_IDX;
533 
534 	pmu_counter_t cnt = {
535 		.ctr = MSR_GP_COUNTERx(0),
536 		.config = EVNTSEL_OS | EVNTSEL_USR |
537 			  gp_events[instruction_idx].unit_sel /* instructions */,
538 	};
539 	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
540 	measure_one(&cnt);
541 	report(cnt.count < gp_events[instruction_idx].min, "cmask");
542 }
543 
544 static void do_rdpmc_fast(void *ptr)
545 {
546 	pmu_counter_t *cnt = ptr;
547 	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);
548 
549 	if (!is_gp(cnt))
550 		idx |= 1 << 30;
551 
552 	cnt->count = rdpmc(idx);
553 }
554 
555 
556 static void check_rdpmc(void)
557 {
558 	uint64_t val = 0xff0123456789ull;
559 	bool exc;
560 	int i;
561 
562 	report_prefix_push("rdpmc");
563 
564 	for (i = 0; i < pmu.nr_gp_counters; i++) {
565 		uint64_t x;
566 		pmu_counter_t cnt = {
567 			.ctr = MSR_GP_COUNTERx(i),
568 			.idx = i
569 		};
570 
571 	        /*
572 	         * Without full-width writes, only the low 32 bits are writable,
573 	         * and the value is sign-extended.
574 	         */
575 		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
576 			x = (uint64_t)(int64_t)(int32_t)val;
577 		else
578 			x = (uint64_t)(int64_t)val;
579 
580 		/* Mask according to the number of supported bits */
581 		x &= (1ull << pmu.gp_counter_width) - 1;
582 
583 		wrmsr(MSR_GP_COUNTERx(i), val);
584 		report(rdpmc(i) == x, "cntr-%d", i);
585 
586 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
587 		if (exc)
588 			report_skip("fast-%d", i);
589 		else
590 			report(cnt.count == (u32)val, "fast-%d", i);
591 	}
592 	for (i = 0; i < fixed_counters_num; i++) {
593 		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
594 		pmu_counter_t cnt = {
595 			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
596 			.idx = i
597 		};
598 
599 		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
600 		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);
601 
602 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
603 		if (exc)
604 			report_skip("fixed fast-%d", i);
605 		else
606 			report(cnt.count == (u32)x, "fixed fast-%d", i);
607 	}
608 
609 	report_prefix_pop();
610 }
611 
612 static void check_running_counter_wrmsr(void)
613 {
614 	uint64_t status;
615 	uint64_t count;
616 	unsigned int instruction_idx = pmu.is_intel ?
617 				       INTEL_INSTRUCTIONS_IDX :
618 				       AMD_INSTRUCTIONS_IDX;
619 
620 	pmu_counter_t evt = {
621 		.ctr = MSR_GP_COUNTERx(0),
622 		.config = EVNTSEL_OS | EVNTSEL_USR |
623 			  gp_events[instruction_idx].unit_sel,
624 	};
625 
626 	report_prefix_push("running counter wrmsr");
627 
628 	start_event(&evt);
629 	__loop();
630 	wrmsr(MSR_GP_COUNTERx(0), 0);
631 	stop_event(&evt);
632 	report(evt.count < gp_events[instruction_idx].min, "cntr");
633 
634 	/* clear status before overflow test */
635 	if (this_cpu_has_perf_global_status())
636 		pmu_clear_global_status();
637 
638 	start_event(&evt);
639 
640 	count = -1;
641 	if (pmu_use_full_writes())
642 		count &= (1ull << pmu.gp_counter_width) - 1;
643 
644 	wrmsr(MSR_GP_COUNTERx(0), count);
645 
646 	__loop();
647 	stop_event(&evt);
648 
649 	if (this_cpu_has_perf_global_status()) {
650 		status = rdmsr(pmu.msr_global_status);
651 		report(status & 1, "status msr bit");
652 	}
653 
654 	report_prefix_pop();
655 }
656 
657 static void check_emulated_instr(void)
658 {
659 	uint64_t status, instr_start, brnch_start;
660 	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
661 	unsigned int branch_idx = pmu.is_intel ?
662 				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
663 	unsigned int instruction_idx = pmu.is_intel ?
664 				       INTEL_INSTRUCTIONS_IDX :
665 				       AMD_INSTRUCTIONS_IDX;
666 	pmu_counter_t brnch_cnt = {
667 		.ctr = MSR_GP_COUNTERx(0),
668 		/* branch instructions */
669 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
670 	};
671 	pmu_counter_t instr_cnt = {
672 		.ctr = MSR_GP_COUNTERx(1),
673 		/* instructions */
674 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
675 	};
676 	report_prefix_push("emulated instruction");
677 
678 	if (this_cpu_has_perf_global_status())
679 		pmu_clear_global_status();
680 
681 	start_event(&brnch_cnt);
682 	start_event(&instr_cnt);
683 
684 	brnch_start = -EXPECTED_BRNCH;
685 	instr_start = -EXPECTED_INSTR;
686 	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
687 	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);
688 	// KVM_FEP is a magic prefix that forces emulation so
689 	// 'KVM_FEP "jne label\n"' just counts as a single instruction.
690 	asm volatile(
691 		"mov $0x0, %%eax\n"
692 		"cmp $0x0, %%eax\n"
693 		KVM_FEP "jne label\n"
694 		KVM_FEP "jne label\n"
695 		KVM_FEP "jne label\n"
696 		KVM_FEP "jne label\n"
697 		KVM_FEP "jne label\n"
698 		"mov $0xa, %%eax\n"
699 		"cpuid\n"
700 		"mov $0xa, %%eax\n"
701 		"cpuid\n"
702 		"mov $0xa, %%eax\n"
703 		"cpuid\n"
704 		"mov $0xa, %%eax\n"
705 		"cpuid\n"
706 		"mov $0xa, %%eax\n"
707 		"cpuid\n"
708 		"label:\n"
709 		:
710 		:
711 		: "eax", "ebx", "ecx", "edx");
712 
713 	if (this_cpu_has_perf_global_ctrl())
714 		wrmsr(pmu.msr_global_ctl, 0);
715 
716 	stop_event(&brnch_cnt);
717 	stop_event(&instr_cnt);
718 
719 	// Check that the end count - start count is at least the expected
720 	// number of instructions and branches.
721 	report(instr_cnt.count - instr_start >= EXPECTED_INSTR,
722 	       "instruction count");
723 	report(brnch_cnt.count - brnch_start >= EXPECTED_BRNCH,
724 	       "branch count");
725 	if (this_cpu_has_perf_global_status()) {
726 		// Additionally check that those counters overflowed properly.
727 		status = rdmsr(pmu.msr_global_status);
728 		report(status & 1, "branch counter overflow");
729 		report(status & 2, "instruction counter overflow");
730 	}
731 
732 	report_prefix_pop();
733 }
734 
735 #define XBEGIN_STARTED (~0u)
736 static void check_tsx_cycles(void)
737 {
738 	pmu_counter_t cnt;
739 	unsigned int i, ret = 0;
740 
741 	if (!this_cpu_has(X86_FEATURE_RTM))
742 		return;
743 
744 	report_prefix_push("TSX cycles");
745 
746 	for (i = 0; i < pmu.nr_gp_counters; i++) {
747 		cnt.ctr = MSR_GP_COUNTERx(i);
748 
749 		if (i == 2) {
750 			/* Transactional cycles committed only on gp counter 2 */
751 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
752 		} else {
753 			/* Transactional cycles */
754 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
755 		}
756 
757 		start_event(&cnt);
758 
759 		asm volatile("xbegin 1f\n\t"
760 				"1:\n\t"
761 				: "+a" (ret) :: "memory");
762 
763 		/* Generate a non-canonical #GP to trigger ABORT. */
764 		if (ret == XBEGIN_STARTED)
765 			*(int *)NONCANONICAL = 0;
766 
767 		stop_event(&cnt);
768 
769 		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
770 	}
771 
772 	report_prefix_pop();
773 }
774 
/* Run the measured loop ten times with no counters enabled. */
static void warm_up(void)
{
	int remaining = 10;

	/*
	 * The cycles event is always measured first, so its run would also
	 * pay for warming up the cache. The measured cycles could then exceed
	 * the pre-defined upper bound and cause a false failure. To avoid
	 * this, run a warm-up phase before the real verification.
	 */
	while (remaining-- > 0)
		loop(0);
}
789 
/*
 * Run the full set of counter tests against the currently selected counter
 * MSR base. The emulated-instruction test runs first and only when forced
 * emulation is available; the warm-up precedes the range-checked tests.
 */
static void check_counters(void)
{
	if (is_fep_available())
		check_emulated_instr();

	warm_up();
	check_gp_counters();
	check_fixed_counters();
	check_rdpmc();
	check_counters_many();
	check_counter_overflow();
	check_gp_counter_cmask();
	check_running_counter_wrmsr();
	check_tsx_cycles();
}
805 
806 static void do_unsupported_width_counter_write(void *index)
807 {
808 	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
809 }
810 
811 static void check_gp_counters_write_width(void)
812 {
813 	u64 val_64 = 0xffffff0123456789ull;
814 	u64 val_32 = val_64 & ((1ull << 32) - 1);
815 	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
816 	int i;
817 
818 	/*
819 	 * MSR_IA32_PERFCTRn supports 64-bit writes,
820 	 * but only the lowest 32 bits are valid.
821 	 */
822 	for (i = 0; i < pmu.nr_gp_counters; i++) {
823 		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
824 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
825 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
826 
827 		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
828 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
829 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
830 
831 		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
832 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
833 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
834 	}
835 
836 	/*
837 	 * MSR_IA32_PMCn supports writing values up to GP counter width,
838 	 * and only the lowest bits of GP counter width are valid.
839 	 */
840 	for (i = 0; i < pmu.nr_gp_counters; i++) {
841 		wrmsr(MSR_IA32_PMC0 + i, val_32);
842 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
843 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
844 
845 		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
846 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
847 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);
848 
849 		report(test_for_exception(GP_VECTOR,
850 			do_unsupported_width_counter_write, &i),
851 		"writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
852 	}
853 }
854 
855 /*
856  * Per the SDM, reference cycles are currently implemented using the
857  * core crystal clock, TSC, or bus clock. Calibrate to the TSC
858  * frequency to set reasonable expectations.
859  */
860 static void set_ref_cycle_expectations(void)
861 {
862 	pmu_counter_t cnt = {
863 		.ctr = MSR_IA32_PERFCTR0,
864 		.config = EVNTSEL_OS | EVNTSEL_USR |
865 			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
866 	};
867 	uint64_t tsc_delta;
868 	uint64_t t0, t1, t2, t3;
869 
870 	/* Bit 2 enumerates the availability of reference cycles events. */
871 	if (!pmu.nr_gp_counters || !pmu_gp_counter_is_available(2))
872 		return;
873 
874 	if (this_cpu_has_perf_global_ctrl())
875 		wrmsr(pmu.msr_global_ctl, 0);
876 
877 	t0 = fenced_rdtsc();
878 	start_event(&cnt);
879 	t1 = fenced_rdtsc();
880 
881 	/*
882 	 * This loop has to run long enough to dominate the VM-exit
883 	 * costs for playing with the PMU MSRs on start and stop.
884 	 *
885 	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
886 	 * the core crystal clock, this function calculated a guest
887 	 * TSC : ref cycles ratio of around 105 with ECX initialized
888 	 * to one billion.
889 	 */
890 	asm volatile("loop ." : "+c"((int){1000000000ull}));
891 
892 	t2 = fenced_rdtsc();
893 	stop_event(&cnt);
894 	t3 = fenced_rdtsc();
895 
896 	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;
897 
898 	if (!tsc_delta)
899 		return;
900 
901 	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
902 		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
903 	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
904 		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
905 }
906 
907 static void check_invalid_rdpmc_gp(void)
908 {
909 	uint64_t val;
910 
911 	report(rdpmc_safe(64, &val) == GP_VECTOR,
912 	       "Expected #GP on RDPMC(64)");
913 }
914 
915 int main(int ac, char **av)
916 {
917 	int instruction_idx;
918 	int branch_idx;
919 
920 	setup_vm();
921 	handle_irq(PMI_VECTOR, cnt_overflow);
922 	buf = malloc(N*64);
923 
924 	check_invalid_rdpmc_gp();
925 
926 	if (pmu.is_intel) {
927 		if (!pmu.version) {
928 			report_skip("No Intel Arch PMU is detected!");
929 			return report_summary();
930 		}
931 		gp_events = (struct pmu_event *)intel_gp_events;
932 		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
933 		instruction_idx = INTEL_INSTRUCTIONS_IDX;
934 		branch_idx = INTEL_BRANCHES_IDX;
935 
936 		/*
937 		 * For legacy Intel CPUS without clflush/clflushopt support,
938 		 * there is no way to force to trigger a LLC miss, thus set
939 		 * the minimum value to 0 to avoid false positives.
940 		 */
941 		if (!this_cpu_has(X86_FEATURE_CLFLUSH))
942 			gp_events[INTEL_LLC_MISSES_IDX].min = 0;
943 
944 		report_prefix_push("Intel");
945 		set_ref_cycle_expectations();
946 	} else {
947 		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
948 		gp_events = (struct pmu_event *)amd_gp_events;
949 		instruction_idx = AMD_INSTRUCTIONS_IDX;
950 		branch_idx = AMD_BRANCHES_IDX;
951 		report_prefix_push("AMD");
952 	}
953 	adjust_events_range(gp_events, instruction_idx, branch_idx);
954 
955 	printf("PMU version:         %d\n", pmu.version);
956 	printf("GP counters:         %d\n", pmu.nr_gp_counters);
957 	printf("GP counter width:    %d\n", pmu.gp_counter_width);
958 	printf("Mask length:         %d\n", pmu.gp_counter_mask_length);
959 	printf("Fixed counters:      %d\n", pmu.nr_fixed_counters);
960 	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);
961 
962 	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
963 	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
964 		report_info("Fixed counters number %d > defined fixed events %u.  "
965 			    "Please update test case.", pmu.nr_fixed_counters,
966 			    (uint32_t)ARRAY_SIZE(fixed_events));
967 
968 	apic_write(APIC_LVTPC, PMI_VECTOR);
969 
970 	check_counters();
971 
972 	if (pmu_has_full_writes()) {
973 		pmu.msr_gp_counter_base = MSR_IA32_PMC0;
974 
975 		report_prefix_push("full-width writes");
976 		check_counters();
977 		check_gp_counters_write_width();
978 		report_prefix_pop();
979 	}
980 
981 	if (!pmu.is_intel) {
982 		report_prefix_push("K7");
983 		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
984 		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
985 		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
986 		check_counters();
987 		report_prefix_pop();
988 	}
989 
990 	return report_summary();
991 }
992