xref: /kvm-unit-tests/x86/pmu.c (revision e0d0022fbd4cf97ffac5a2c23259a61c86f73699)
1 
2 #include "x86/msr.h"
3 #include "x86/processor.h"
4 #include "x86/pmu.h"
5 #include "x86/apic-defs.h"
6 #include "x86/apic.h"
7 #include "x86/desc.h"
8 #include "x86/isr.h"
9 #include "vmalloc.h"
10 #include "alloc.h"
11 
12 #include "libcflat.h"
13 #include <stdint.h>
14 
15 #define N 1000000
16 
17 // These values match the number of instructions and branches in the
18 // assembly block in check_emulated_instr().
19 #define EXPECTED_INSTR 17
20 #define EXPECTED_BRNCH 5
21 
22 /* Enable GLOBAL_CTRL + disable GLOBAL_CTRL + clflush/mfence instructions */
23 #define EXTRA_INSNS  (3 + 3 + 2)
24 #define LOOP_INSNS   (N * 10 + EXTRA_INSNS)
25 #define LOOP_BRANCHES  (N)
26 #define LOOP_ASM(_wrmsr, _clflush)					\
27 	_wrmsr "\n\t"							\
28 	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t"			\
29 	_clflush "\n\t"                                 		\
30 	"mfence;\n\t"                                   		\
31 	"1: mov (%1), %2; add $64, %1;\n\t"				\
32 	"nop; nop; nop; nop; nop; nop; nop;\n\t"			\
33 	"loop 1b;\n\t"							\
34 	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t"	\
35 	_wrmsr "\n\t"
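
/*
 * Each LOOP_ASM iteration retires 10 instructions (mov + add + 7 nops +
 * loop) and exactly one taken branch, which is where the N * 10 term in
 * LOOP_INSNS and the N in LOOP_BRANCHES above come from; EXTRA_INSNS
 * accounts for the instructions outside the loop body.
 */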
36 
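/*
 * _loop_asm() seeds EBX with the iteration count ("0"(N)) and operand 1 with
 * the buffer pointer used by the clflush/mov accesses.  ECX holds the
 * GLOBAL_CTRL MSR index for the wrmsr toggles and is stashed in EDI while ECX
 * serves as the loop counter; EAX:EDX carry the wrmsr payload.
 */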
37 #define _loop_asm(_wrmsr, _clflush)				\
38 do {								\
39 	asm volatile(LOOP_ASM(_wrmsr, _clflush)			\
40 		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)	\
41 		     : "a"(eax), "d"(edx), "c"(global_ctl),	\
42 		       "0"(N), "1"(buf)				\
43 		     : "edi");					\
44 } while (0)
45 
46 typedef struct {
47 	uint32_t ctr;
48 	uint32_t idx;
49 	uint64_t config;
50 	uint64_t count;
51 } pmu_counter_t;
52 
53 struct pmu_event {
54 	const char *name;
55 	uint32_t unit_sel;
56 	int min;
57 	int max;
58 } intel_gp_events[] = {
59 	{"core cycles", 0x003c, 1*N, 50*N},
60 	{"instructions", 0x00c0, 10*N, 10.2*N},
61 	{"ref cycles", 0x013c, 1*N, 30*N},
62 	{"llc references", 0x4f2e, 1, 2*N},
63 	{"llc misses", 0x412e, 1, 1*N},
64 	{"branches", 0x00c4, 1*N, 1.1*N},
65 	{"branch misses", 0x00c5, 0, 0.1*N},
66 }, amd_gp_events[] = {
67 	{"core cycles", 0x0076, 1*N, 50*N},
68 	{"instructions", 0x00c0, 10*N, 10.2*N},
69 	{"branches", 0x00c2, 1*N, 1.1*N},
70 	{"branch misses", 0x00c3, 0, 0.1*N},
71 }, fixed_events[] = {
72 	{"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
73 	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
74 	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
75 };
76 
77 /*
78  * Indices into intel_gp_events[]; keep these values consistent with the
79  * order of the entries in the table above.
80  */
81 enum {
82 	INTEL_INSTRUCTIONS_IDX  = 1,
83 	INTEL_REF_CYCLES_IDX	= 2,
84 	INTEL_LLC_MISSES_IDX	= 4,
85 	INTEL_BRANCHES_IDX	= 5,
86 };
87 
88 /*
89  * Indices into amd_gp_events[]; keep these values consistent with the
90  * order of the entries in the table above.
91  */
92 enum {
93 	AMD_INSTRUCTIONS_IDX    = 1,
94 	AMD_BRANCHES_IDX	= 2,
95 };
96 
97 char *buf;
98 
99 static struct pmu_event *gp_events;
100 static unsigned int gp_events_size;
101 static unsigned int fixed_counters_num;
102 
103 static inline void __loop(void)
104 {
105 	unsigned long tmp, tmp2, tmp3;
106 	u32 global_ctl = 0;
107 	u32 eax = 0;
108 	u32 edx = 0;
109 
110 	if (this_cpu_has(X86_FEATURE_CLFLUSH))
111 		_loop_asm("nop", "clflush (%1)");
112 	else
113 		_loop_asm("nop", "nop");
114 }
115 
116 /*
117  * Enable and disable the counters from within a single asm blob so that
118  * no other instructions are counted in the window between enabling the
119  * counters and executing the LOOP_ASM code.  This lets the instructions
120  * and branches events be verified against precise counts instead of a
121  * rough valid count range.
122  */
123 static inline void __precise_loop(u64 cntrs)
124 {
125 	unsigned long tmp, tmp2, tmp3;
126 	u32 global_ctl = pmu.msr_global_ctl;
127 	u32 eax = cntrs & (BIT_ULL(32) - 1);
128 	u32 edx = cntrs >> 32;
129 
130 	if (this_cpu_has(X86_FEATURE_CLFLUSH))
131 		_loop_asm("wrmsr", "clflush (%1)");
132 	else
133 		_loop_asm("wrmsr", "nop");
134 }
135 
136 static inline void loop(u64 cntrs)
137 {
138 	if (!this_cpu_has_perf_global_ctrl())
139 		__loop();
140 	else
141 		__precise_loop(cntrs);
142 }
143 
144 static void adjust_events_range(struct pmu_event *gp_events,
145 				int instruction_idx, int branch_idx)
146 {
147 	/*
148 	 * If the CPU supports the GLOBAL_CTRL MSR, the PMCs are enabled and
149 	 * disabled inside __precise_loop(), so the instructions and branches
150 	 * events can be verified against a precise count instead of a range.
151 	 *
152 	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
153 	 * instruction in guest context, which leads to intermittent failures
154 	 * as the counts will vary depending on how many asynchronous VM-Exits
155 	 * occur while running the measured code, e.g. if the host takes IRQs.
156 	 */
157 	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
158 		gp_events[instruction_idx].min = LOOP_INSNS;
159 		gp_events[instruction_idx].max = LOOP_INSNS;
160 		gp_events[branch_idx].min = LOOP_BRANCHES;
161 		gp_events[branch_idx].max = LOOP_BRANCHES;
162 	}
163 }
164 
165 volatile uint64_t irq_received;
166 
167 static void cnt_overflow(isr_regs_t *regs)
168 {
169 	irq_received++;
170 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
171 	apic_write(APIC_EOI, 0);
172 }
173 
174 static bool check_irq(void)
175 {
176 	int i;
177 	irq_received = 0;
178 	sti();
179 	for (i = 0; i < 100000 && !irq_received; i++)
180 		asm volatile("pause");
181 	cli();
182 	return irq_received;
183 }
184 
185 static bool is_gp(pmu_counter_t *evt)
186 {
187 	if (!pmu.is_intel)
188 		return true;
189 
190 	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
191 		evt->ctr >= MSR_IA32_PMC0;
192 }
193 
194 static int event_to_global_idx(pmu_counter_t *cnt)
195 {
196 	if (pmu.is_intel)
197 		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
198 			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
199 
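	/*
	 * The F15h-style PERF_CTRn MSRs are interleaved with the PERF_CTLn
	 * event select MSRs, hence the divide-by-2 to recover the counter's
	 * index; the legacy K7 counter MSRs are contiguous.
	 */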
200 	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
201 		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
202 	else
203 		return cnt->ctr - pmu.msr_gp_counter_base;
204 }
205 
206 static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
207 {
208 	if (is_gp(cnt)) {
209 		int i;
210 
211 		for (i = 0; i < gp_events_size; i++)
212 			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
213 				return &gp_events[i];
214 	} else {
215 		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;
216 
217 		if (idx < ARRAY_SIZE(fixed_events))
218 			return &fixed_events[idx];
219 	}
220 
221 	return (void*)0;
222 }
223 
224 static void global_enable(pmu_counter_t *cnt)
225 {
226 	if (!this_cpu_has_perf_global_ctrl())
227 		return;
228 
229 	cnt->idx = event_to_global_idx(cnt);
230 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
231 }
232 
233 static void global_disable(pmu_counter_t *cnt)
234 {
235 	if (!this_cpu_has_perf_global_ctrl())
236 		return;
237 
238 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
239 }
240 
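/*
 * Program and start a counter: GP counters are armed via their event select
 * MSR, fixed counters via their 4-bit field (OS/USR/PMI enable bits) in
 * MSR_CORE_PERF_FIXED_CTR_CTRL.
 */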
241 static void __start_event(pmu_counter_t *evt, uint64_t count)
242 {
243 	evt->count = count;
244 	wrmsr(evt->ctr, evt->count);
245 	if (is_gp(evt)) {
246 		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
247 		      evt->config | EVNTSEL_EN);
248 	} else {
249 		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
250 		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
251 		uint32_t usrospmi = 0;
252 
253 		if (evt->config & EVNTSEL_OS)
254 			usrospmi |= (1 << 0);
255 		if (evt->config & EVNTSEL_USR)
256 			usrospmi |= (1 << 1);
257 		if (evt->config & EVNTSEL_INT)
258 			usrospmi |= (1 << 3); // PMI on overflow
259 		ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
260 		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
261 	}
262 	apic_write(APIC_LVTPC, PMI_VECTOR);
263 }
264 
265 static void start_event(pmu_counter_t *evt)
266 {
267 	__start_event(evt, 0);
268 	global_enable(evt);
269 }
270 
271 static void __stop_event(pmu_counter_t *evt)
272 {
273 	if (is_gp(evt)) {
274 		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
275 		      evt->config & ~EVNTSEL_EN);
276 	} else {
277 		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
278 		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
279 		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
280 	}
281 	evt->count = rdmsr(evt->ctr);
282 }
283 
284 static void stop_event(pmu_counter_t *evt)
285 {
286 	global_disable(evt);
287 	__stop_event(evt);
288 }
289 
290 static noinline void measure_many(pmu_counter_t *evt, int count)
291 {
292 	int i;
293 	u64 cntrs = 0;
294 
295 	for (i = 0; i < count; i++) {
296 		__start_event(&evt[i], 0);
297 		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
298 	}
299 	loop(cntrs);
300 	for (i = 0; i < count; i++)
301 		__stop_event(&evt[i]);
302 }
303 
304 static void measure_one(pmu_counter_t *evt)
305 {
306 	measure_many(evt, 1);
307 }
308 
309 static noinline void __measure(pmu_counter_t *evt, uint64_t count)
310 {
311 	u64 cntrs = BIT_ULL(event_to_global_idx(evt));
312 
313 	__start_event(evt, count);
314 	loop(cntrs);
315 	__stop_event(evt);
316 }
317 
318 static bool verify_event(uint64_t count, struct pmu_event *e)
319 {
320 	bool pass;
321 
322 	if (!e)
323 		return false;
324 
325 	pass = count >= e->min && count <= e->max;
326 	if (!pass)
327 		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);
328 
329 	return pass;
330 }
331 
332 static bool verify_counter(pmu_counter_t *cnt)
333 {
334 	return verify_event(cnt->count, get_counter_event(cnt));
335 }
336 
337 static void check_gp_counter(struct pmu_event *evt)
338 {
339 	pmu_counter_t cnt = {
340 		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
341 	};
342 	int i;
343 
344 	for (i = 0; i < pmu.nr_gp_counters; i++) {
345 		cnt.ctr = MSR_GP_COUNTERx(i);
346 		measure_one(&cnt);
347 		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
348 	}
349 }
350 
351 static void check_gp_counters(void)
352 {
353 	int i;
354 
355 	for (i = 0; i < gp_events_size; i++)
356 		if (pmu_gp_counter_is_available(i))
357 			check_gp_counter(&gp_events[i]);
358 		else
359 			printf("GP event '%s' is disabled\n",
360 					gp_events[i].name);
361 }
362 
363 static void check_fixed_counters(void)
364 {
365 	pmu_counter_t cnt = {
366 		.config = EVNTSEL_OS | EVNTSEL_USR,
367 	};
368 	int i;
369 
370 	for (i = 0; i < fixed_counters_num; i++) {
371 		cnt.ctr = fixed_events[i].unit_sel;
372 		measure_one(&cnt);
373 		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
374 	}
375 }
376 
377 static void check_counters_many(void)
378 {
379 	pmu_counter_t cnt[48];
380 	int i, n;
381 
382 	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
383 		if (!pmu_gp_counter_is_available(i))
384 			continue;
385 
386 		cnt[n].ctr = MSR_GP_COUNTERx(n);
387 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR |
388 			gp_events[i % gp_events_size].unit_sel;
389 		n++;
390 	}
391 	for (i = 0; i < fixed_counters_num; i++) {
392 		cnt[n].ctr = fixed_events[i].unit_sel;
393 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
394 		n++;
395 	}
396 
397 	assert(n <= ARRAY_SIZE(cnt));
398 	measure_many(cnt, n);
399 
400 	for (i = 0; i < n; i++)
401 		if (!verify_counter(&cnt[i]))
402 			break;
403 
404 	report(i == n, "all counters");
405 }
406 
407 static uint64_t measure_for_overflow(pmu_counter_t *cnt)
408 {
409 	__measure(cnt, 0);
410 	/*
411 	 * To generate overflow, i.e. roll over to '0', the initial count just
412 	 * needs to be preset to the negative expected count.  However, as per
413 	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
414 	 * the overflow interrupt is generated immediately instead of possibly
415 	 * waiting for the overflow to propagate through the counter.
416 	 */
417 	assert(cnt->count > 1);
418 	return 1 - cnt->count;
419 }
420 
421 static void check_counter_overflow(void)
422 {
423 	int i;
424 	uint64_t overflow_preset;
425 	int instruction_idx = pmu.is_intel ?
426 			      INTEL_INSTRUCTIONS_IDX :
427 			      AMD_INSTRUCTIONS_IDX;
428 
429 	pmu_counter_t cnt = {
430 		.ctr = MSR_GP_COUNTERx(0),
431 		.config = EVNTSEL_OS | EVNTSEL_USR |
432 			  gp_events[instruction_idx].unit_sel /* instructions */,
433 	};
434 	overflow_preset = measure_for_overflow(&cnt);
435 
436 	/* clear status before test */
437 	if (this_cpu_has_perf_global_status())
438 		pmu_clear_global_status();
439 
440 	report_prefix_push("overflow");
441 
442 	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
443 		uint64_t status;
444 		int idx;
445 
446 		cnt.count = overflow_preset;
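		/*
		 * With full-width writes, the wrmsr value must fit within the
		 * counter width; unlike legacy MSR_IA32_PERFCTRx writes, it is
		 * not sign-extended from bit 31.
		 */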
447 		if (pmu_use_full_writes())
448 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
449 
450 		if (i == pmu.nr_gp_counters) {
451 			if (!pmu.is_intel)
452 				break;
453 
454 			cnt.ctr = fixed_events[0].unit_sel;
455 			cnt.count = measure_for_overflow(&cnt);
456 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
457 		} else {
458 			cnt.ctr = MSR_GP_COUNTERx(i);
459 		}
460 
461 		if (i % 2)
462 			cnt.config |= EVNTSEL_INT;
463 		else
464 			cnt.config &= ~EVNTSEL_INT;
465 		idx = event_to_global_idx(&cnt);
466 		__measure(&cnt, cnt.count);
467 		if (pmu.is_intel)
468 			report(cnt.count == 1, "cntr-%d", i);
469 		else
470 			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);
471 
472 		if (!this_cpu_has_perf_global_status())
473 			continue;
474 
475 		status = rdmsr(pmu.msr_global_status);
476 		report(status & (1ull << idx), "status-%d", i);
477 		wrmsr(pmu.msr_global_status_clr, status);
478 		status = rdmsr(pmu.msr_global_status);
479 		report(!(status & (1ull << idx)), "status clear-%d", i);
480 		report(check_irq() == (i % 2), "irq-%d", i);
481 	}
482 
483 	report_prefix_pop();
484 }
485 
486 static void check_gp_counter_cmask(void)
487 {
488 	int instruction_idx = pmu.is_intel ?
489 			      INTEL_INSTRUCTIONS_IDX :
490 			      AMD_INSTRUCTIONS_IDX;
491 
492 	pmu_counter_t cnt = {
493 		.ctr = MSR_GP_COUNTERx(0),
494 		.config = EVNTSEL_OS | EVNTSEL_USR |
495 			  gp_events[instruction_idx].unit_sel /* instructions */,
496 	};
497 	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
498 	measure_one(&cnt);
499 	report(cnt.count < gp_events[instruction_idx].min, "cmask");
500 }
501 
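/*
 * RDPMC with bit 31 set in the index requests a "fast" read that returns only
 * the low 32 bits of the counter; bit 30 selects the fixed-counter range.
 */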
502 static void do_rdpmc_fast(void *ptr)
503 {
504 	pmu_counter_t *cnt = ptr;
505 	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);
506 
507 	if (!is_gp(cnt))
508 		idx |= 1 << 30;
509 
510 	cnt->count = rdpmc(idx);
511 }
512 
513 
514 static void check_rdpmc(void)
515 {
516 	uint64_t val = 0xff0123456789ull;
517 	bool exc;
518 	int i;
519 
520 	report_prefix_push("rdpmc");
521 
522 	for (i = 0; i < pmu.nr_gp_counters; i++) {
523 		uint64_t x;
524 		pmu_counter_t cnt = {
525 			.ctr = MSR_GP_COUNTERx(i),
526 			.idx = i
527 		};
528 
529 		/*
530 		 * Without full-width writes, only the low 32 bits are writable,
531 		 * and the value is sign-extended.
532 		 */
533 		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
534 			x = (uint64_t)(int64_t)(int32_t)val;
535 		else
536 			x = (uint64_t)(int64_t)val;
537 
538 		/* Mask according to the number of supported bits */
539 		x &= (1ull << pmu.gp_counter_width) - 1;
540 
541 		wrmsr(MSR_GP_COUNTERx(i), val);
542 		report(rdpmc(i) == x, "cntr-%d", i);
543 
544 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
545 		if (exc)
546 			report_skip("fast-%d", i);
547 		else
548 			report(cnt.count == (u32)val, "fast-%d", i);
549 	}
550 	for (i = 0; i < fixed_counters_num; i++) {
551 		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
552 		pmu_counter_t cnt = {
553 			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
554 			.idx = i
555 		};
556 
557 		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
558 		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);
559 
560 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
561 		if (exc)
562 			report_skip("fixed fast-%d", i);
563 		else
564 			report(cnt.count == (u32)x, "fixed fast-%d", i);
565 	}
566 
567 	report_prefix_pop();
568 }
569 
570 static void check_running_counter_wrmsr(void)
571 {
572 	uint64_t status;
573 	uint64_t count;
574 	unsigned int instruction_idx = pmu.is_intel ?
575 				       INTEL_INSTRUCTIONS_IDX :
576 				       AMD_INSTRUCTIONS_IDX;
577 
578 	pmu_counter_t evt = {
579 		.ctr = MSR_GP_COUNTERx(0),
580 		.config = EVNTSEL_OS | EVNTSEL_USR |
581 			  gp_events[instruction_idx].unit_sel,
582 	};
583 
584 	report_prefix_push("running counter wrmsr");
585 
586 	start_event(&evt);
587 	__loop();
588 	wrmsr(MSR_GP_COUNTERx(0), 0);
589 	stop_event(&evt);
590 	report(evt.count < gp_events[instruction_idx].min, "cntr");
591 
592 	/* clear status before overflow test */
593 	if (this_cpu_has_perf_global_status())
594 		pmu_clear_global_status();
595 
596 	start_event(&evt);
597 
598 	count = -1;
599 	if (pmu_use_full_writes())
600 		count &= (1ull << pmu.gp_counter_width) - 1;
601 
602 	wrmsr(MSR_GP_COUNTERx(0), count);
603 
604 	__loop();
605 	stop_event(&evt);
606 
607 	if (this_cpu_has_perf_global_status()) {
608 		status = rdmsr(pmu.msr_global_status);
609 		report(status & 1, "status msr bit");
610 	}
611 
612 	report_prefix_pop();
613 }
614 
615 static void check_emulated_instr(void)
616 {
617 	uint64_t status, instr_start, brnch_start;
618 	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
619 	unsigned int branch_idx = pmu.is_intel ?
620 				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
621 	unsigned int instruction_idx = pmu.is_intel ?
622 				       INTEL_INSTRUCTIONS_IDX :
623 				       AMD_INSTRUCTIONS_IDX;
624 	pmu_counter_t brnch_cnt = {
625 		.ctr = MSR_GP_COUNTERx(0),
626 		/* branch instructions */
627 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
628 	};
629 	pmu_counter_t instr_cnt = {
630 		.ctr = MSR_GP_COUNTERx(1),
631 		/* instructions */
632 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
633 	};
634 	report_prefix_push("emulated instruction");
635 
636 	if (this_cpu_has_perf_global_status())
637 		pmu_clear_global_status();
638 
639 	start_event(&brnch_cnt);
640 	start_event(&instr_cnt);
641 
642 	brnch_start = -EXPECTED_BRNCH;
643 	instr_start = -EXPECTED_INSTR;
644 	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
645 	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);
646 	// KVM_FEP is a magic prefix that forces emulation so
647 	// 'KVM_FEP "jne label\n"' just counts as a single instruction.
648 	asm volatile(
649 		"mov $0x0, %%eax\n"
650 		"cmp $0x0, %%eax\n"
651 		KVM_FEP "jne label\n"
652 		KVM_FEP "jne label\n"
653 		KVM_FEP "jne label\n"
654 		KVM_FEP "jne label\n"
655 		KVM_FEP "jne label\n"
656 		"mov $0xa, %%eax\n"
657 		"cpuid\n"
658 		"mov $0xa, %%eax\n"
659 		"cpuid\n"
660 		"mov $0xa, %%eax\n"
661 		"cpuid\n"
662 		"mov $0xa, %%eax\n"
663 		"cpuid\n"
664 		"mov $0xa, %%eax\n"
665 		"cpuid\n"
666 		"label:\n"
667 		:
668 		:
669 		: "eax", "ebx", "ecx", "edx");
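
	/*
	 * The asm blob above retires EXPECTED_INSTR (17) instructions (mov,
	 * cmp, five emulated jnes, five mov+cpuid pairs) and EXPECTED_BRNCH
	 * (5) branches (the jnes), so both preset counters should overflow.
	 */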
670 
671 	if (this_cpu_has_perf_global_ctrl())
672 		wrmsr(pmu.msr_global_ctl, 0);
673 
674 	stop_event(&brnch_cnt);
675 	stop_event(&instr_cnt);
676 
677 	// Check that the end count - start count is at least the expected
678 	// number of instructions and branches.
679 	report(instr_cnt.count - instr_start >= EXPECTED_INSTR,
680 	       "instruction count");
681 	report(brnch_cnt.count - brnch_start >= EXPECTED_BRNCH,
682 	       "branch count");
683 	if (this_cpu_has_perf_global_status()) {
684 		// Additionally check that those counters overflowed properly.
685 		status = rdmsr(pmu.msr_global_status);
686 		report(status & 1, "branch counter overflow");
687 		report(status & 2, "instruction counter overflow");
688 	}
689 
690 	report_prefix_pop();
691 }
692 
693 #define XBEGIN_STARTED (~0u)
694 static void check_tsx_cycles(void)
695 {
696 	pmu_counter_t cnt;
697 	unsigned int i, ret = 0;
698 
699 	if (!this_cpu_has(X86_FEATURE_RTM))
700 		return;
701 
702 	report_prefix_push("TSX cycles");
703 
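	/*
	 * The configs below are the unhalted core cycles event (0x3c) with the
	 * IN_TX modifier (event select bit 32), plus IN_TXCP (bit 33) on
	 * counter 2, the only counter that supports IN_TXCP.
	 */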
704 	for (i = 0; i < pmu.nr_gp_counters; i++) {
705 		cnt.ctr = MSR_GP_COUNTERx(i);
706 
707 		if (i == 2) {
708 			/* Transactional cycles committed only on gp counter 2 */
709 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
710 		} else {
711 			/* Transactional cycles */
712 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
713 		}
714 
715 		start_event(&cnt);
716 
717 		asm volatile("xbegin 1f\n\t"
718 				"1:\n\t"
719 				: "+a" (ret) :: "memory");
720 
721 		/* Generate a non-canonical #GP to trigger ABORT. */
722 		if (ret == XBEGIN_STARTED)
723 			*(int *)NONCANONICAL = 0;
724 
725 		stop_event(&cnt);
726 
727 		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
728 	}
729 
730 	report_prefix_pop();
731 }
732 
733 static void warm_up(void)
734 {
735 	int i;
736 
737 	/*
738 	 * Since the cycles event is always run as the first event, cold caches
739 	 * can inflate the measured cycles value past the pre-defined upper
740 	 * boundary and cause a false positive.  To avoid this, run a few
741 	 * warm-up iterations of the loop before performing the real
742 	 * verification.
743 	 */
744 	for (i = 0; i < 10; i++)
745 		loop(0);
746 }
747 
748 static void check_counters(void)
749 {
750 	if (is_fep_available())
751 		check_emulated_instr();
752 
753 	warm_up();
754 	check_gp_counters();
755 	check_fixed_counters();
756 	check_rdpmc();
757 	check_counters_many();
758 	check_counter_overflow();
759 	check_gp_counter_cmask();
760 	check_running_counter_wrmsr();
761 	check_tsx_cycles();
762 }
763 
764 static void do_unsupported_width_counter_write(void *index)
765 {
766 	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
767 }
768 
769 static void check_gp_counters_write_width(void)
770 {
771 	u64 val_64 = 0xffffff0123456789ull;
772 	u64 val_32 = val_64 & ((1ull << 32) - 1);
773 	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
774 	int i;
775 
776 	/*
777 	 * MSR_IA32_PERFCTRn supports 64-bit writes,
778 	 * but only the lowest 32 bits are valid.
779 	 */
780 	for (i = 0; i < pmu.nr_gp_counters; i++) {
781 		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
782 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
783 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
784 
785 		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
786 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
787 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
788 
789 		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
790 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
791 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
792 	}
793 
794 	/*
795 	 * MSR_IA32_PMCn supports full-width writes up to the GP counter width;
796 	 * only the lowest gp_counter_width bits are valid.
797 	 */
798 	for (i = 0; i < pmu.nr_gp_counters; i++) {
799 		wrmsr(MSR_IA32_PMC0 + i, val_32);
800 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
801 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
802 
803 		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
804 		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
805 		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);
806 
807 		report(test_for_exception(GP_VECTOR,
808 			do_unsupported_width_counter_write, &i),
809 		"writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
810 	}
811 }
812 
813 /*
814  * Per the SDM, reference cycles are currently implemented using the
815  * core crystal clock, TSC, or bus clock. Calibrate to the TSC
816  * frequency to set reasonable expectations.
817  */
818 static void set_ref_cycle_expectations(void)
819 {
820 	pmu_counter_t cnt = {
821 		.ctr = MSR_IA32_PERFCTR0,
822 		.config = EVNTSEL_OS | EVNTSEL_USR |
823 			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
824 	};
825 	uint64_t tsc_delta;
826 	uint64_t t0, t1, t2, t3;
827 
828 	/* Bit 2 enumerates the availability of reference cycles events. */
829 	if (!pmu.nr_gp_counters || !pmu_gp_counter_is_available(2))
830 		return;
831 
832 	if (this_cpu_has_perf_global_ctrl())
833 		wrmsr(pmu.msr_global_ctl, 0);
834 
835 	t0 = fenced_rdtsc();
836 	start_event(&cnt);
837 	t1 = fenced_rdtsc();
838 
839 	/*
840 	 * This loop has to run long enough to dominate the VM-exit
841 	 * costs for playing with the PMU MSRs on start and stop.
842 	 *
843 	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
844 	 * the core crystal clock, this function calculated a guest
845 	 * TSC : ref cycles ratio of around 105 with ECX initialized
846 	 * to one billion.
847 	 */
848 	asm volatile("loop ." : "+c"((int){1000000000ull}));
849 
850 	t2 = fenced_rdtsc();
851 	stop_event(&cnt);
852 	t3 = fenced_rdtsc();
853 
854 	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;
855 
856 	if (!tsc_delta)
857 		return;
858 
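	/* Scale the ref-cycles bounds by the observed ref-cycles : TSC ratio. */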
859 	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
860 		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
861 	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
862 		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
863 }
864 
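/* RDPMC with an out-of-range index must #GP; 64 is beyond any counter type. */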
865 static void check_invalid_rdpmc_gp(void)
866 {
867 	uint64_t val;
868 
869 	report(rdpmc_safe(64, &val) == GP_VECTOR,
870 	       "Expected #GP on RDPMC(64)");
871 }
872 
873 int main(int ac, char **av)
874 {
875 	int instruction_idx;
876 	int branch_idx;
877 
878 	setup_vm();
879 	handle_irq(PMI_VECTOR, cnt_overflow);
880 	buf = malloc(N*64);
881 
882 	check_invalid_rdpmc_gp();
883 
884 	if (pmu.is_intel) {
885 		if (!pmu.version) {
886 			report_skip("No Intel Arch PMU is detected!");
887 			return report_summary();
888 		}
889 		gp_events = (struct pmu_event *)intel_gp_events;
890 		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
891 		instruction_idx = INTEL_INSTRUCTIONS_IDX;
892 		branch_idx = INTEL_BRANCHES_IDX;
893 
894 		/*
895 		 * For legacy Intel CPUs without clflush/clflushopt support,
896 		 * there is no reliable way to force an LLC miss, so set the
897 		 * minimum value to 0 to avoid false positives.
898 		 */
899 		if (!this_cpu_has(X86_FEATURE_CLFLUSH))
900 			gp_events[INTEL_LLC_MISSES_IDX].min = 0;
901 
902 		report_prefix_push("Intel");
903 		set_ref_cycle_expectations();
904 	} else {
905 		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
906 		gp_events = (struct pmu_event *)amd_gp_events;
907 		instruction_idx = AMD_INSTRUCTIONS_IDX;
908 		branch_idx = AMD_BRANCHES_IDX;
909 		report_prefix_push("AMD");
910 	}
911 	adjust_events_range(gp_events, instruction_idx, branch_idx);
912 
913 	printf("PMU version:         %d\n", pmu.version);
914 	printf("GP counters:         %d\n", pmu.nr_gp_counters);
915 	printf("GP counter width:    %d\n", pmu.gp_counter_width);
916 	printf("Mask length:         %d\n", pmu.gp_counter_mask_length);
917 	printf("Fixed counters:      %d\n", pmu.nr_fixed_counters);
918 	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);
919 
920 	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
921 	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
922 		report_info("Fixed counters number %d > defined fixed events %u.  "
923 			    "Please update test case.", pmu.nr_fixed_counters,
924 			    (uint32_t)ARRAY_SIZE(fixed_events));
925 
926 	apic_write(APIC_LVTPC, PMI_VECTOR);
927 
928 	check_counters();
929 
930 	if (pmu_has_full_writes()) {
931 		pmu.msr_gp_counter_base = MSR_IA32_PMC0;
932 
933 		report_prefix_push("full-width writes");
934 		check_counters();
935 		check_gp_counters_write_width();
936 		report_prefix_pop();
937 	}
938 
939 	if (!pmu.is_intel) {
940 		report_prefix_push("K7");
941 		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
942 		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
943 		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
944 		check_counters();
945 		report_prefix_pop();
946 	}
947 
948 	return report_summary();
949 }
950