xref: /kvm-unit-tests/x86/pmu.c (revision 38b5b42631c22610ca48adae92951e5c47037dd2)
1 
2 #include "x86/msr.h"
3 #include "x86/processor.h"
4 #include "x86/pmu.h"
5 #include "x86/apic-defs.h"
6 #include "x86/apic.h"
7 #include "x86/desc.h"
8 #include "x86/isr.h"
9 #include "vmalloc.h"
10 #include "alloc.h"
11 
12 #include "libcflat.h"
13 #include <stdint.h>
14 
/*
 * Iteration count of the measured loop.  It also sizes the buffer the loop
 * walks (64 bytes per iteration, see LOOP_ASM and the allocation in main()).
 */
#define N 1000000

// These values match the number of instructions and branches in the
// assembly block in check_emulated_instr().
#define EXPECTED_INSTR 17
#define EXPECTED_BRNCH 5

/* Enable GLOBAL_CTRL + disable GLOBAL_CTRL + clflush/mfence instructions */
#define EXTRA_INSNS  (3 + 3 +2)
/*
 * Exact instruction count of one LOOP_ASM run: every iteration executes ten
 * instructions (load, add, seven nops, loop), plus the fixed overhead above.
 */
#define LOOP_INSNS   (N * 10 + EXTRA_INSNS)
/* Exact branch count of one LOOP_ASM run: one "loop" per iteration. */
#define LOOP_BRANCHES  (N)
/*
 * Body of the measured loop.  @_wrmsr is emitted once before and once after
 * the loop (callers pass "wrmsr" or "nop"); @_clflush is emitted once before
 * the loop (callers pass "clflush (%1)" or "nop").
 *
 * On entry ECX is saved in EDI and replaced by the iteration count taken from
 * EBX (operand 0).  Each iteration loads from the address in operand 1,
 * advances that address by 64 bytes, runs seven nops and ends with "loop".
 * On exit ECX is restored from EDI and EAX/EDX are zeroed before the trailing
 * @_wrmsr.
 */
#define LOOP_ASM(_wrmsr, _clflush)					\
	_wrmsr "\n\t"							\
	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t"			\
	_clflush "\n\t"                                 		\
	"mfence;\n\t"                                   		\
	"1: mov (%1), %2; add $64, %1;\n\t"				\
	"nop; nop; nop; nop; nop; nop; nop;\n\t"			\
	"loop 1b;\n\t"							\
	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t"	\
	_wrmsr "\n\t"

/*
 * Wrap LOOP_ASM in an asm statement.  The enclosing function must declare
 * tmp, tmp2, tmp3 (outputs) and eax, edx, global_ctl (inputs loaded into
 * EAX, EDX and ECX); the file-scope buf is the start address of the walk.
 * EDI is listed as clobbered because LOOP_ASM uses it to save ECX.
 */
#define _loop_asm(_wrmsr, _clflush)				\
do {								\
	asm volatile(LOOP_ASM(_wrmsr, _clflush)			\
		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)	\
		     : "a"(eax), "d"(edx), "c"(global_ctl),	\
		       "0"(N), "1"(buf)				\
		     : "edi");					\
} while (0)
45 
/* Software description of one counter as driven by the tests in this file. */
typedef struct {
	uint32_t ctr;		/* MSR address of the counter itself */
	uint32_t idx;		/* bit index in the global control MSR; also used as RDPMC index */
	uint64_t config;	/* event select value (EVNTSEL_* flags | event encoding) */
	uint64_t count;		/* preset written on start, value read back on stop */
} pmu_counter_t;
52 
/*
 * Expected result window for one event when measured over the loop.
 *
 * For the GP tables, unit_sel is the event encoding that is matched against
 * the low 16 bits of a counter's config.  For fixed_events[], unit_sel holds
 * the MSR address of the fixed counter instead.  min/max are the inclusive
 * bounds checked by verify_event(); some of them are rewritten at run time by
 * adjust_events_range() and set_ref_cycle_expectations().
 */
struct pmu_event {
	const char *name;
	uint32_t unit_sel;
	int min;
	int max;
} intel_gp_events[] = {
	{"core cycles", 0x003c, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"ref cycles", 0x013c, 1*N, 30*N},
	{"llc references", 0x4f2e, 1, 2*N},
	{"llc misses", 0x412e, 1, 1*N},
	{"branches", 0x00c4, 1*N, 1.1*N},
	{"branch misses", 0x00c5, 0, 0.1*N},
}, amd_gp_events[] = {
	{"core cycles", 0x0076, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"branches", 0x00c2, 1*N, 1.1*N},
	{"branch misses", 0x00c3, 0, 0.1*N},
}, fixed_events[] = {
	{"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
};
76 
/*
 * Indices of selected events in intel_gp_events[].  These must be kept in
 * sync with the order of the entries in intel_gp_events[].
 */
enum {
	INTEL_INSTRUCTIONS_IDX  = 1,
	INTEL_REF_CYCLES_IDX	= 2,
	INTEL_BRANCHES_IDX	= 5,
};
86 
/*
 * Indices of selected events in amd_gp_events[].  These must be kept in
 * sync with the order of the entries in amd_gp_events[].
 */
enum {
	AMD_INSTRUCTIONS_IDX    = 1,
	AMD_BRANCHES_IDX	= 2,
};
95 
/* Memory walked by the measured loop; allocated in main() as N * 64 bytes. */
char *buf;

/* Vendor-specific GP event table selected in main(), and its entry count. */
static struct pmu_event *gp_events;
static unsigned int gp_events_size;
/* Number of fixed counters exercised: capped at ARRAY_SIZE(fixed_events). */
static unsigned int fixed_counters_num;
101 
/*
 * Run the measured loop without touching any MSR: both _wrmsr slots of
 * LOOP_ASM are filled with "nop", so global_ctl/eax/edx are only placeholder
 * register inputs.  When the CPU advertises CLFLUSH, the line at the start of
 * buf is flushed before the loop; otherwise a nop takes its place so the
 * instruction count stays the same.
 *
 * The local names below are referenced by the _loop_asm() macro and must not
 * be renamed.
 */
static inline void __loop(void)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = 0;
	u32 eax = 0;
	u32 edx = 0;

	if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("nop", "clflush (%1)");
	else
		_loop_asm("nop", "nop");
}
114 
/*
 * Enable and disable counters in a whole asm blob to ensure
 * no other instructions are counted in the window between
 * counters enabling and really LOOP_ASM code executing.
 * Thus counters can verify instructions and branches events
 * against precise counts instead of a rough valid count range.
 *
 * @cntrs: bitmask written to the global control MSR by the leading wrmsr
 *         (low half in EAX, high half in EDX).  LOOP_ASM zeroes EAX/EDX
 *         before the trailing wrmsr, which therefore writes 0.
 *
 * The local names below are referenced by the _loop_asm() macro and must not
 * be renamed.
 */
static inline void __precise_loop(u64 cntrs)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = pmu.msr_global_ctl;
	u32 eax = cntrs & (BIT_ULL(32) - 1);
	u32 edx = cntrs >> 32;

	if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("wrmsr", "clflush (%1)");
	else
		_loop_asm("wrmsr", "nop");
}
134 
135 static inline void loop(u64 cntrs)
136 {
137 	if (!this_cpu_has_perf_global_ctrl())
138 		__loop();
139 	else
140 		__precise_loop(cntrs);
141 }
142 
143 static void adjust_events_range(struct pmu_event *gp_events,
144 				int instruction_idx, int branch_idx)
145 {
146 	/*
147 	 * If HW supports GLOBAL_CTRL MSR, enabling and disabling PMCs are
148 	 * moved in __precise_loop(). Thus, instructions and branches events
149 	 * can be verified against a precise count instead of a rough range.
150 	 *
151 	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
152 	 * instruction in guest context, which* leads to intermittent failures
153 	 * as the counts will vary depending on how many asynchronous VM-Exits
154 	 * occur while running the measured code, e.g. if the host takes IRQs.
155 	 */
156 	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
157 		gp_events[instruction_idx].min = LOOP_INSNS;
158 		gp_events[instruction_idx].max = LOOP_INSNS;
159 		gp_events[branch_idx].min = LOOP_BRANCHES;
160 		gp_events[branch_idx].max = LOOP_BRANCHES;
161 	}
162 }
163 
/* Incremented by cnt_overflow(); reset and polled by check_irq(). */
volatile uint64_t irq_received;
165 
166 static void cnt_overflow(isr_regs_t *regs)
167 {
168 	irq_received++;
169 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
170 	apic_write(APIC_EOI, 0);
171 }
172 
173 static bool check_irq(void)
174 {
175 	int i;
176 	irq_received = 0;
177 	sti();
178 	for (i = 0; i < 100000 && !irq_received; i++)
179 		asm volatile("pause");
180 	cli();
181 	return irq_received;
182 }
183 
184 static bool is_gp(pmu_counter_t *evt)
185 {
186 	if (!pmu.is_intel)
187 		return true;
188 
189 	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
190 		evt->ctr >= MSR_IA32_PMC0;
191 }
192 
193 static int event_to_global_idx(pmu_counter_t *cnt)
194 {
195 	if (pmu.is_intel)
196 		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
197 			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
198 
199 	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
200 		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
201 	else
202 		return cnt->ctr - pmu.msr_gp_counter_base;
203 }
204 
205 static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
206 {
207 	if (is_gp(cnt)) {
208 		int i;
209 
210 		for (i = 0; i < gp_events_size; i++)
211 			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
212 				return &gp_events[i];
213 	} else {
214 		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;
215 
216 		if (idx < ARRAY_SIZE(fixed_events))
217 			return &fixed_events[idx];
218 	}
219 
220 	return (void*)0;
221 }
222 
223 static void global_enable(pmu_counter_t *cnt)
224 {
225 	if (!this_cpu_has_perf_global_ctrl())
226 		return;
227 
228 	cnt->idx = event_to_global_idx(cnt);
229 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
230 }
231 
232 static void global_disable(pmu_counter_t *cnt)
233 {
234 	if (!this_cpu_has_perf_global_ctrl())
235 		return;
236 
237 	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
238 }
239 
240 static void __start_event(pmu_counter_t *evt, uint64_t count)
241 {
242     evt->count = count;
243     wrmsr(evt->ctr, evt->count);
244     if (is_gp(evt)) {
245 	    wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
246 		  evt->config | EVNTSEL_EN);
247     } else {
248 	    uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
249 	    int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
250 	    uint32_t usrospmi = 0;
251 
252 	    if (evt->config & EVNTSEL_OS)
253 		    usrospmi |= (1 << 0);
254 	    if (evt->config & EVNTSEL_USR)
255 		    usrospmi |= (1 << 1);
256 	    if (evt->config & EVNTSEL_INT)
257 		    usrospmi |= (1 << 3); // PMI on overflow
258 	    ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
259 	    wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
260     }
261     apic_write(APIC_LVTPC, PMI_VECTOR);
262 }
263 
264 static void start_event(pmu_counter_t *evt)
265 {
266 	__start_event(evt, 0);
267 	global_enable(evt);
268 }
269 
270 static void __stop_event(pmu_counter_t *evt)
271 {
272 	if (is_gp(evt)) {
273 		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
274 		      evt->config & ~EVNTSEL_EN);
275 	} else {
276 		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
277 		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
278 		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
279 	}
280 	evt->count = rdmsr(evt->ctr);
281 }
282 
283 static void stop_event(pmu_counter_t *evt)
284 {
285 	global_disable(evt);
286 	__stop_event(evt);
287 }
288 
289 static noinline void measure_many(pmu_counter_t *evt, int count)
290 {
291 	int i;
292 	u64 cntrs = 0;
293 
294 	for (i = 0; i < count; i++) {
295 		__start_event(&evt[i], 0);
296 		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
297 	}
298 	loop(cntrs);
299 	for (i = 0; i < count; i++)
300 		__stop_event(&evt[i]);
301 }
302 
303 static void measure_one(pmu_counter_t *evt)
304 {
305 	measure_many(evt, 1);
306 }
307 
308 static noinline void __measure(pmu_counter_t *evt, uint64_t count)
309 {
310 	u64 cntrs = BIT_ULL(event_to_global_idx(evt));
311 
312 	__start_event(evt, count);
313 	loop(cntrs);
314 	__stop_event(evt);
315 }
316 
317 static bool verify_event(uint64_t count, struct pmu_event *e)
318 {
319 	bool pass;
320 
321 	if (!e)
322 		return false;
323 
324 	pass = count >= e->min && count <= e->max;
325 	if (!pass)
326 		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);
327 
328 	return pass;
329 }
330 
331 static bool verify_counter(pmu_counter_t *cnt)
332 {
333 	return verify_event(cnt->count, get_counter_event(cnt));
334 }
335 
336 static void check_gp_counter(struct pmu_event *evt)
337 {
338 	pmu_counter_t cnt = {
339 		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
340 	};
341 	int i;
342 
343 	for (i = 0; i < pmu.nr_gp_counters; i++) {
344 		cnt.ctr = MSR_GP_COUNTERx(i);
345 		measure_one(&cnt);
346 		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
347 	}
348 }
349 
350 static void check_gp_counters(void)
351 {
352 	int i;
353 
354 	for (i = 0; i < gp_events_size; i++)
355 		if (pmu_gp_counter_is_available(i))
356 			check_gp_counter(&gp_events[i]);
357 		else
358 			printf("GP event '%s' is disabled\n",
359 					gp_events[i].name);
360 }
361 
362 static void check_fixed_counters(void)
363 {
364 	pmu_counter_t cnt = {
365 		.config = EVNTSEL_OS | EVNTSEL_USR,
366 	};
367 	int i;
368 
369 	for (i = 0; i < fixed_counters_num; i++) {
370 		cnt.ctr = fixed_events[i].unit_sel;
371 		measure_one(&cnt);
372 		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
373 	}
374 }
375 
376 static void check_counters_many(void)
377 {
378 	pmu_counter_t cnt[48];
379 	int i, n;
380 
381 	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
382 		if (!pmu_gp_counter_is_available(i))
383 			continue;
384 
385 		cnt[n].ctr = MSR_GP_COUNTERx(n);
386 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR |
387 			gp_events[i % gp_events_size].unit_sel;
388 		n++;
389 	}
390 	for (i = 0; i < fixed_counters_num; i++) {
391 		cnt[n].ctr = fixed_events[i].unit_sel;
392 		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
393 		n++;
394 	}
395 
396 	assert(n <= ARRAY_SIZE(cnt));
397 	measure_many(cnt, n);
398 
399 	for (i = 0; i < n; i++)
400 		if (!verify_counter(&cnt[i]))
401 			break;
402 
403 	report(i == n, "all counters");
404 }
405 
406 static uint64_t measure_for_overflow(pmu_counter_t *cnt)
407 {
408 	__measure(cnt, 0);
409 	/*
410 	 * To generate overflow, i.e. roll over to '0', the initial count just
411 	 * needs to be preset to the negative expected count.  However, as per
412 	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
413 	 * the overflow interrupt is generated immediately instead of possibly
414 	 * waiting for the overflow to propagate through the counter.
415 	 */
416 	assert(cnt->count > 1);
417 	return 1 - cnt->count;
418 }
419 
420 static void check_counter_overflow(void)
421 {
422 	int i;
423 	uint64_t overflow_preset;
424 	int instruction_idx = pmu.is_intel ?
425 			      INTEL_INSTRUCTIONS_IDX :
426 			      AMD_INSTRUCTIONS_IDX;
427 
428 	pmu_counter_t cnt = {
429 		.ctr = MSR_GP_COUNTERx(0),
430 		.config = EVNTSEL_OS | EVNTSEL_USR |
431 			  gp_events[instruction_idx].unit_sel /* instructions */,
432 	};
433 	overflow_preset = measure_for_overflow(&cnt);
434 
435 	/* clear status before test */
436 	if (this_cpu_has_perf_global_status())
437 		pmu_clear_global_status();
438 
439 	report_prefix_push("overflow");
440 
441 	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
442 		uint64_t status;
443 		int idx;
444 
445 		cnt.count = overflow_preset;
446 		if (pmu_use_full_writes())
447 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
448 
449 		if (i == pmu.nr_gp_counters) {
450 			if (!pmu.is_intel)
451 				break;
452 
453 			cnt.ctr = fixed_events[0].unit_sel;
454 			cnt.count = measure_for_overflow(&cnt);
455 			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
456 		} else {
457 			cnt.ctr = MSR_GP_COUNTERx(i);
458 		}
459 
460 		if (i % 2)
461 			cnt.config |= EVNTSEL_INT;
462 		else
463 			cnt.config &= ~EVNTSEL_INT;
464 		idx = event_to_global_idx(&cnt);
465 		__measure(&cnt, cnt.count);
466 		if (pmu.is_intel)
467 			report(cnt.count == 1, "cntr-%d", i);
468 		else
469 			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);
470 
471 		if (!this_cpu_has_perf_global_status())
472 			continue;
473 
474 		status = rdmsr(pmu.msr_global_status);
475 		report(status & (1ull << idx), "status-%d", i);
476 		wrmsr(pmu.msr_global_status_clr, status);
477 		status = rdmsr(pmu.msr_global_status);
478 		report(!(status & (1ull << idx)), "status clear-%d", i);
479 		report(check_irq() == (i % 2), "irq-%d", i);
480 	}
481 
482 	report_prefix_pop();
483 }
484 
485 static void check_gp_counter_cmask(void)
486 {
487 	int instruction_idx = pmu.is_intel ?
488 			      INTEL_INSTRUCTIONS_IDX :
489 			      AMD_INSTRUCTIONS_IDX;
490 
491 	pmu_counter_t cnt = {
492 		.ctr = MSR_GP_COUNTERx(0),
493 		.config = EVNTSEL_OS | EVNTSEL_USR |
494 			  gp_events[instruction_idx].unit_sel /* instructions */,
495 	};
496 	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
497 	measure_one(&cnt);
498 	report(cnt.count < gp_events[instruction_idx].min, "cmask");
499 }
500 
501 static void do_rdpmc_fast(void *ptr)
502 {
503 	pmu_counter_t *cnt = ptr;
504 	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);
505 
506 	if (!is_gp(cnt))
507 		idx |= 1 << 30;
508 
509 	cnt->count = rdpmc(idx);
510 }
511 
512 
513 static void check_rdpmc(void)
514 {
515 	uint64_t val = 0xff0123456789ull;
516 	bool exc;
517 	int i;
518 
519 	report_prefix_push("rdpmc");
520 
521 	for (i = 0; i < pmu.nr_gp_counters; i++) {
522 		uint64_t x;
523 		pmu_counter_t cnt = {
524 			.ctr = MSR_GP_COUNTERx(i),
525 			.idx = i
526 		};
527 
528 	        /*
529 	         * Without full-width writes, only the low 32 bits are writable,
530 	         * and the value is sign-extended.
531 	         */
532 		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
533 			x = (uint64_t)(int64_t)(int32_t)val;
534 		else
535 			x = (uint64_t)(int64_t)val;
536 
537 		/* Mask according to the number of supported bits */
538 		x &= (1ull << pmu.gp_counter_width) - 1;
539 
540 		wrmsr(MSR_GP_COUNTERx(i), val);
541 		report(rdpmc(i) == x, "cntr-%d", i);
542 
543 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
544 		if (exc)
545 			report_skip("fast-%d", i);
546 		else
547 			report(cnt.count == (u32)val, "fast-%d", i);
548 	}
549 	for (i = 0; i < fixed_counters_num; i++) {
550 		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
551 		pmu_counter_t cnt = {
552 			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
553 			.idx = i
554 		};
555 
556 		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
557 		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);
558 
559 		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
560 		if (exc)
561 			report_skip("fixed fast-%d", i);
562 		else
563 			report(cnt.count == (u32)x, "fixed fast-%d", i);
564 	}
565 
566 	report_prefix_pop();
567 }
568 
569 static void check_running_counter_wrmsr(void)
570 {
571 	uint64_t status;
572 	uint64_t count;
573 	unsigned int instruction_idx = pmu.is_intel ?
574 				       INTEL_INSTRUCTIONS_IDX :
575 				       AMD_INSTRUCTIONS_IDX;
576 
577 	pmu_counter_t evt = {
578 		.ctr = MSR_GP_COUNTERx(0),
579 		.config = EVNTSEL_OS | EVNTSEL_USR |
580 			  gp_events[instruction_idx].unit_sel,
581 	};
582 
583 	report_prefix_push("running counter wrmsr");
584 
585 	start_event(&evt);
586 	__loop();
587 	wrmsr(MSR_GP_COUNTERx(0), 0);
588 	stop_event(&evt);
589 	report(evt.count < gp_events[instruction_idx].min, "cntr");
590 
591 	/* clear status before overflow test */
592 	if (this_cpu_has_perf_global_status())
593 		pmu_clear_global_status();
594 
595 	start_event(&evt);
596 
597 	count = -1;
598 	if (pmu_use_full_writes())
599 		count &= (1ull << pmu.gp_counter_width) - 1;
600 
601 	wrmsr(MSR_GP_COUNTERx(0), count);
602 
603 	__loop();
604 	stop_event(&evt);
605 
606 	if (this_cpu_has_perf_global_status()) {
607 		status = rdmsr(pmu.msr_global_status);
608 		report(status & 1, "status msr bit");
609 	}
610 
611 	report_prefix_pop();
612 }
613 
614 static void check_emulated_instr(void)
615 {
616 	uint64_t status, instr_start, brnch_start;
617 	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
618 	unsigned int branch_idx = pmu.is_intel ?
619 				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
620 	unsigned int instruction_idx = pmu.is_intel ?
621 				       INTEL_INSTRUCTIONS_IDX :
622 				       AMD_INSTRUCTIONS_IDX;
623 	pmu_counter_t brnch_cnt = {
624 		.ctr = MSR_GP_COUNTERx(0),
625 		/* branch instructions */
626 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
627 	};
628 	pmu_counter_t instr_cnt = {
629 		.ctr = MSR_GP_COUNTERx(1),
630 		/* instructions */
631 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
632 	};
633 	report_prefix_push("emulated instruction");
634 
635 	if (this_cpu_has_perf_global_status())
636 		pmu_clear_global_status();
637 
638 	start_event(&brnch_cnt);
639 	start_event(&instr_cnt);
640 
641 	brnch_start = -EXPECTED_BRNCH;
642 	instr_start = -EXPECTED_INSTR;
643 	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
644 	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);
645 	// KVM_FEP is a magic prefix that forces emulation so
646 	// 'KVM_FEP "jne label\n"' just counts as a single instruction.
647 	asm volatile(
648 		"mov $0x0, %%eax\n"
649 		"cmp $0x0, %%eax\n"
650 		KVM_FEP "jne label\n"
651 		KVM_FEP "jne label\n"
652 		KVM_FEP "jne label\n"
653 		KVM_FEP "jne label\n"
654 		KVM_FEP "jne label\n"
655 		"mov $0xa, %%eax\n"
656 		"cpuid\n"
657 		"mov $0xa, %%eax\n"
658 		"cpuid\n"
659 		"mov $0xa, %%eax\n"
660 		"cpuid\n"
661 		"mov $0xa, %%eax\n"
662 		"cpuid\n"
663 		"mov $0xa, %%eax\n"
664 		"cpuid\n"
665 		"label:\n"
666 		:
667 		:
668 		: "eax", "ebx", "ecx", "edx");
669 
670 	if (this_cpu_has_perf_global_ctrl())
671 		wrmsr(pmu.msr_global_ctl, 0);
672 
673 	stop_event(&brnch_cnt);
674 	stop_event(&instr_cnt);
675 
676 	// Check that the end count - start count is at least the expected
677 	// number of instructions and branches.
678 	report(instr_cnt.count - instr_start >= EXPECTED_INSTR,
679 	       "instruction count");
680 	report(brnch_cnt.count - brnch_start >= EXPECTED_BRNCH,
681 	       "branch count");
682 	if (this_cpu_has_perf_global_status()) {
683 		// Additionally check that those counters overflowed properly.
684 		status = rdmsr(pmu.msr_global_status);
685 		report(status & 1, "branch counter overflow");
686 		report(status & 2, "instruction counter overflow");
687 	}
688 
689 	report_prefix_pop();
690 }
691 
692 #define XBEGIN_STARTED (~0u)
693 static void check_tsx_cycles(void)
694 {
695 	pmu_counter_t cnt;
696 	unsigned int i, ret = 0;
697 
698 	if (!this_cpu_has(X86_FEATURE_RTM))
699 		return;
700 
701 	report_prefix_push("TSX cycles");
702 
703 	for (i = 0; i < pmu.nr_gp_counters; i++) {
704 		cnt.ctr = MSR_GP_COUNTERx(i);
705 
706 		if (i == 2) {
707 			/* Transactional cycles committed only on gp counter 2 */
708 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
709 		} else {
710 			/* Transactional cycles */
711 			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
712 		}
713 
714 		start_event(&cnt);
715 
716 		asm volatile("xbegin 1f\n\t"
717 				"1:\n\t"
718 				: "+a" (ret) :: "memory");
719 
720 		/* Generate a non-canonical #GP to trigger ABORT. */
721 		if (ret == XBEGIN_STARTED)
722 			*(int *)NONCANONICAL = 0;
723 
724 		stop_event(&cnt);
725 
726 		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
727 	}
728 
729 	report_prefix_pop();
730 }
731 
/*
 * Run the measured loop ten times with no counters enabled.
 *
 * The cycles event is always measured first, so without this its run would
 * also pay for warming up the cache; the measured cycles could then exceed
 * the pre-defined upper bound and fail spuriously.
 */
static void warm_up(void)
{
	int remaining = 10;

	while (remaining-- > 0)
		loop(0);
}
746 
/*
 * Run the full counter test sequence against the currently selected counter
 * MSR bases.  main() calls this once per counter interface it exercises.
 */
static void check_counters(void)
{
	/* The emulation test needs KVM's forced-emulation prefix. */
	if (is_fep_available()) {
		check_emulated_instr();
	}

	/* Warm up before the range-checked measurements start. */
	warm_up();

	check_gp_counters();
	check_fixed_counters();
	check_rdpmc();
	check_counters_many();
	check_counter_overflow();
	check_gp_counter_cmask();
	check_running_counter_wrmsr();
	check_tsx_cycles();
}
762 
763 static void do_unsupported_width_counter_write(void *index)
764 {
765 	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
766 }
767 
/*
 * Verify the write semantics of the two GP counter MSR ranges with three
 * patterns: the low 32 bits only (val_32), the pattern truncated to the GP
 * counter width (val_max_width) and the full 64-bit pattern (val_64).
 *
 * Violations of the read-back expectations trip an assert; only the #GP
 * check for over-wide writes to MSR_IA32_PMCn is reported as a test result.
 */
static void check_gp_counters_write_width(void)
{
	u64 val_64 = 0xffffff0123456789ull;
	u64 val_32 = val_64 & ((1ull << 32) - 1);
	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
	int i;

	/*
	 * MSR_IA32_PERFCTRn supports 64-bit writes,
	 * but only the lowest 32 bits are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		/* Whatever is written, both MSR views must read back val_32. */
		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
	}

	/*
	 * MSR_IA32_PMCn supports writing values up to GP counter width,
	 * and only the lowest bits of GP counter width are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PMC0 + i, val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);

		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);

		/* A value wider than the counter must be rejected with #GP. */
		report(test_for_exception(GP_VECTOR,
			do_unsupported_width_counter_write, &i),
		"writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
	}
}
811 
812 /*
813  * Per the SDM, reference cycles are currently implemented using the
814  * core crystal clock, TSC, or bus clock. Calibrate to the TSC
815  * frequency to set reasonable expectations.
816  */
817 static void set_ref_cycle_expectations(void)
818 {
819 	pmu_counter_t cnt = {
820 		.ctr = MSR_IA32_PERFCTR0,
821 		.config = EVNTSEL_OS | EVNTSEL_USR |
822 			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
823 	};
824 	uint64_t tsc_delta;
825 	uint64_t t0, t1, t2, t3;
826 
827 	/* Bit 2 enumerates the availability of reference cycles events. */
828 	if (!pmu.nr_gp_counters || !pmu_gp_counter_is_available(2))
829 		return;
830 
831 	if (this_cpu_has_perf_global_ctrl())
832 		wrmsr(pmu.msr_global_ctl, 0);
833 
834 	t0 = fenced_rdtsc();
835 	start_event(&cnt);
836 	t1 = fenced_rdtsc();
837 
838 	/*
839 	 * This loop has to run long enough to dominate the VM-exit
840 	 * costs for playing with the PMU MSRs on start and stop.
841 	 *
842 	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
843 	 * the core crystal clock, this function calculated a guest
844 	 * TSC : ref cycles ratio of around 105 with ECX initialized
845 	 * to one billion.
846 	 */
847 	asm volatile("loop ." : "+c"((int){1000000000ull}));
848 
849 	t2 = fenced_rdtsc();
850 	stop_event(&cnt);
851 	t3 = fenced_rdtsc();
852 
853 	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;
854 
855 	if (!tsc_delta)
856 		return;
857 
858 	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
859 		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
860 	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
861 		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
862 }
863 
864 static void check_invalid_rdpmc_gp(void)
865 {
866 	uint64_t val;
867 
868 	report(rdpmc_safe(64, &val) == GP_VECTOR,
869 	       "Expected #GP on RDPMC(64)");
870 }
871 
872 int main(int ac, char **av)
873 {
874 	int instruction_idx;
875 	int branch_idx;
876 
877 	setup_vm();
878 	handle_irq(PMI_VECTOR, cnt_overflow);
879 	buf = malloc(N*64);
880 
881 	check_invalid_rdpmc_gp();
882 
883 	if (pmu.is_intel) {
884 		if (!pmu.version) {
885 			report_skip("No Intel Arch PMU is detected!");
886 			return report_summary();
887 		}
888 		gp_events = (struct pmu_event *)intel_gp_events;
889 		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
890 		instruction_idx = INTEL_INSTRUCTIONS_IDX;
891 		branch_idx = INTEL_BRANCHES_IDX;
892 		report_prefix_push("Intel");
893 		set_ref_cycle_expectations();
894 	} else {
895 		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
896 		gp_events = (struct pmu_event *)amd_gp_events;
897 		instruction_idx = AMD_INSTRUCTIONS_IDX;
898 		branch_idx = AMD_BRANCHES_IDX;
899 		report_prefix_push("AMD");
900 	}
901 	adjust_events_range(gp_events, instruction_idx, branch_idx);
902 
903 	printf("PMU version:         %d\n", pmu.version);
904 	printf("GP counters:         %d\n", pmu.nr_gp_counters);
905 	printf("GP counter width:    %d\n", pmu.gp_counter_width);
906 	printf("Mask length:         %d\n", pmu.gp_counter_mask_length);
907 	printf("Fixed counters:      %d\n", pmu.nr_fixed_counters);
908 	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);
909 
910 	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
911 	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
912 		report_info("Fixed counters number %d > defined fixed events %u.  "
913 			    "Please update test case.", pmu.nr_fixed_counters,
914 			    (uint32_t)ARRAY_SIZE(fixed_events));
915 
916 	apic_write(APIC_LVTPC, PMI_VECTOR);
917 
918 	check_counters();
919 
920 	if (pmu_has_full_writes()) {
921 		pmu.msr_gp_counter_base = MSR_IA32_PMC0;
922 
923 		report_prefix_push("full-width writes");
924 		check_counters();
925 		check_gp_counters_write_width();
926 		report_prefix_pop();
927 	}
928 
929 	if (!pmu.is_intel) {
930 		report_prefix_push("K7");
931 		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
932 		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
933 		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
934 		check_counters();
935 		report_prefix_pop();
936 	}
937 
938 	return report_summary();
939 }
940