#include "x86/msr.h"
#include "x86/processor.h"
#include "x86/pmu.h"
#include "x86/apic-defs.h"
#include "x86/apic.h"
#include "x86/desc.h"
#include "x86/isr.h"
#include "vmalloc.h"
#include "alloc.h"

#include "libcflat.h"
#include <stdint.h>

#define N 1000000

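/*
 * IBPB_JMP_ASM issues an IBPB (WRMSR to IA32_PRED_CMD, MSR 73/0x49, with
 * EAX = 1) when _wrmsr is "wrmsr", flushing indirect branch predictions so
 * that the indirect JMP which follows is expected to mispredict and give
 * the branch-misses event something to count.  The blob retires
 * IBPB_JMP_INSNS instructions (the NOP that the indirect JMP skips over is
 * never executed) and IBPB_JMP_BRANCHES branches (the CALL and the JMP).
 */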
#define IBPB_JMP_INSNS 9
#define IBPB_JMP_BRANCHES 2

#if defined(__i386__) || defined(_M_IX86) /* i386 */
#define IBPB_JMP_ASM(_wrmsr)				\
	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
	"mov $73, %%ecx;\n\t"				\
	_wrmsr "\n\t"					\
	"call 1f\n\t"					\
	"1: pop %%eax\n\t"				\
	"add $(2f-1b), %%eax\n\t"			\
	"jmp *%%eax;\n\t"				\
	"nop;\n\t"					\
	"2: nop;\n\t"
#else /* x86_64 */
#define IBPB_JMP_ASM(_wrmsr)				\
	"mov $1, %%eax; xor %%edx, %%edx;\n\t"		\
	"mov $73, %%ecx;\n\t"				\
	_wrmsr "\n\t"					\
	"call 1f\n\t"					\
	"1: pop %%rax\n\t"				\
	"add $(2f-1b), %%rax\n\t"			\
	"jmp *%%rax;\n\t"				\
	"nop;\n\t"					\
	"2: nop;\n\t"
#endif

/* GLOBAL_CTRL enable + disable + clflush/mfence + IBPB_JMP */
#define EXTRA_INSNS (3 + 3 + 2 + IBPB_JMP_INSNS)
#define LOOP_INSNS (N * 10 + EXTRA_INSNS)
#define LOOP_BRANCHES (N + IBPB_JMP_BRANCHES)
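/*
 * LOOP_ASM structure: when _wrmsr1 is "wrmsr", GLOBAL_CTRL is written to
 * enable the counters on entry and (with EAX/EDX zeroed at the end) disable
 * them on exit; the buffer is walked in 64-byte strides with an optional
 * clflush + mfence so the LLC reference/miss events have work to count, and
 * IBPB_JMP_ASM optionally forces a branch miss.  Each of the N loop
 * iterations retires exactly 10 instructions (mov + add + 7 nops + loop),
 * which is what LOOP_INSNS and LOOP_BRANCHES account for.
 */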
#define LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)				\
	_wrmsr1 "\n\t"							\
	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t"			\
	_clflush "\n\t"							\
	"mfence;\n\t"							\
	"1: mov (%1), %2; add $64, %1;\n\t"				\
	"nop; nop; nop; nop; nop; nop; nop;\n\t"			\
	"loop 1b;\n\t"							\
	IBPB_JMP_ASM(_wrmsr2)						\
	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t"	\
	_wrmsr1 "\n\t"

#define _loop_asm(_wrmsr1, _clflush, _wrmsr2)			\
do {								\
	asm volatile(LOOP_ASM(_wrmsr1, _clflush, _wrmsr2)	\
		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)	\
		     : "a"(eax), "d"(edx), "c"(global_ctl),	\
		       "0"(N), "1"(buf)				\
		     : "edi");					\
} while (0)

/* the number of instructions and branches of the kvm_fep_asm() blob */
#define KVM_FEP_INSNS 22
#define KVM_FEP_BRANCHES 5
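/*
 * The 22 instructions cover everything from the first "mov %ecx, %edi"
 * through the final counter-disabling WRMSR; the counts assume the enabling
 * WRMSR itself is not counted, as the counters only start once it completes.
 * The 5 branches are the five forced-emulation JNEs, none of which is taken.
 */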

/*
 * KVM_FEP is a magic prefix that forces emulation so
 * 'KVM_FEP "jne label\n"' just counts as a single instruction.
 */
#define kvm_fep_asm(_wrmsr)				\
do {							\
	asm volatile(					\
		_wrmsr "\n\t"				\
		"mov %%ecx, %%edi;\n\t"			\
		"mov $0x0, %%eax;\n\t"			\
		"cmp $0x0, %%eax;\n\t"			\
		KVM_FEP "jne 1f\n\t"			\
		KVM_FEP "jne 1f\n\t"			\
		KVM_FEP "jne 1f\n\t"			\
		KVM_FEP "jne 1f\n\t"			\
		KVM_FEP "jne 1f\n\t"			\
		"mov $0xa, %%eax; cpuid;\n\t"		\
		"mov $0xa, %%eax; cpuid;\n\t"		\
		"mov $0xa, %%eax; cpuid;\n\t"		\
		"mov $0xa, %%eax; cpuid;\n\t"		\
		"mov $0xa, %%eax; cpuid;\n\t"		\
		"1: mov %%edi, %%ecx; \n\t"		\
		"xor %%eax, %%eax; \n\t"		\
		"xor %%edx, %%edx;\n\t"			\
		_wrmsr "\n\t"				\
		:					\
		: "a"(eax), "d"(edx), "c"(ecx)		\
		: "ebx", "edi");			\
} while (0)

typedef struct {
	uint32_t ctr;
	uint32_t idx;
	uint64_t config;
	uint64_t count;
} pmu_counter_t;

struct pmu_event {
	const char *name;
	uint32_t unit_sel;
	int min;
	int max;
} intel_gp_events[] = {
	{"core cycles", 0x003c, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"ref cycles", 0x013c, 1*N, 30*N},
	{"llc references", 0x4f2e, 1, 2*N},
	{"llc misses", 0x412e, 1, 1*N},
	{"branches", 0x00c4, 1*N, 1.1*N},
	{"branch misses", 0x00c5, 1, 0.1*N},
}, amd_gp_events[] = {
	{"core cycles", 0x0076, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"branches", 0x00c2, 1*N, 1.1*N},
	{"branch misses", 0x00c3, 1, 0.1*N},
}, fixed_events[] = {
	{"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
};

/*
 * Event indices into intel_gp_events[]; keep these in sync with the
 * table above.
 */
enum {
	INTEL_INSTRUCTIONS_IDX = 1,
	INTEL_REF_CYCLES_IDX = 2,
	INTEL_LLC_MISSES_IDX = 4,
	INTEL_BRANCHES_IDX = 5,
	INTEL_BRANCH_MISS_IDX = 6,
};

/*
 * Event indices into amd_gp_events[]; keep these in sync with the
 * table above.
 */
enum {
	AMD_INSTRUCTIONS_IDX = 1,
	AMD_BRANCHES_IDX = 2,
	AMD_BRANCH_MISS_IDX = 3,
};

char *buf;

static struct pmu_event *gp_events;
static unsigned int gp_events_size;
static unsigned int fixed_counters_num;

static int has_ibpb(void)
{
	return this_cpu_has(X86_FEATURE_SPEC_CTRL) ||
	       this_cpu_has(X86_FEATURE_AMD_IBPB);
}

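/*
 * __loop() runs the measured workload without touching GLOBAL_CTRL (the
 * _wrmsr1 slot is "nop"); it is used when the CPU has no GLOBAL_CTRL MSR,
 * or when a counter is already running (see check_running_counter_wrmsr()),
 * so enabling/disabling the counters is the caller's responsibility.
 */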
static inline void __loop(void)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = 0;
	u32 eax = 0;
	u32 edx = 0;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("nop", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("nop", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("nop", "nop", "wrmsr");
	else
		_loop_asm("nop", "nop", "nop");
}

/*
 * Enable and disable the counters in a single asm blob to ensure that no
 * other instructions are counted in the window between enabling the
 * counters and actually executing the LOOP_ASM code.  This lets the
 * instructions and branches events be verified against precise counts
 * instead of a rough valid count range.
 */
static inline void __precise_loop(u64 cntrs)
{
	unsigned long tmp, tmp2, tmp3;
	u32 global_ctl = pmu.msr_global_ctl;
	u32 eax = cntrs & (BIT_ULL(32) - 1);
	u32 edx = cntrs >> 32;

	if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb())
		_loop_asm("wrmsr", "clflush (%1)", "wrmsr");
	else if (this_cpu_has(X86_FEATURE_CLFLUSH))
		_loop_asm("wrmsr", "clflush (%1)", "nop");
	else if (has_ibpb())
		_loop_asm("wrmsr", "nop", "wrmsr");
	else
		_loop_asm("wrmsr", "nop", "nop");
}

static inline void loop(u64 cntrs)
{
	if (!this_cpu_has_perf_global_ctrl())
		__loop();
	else
		__precise_loop(cntrs);
}

static void adjust_events_range(struct pmu_event *gp_events,
				int instruction_idx, int branch_idx,
				int branch_miss_idx)
{
	/*
	 * If HW supports the GLOBAL_CTRL MSR, enabling and disabling the PMCs
	 * is done inside __precise_loop().  Thus, the instructions and
	 * branches events can be verified against a precise count instead of
	 * a rough range.
	 *
	 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch
	 * instruction in guest context, which leads to intermittent failures
	 * as the counts will vary depending on how many asynchronous VM-Exits
	 * occur while running the measured code, e.g. if the host takes IRQs.
	 */
	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
		gp_events[instruction_idx].min = LOOP_INSNS;
		gp_events[instruction_idx].max = LOOP_INSNS;
		gp_events[branch_idx].min = LOOP_BRANCHES;
		gp_events[branch_idx].max = LOOP_BRANCHES;
	}

	/*
	 * On CPUs without IBPB support there is no way to force a branch
	 * miss, so the measured branch-miss count may legitimately be 0.
	 * Lower the minimum for the branch-misses event to 0 to avoid a
	 * false positive.
	 */
	if (!has_ibpb())
		gp_events[branch_miss_idx].min = 0;
}

volatile uint64_t irq_received;

static void cnt_overflow(isr_regs_t *regs)
{
	irq_received++;
	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
	apic_write(APIC_EOI, 0);
}

static bool check_irq(void)
{
	int i;
	irq_received = 0;
	sti();
	for (i = 0; i < 100000 && !irq_received; i++)
		asm volatile("pause");
	cli();
	return irq_received;
}

static bool is_gp(pmu_counter_t *evt)
{
	if (!pmu.is_intel)
		return true;

	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
	       evt->ctr >= MSR_IA32_PMC0;
}

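/*
 * Map a counter MSR back to its bit index in GLOBAL_CTRL/GLOBAL_STATUS.
 * With the MSR_F15H_PERF_CTR0 layout the event-select and counter MSRs are
 * interleaved, so consecutive counters sit two MSRs apart, hence the
 * divide by 2.
 */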
static int event_to_global_idx(pmu_counter_t *cnt)
{
	if (pmu.is_intel)
		return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
			(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));

	if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0)
		return (cnt->ctr - pmu.msr_gp_counter_base) / 2;
	else
		return cnt->ctr - pmu.msr_gp_counter_base;
}

static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
{
	if (is_gp(cnt)) {
		int i;

		for (i = 0; i < gp_events_size; i++)
			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
				return &gp_events[i];
	} else {
		unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;

		if (idx < ARRAY_SIZE(fixed_events))
			return &fixed_events[idx];
	}

	return (void*)0;
}

static void global_enable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	cnt->idx = event_to_global_idx(cnt);
	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
}

static void global_disable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
}

static void __start_event(pmu_counter_t *evt, uint64_t count)
{
	evt->count = count;
	wrmsr(evt->ctr, evt->count);
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config | EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		uint32_t usrospmi = 0;

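		/*
		 * FIXED_CTR_CTRL has a 4-bit control field per fixed counter:
		 * bit 0 enables counting in ring 0 (OS), bit 1 in ring 3
		 * (USR), and bit 3 requests a PMI on overflow.
		 */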
		if (evt->config & EVNTSEL_OS)
			usrospmi |= (1 << 0);
		if (evt->config & EVNTSEL_USR)
			usrospmi |= (1 << 1);
		if (evt->config & EVNTSEL_INT)
			usrospmi |= (1 << 3); // PMI on overflow
		ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
	}
	apic_write(APIC_LVTPC, PMI_VECTOR);
}

static void start_event(pmu_counter_t *evt)
{
	__start_event(evt, 0);
	global_enable(evt);
}

static void __stop_event(pmu_counter_t *evt)
{
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config & ~EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
	}
	evt->count = rdmsr(evt->ctr);
}

static void stop_event(pmu_counter_t *evt)
{
	global_disable(evt);
	__stop_event(evt);
}

static noinline void measure_many(pmu_counter_t *evt, int count)
{
	int i;
	u64 cntrs = 0;

	for (i = 0; i < count; i++) {
		__start_event(&evt[i], 0);
		cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
	}
	loop(cntrs);
	for (i = 0; i < count; i++)
		__stop_event(&evt[i]);
}

static void measure_one(pmu_counter_t *evt)
{
	measure_many(evt, 1);
}

static noinline void __measure(pmu_counter_t *evt, uint64_t count)
{
	u64 cntrs = BIT_ULL(event_to_global_idx(evt));

	__start_event(evt, count);
	loop(cntrs);
	__stop_event(evt);
}

static bool verify_event(uint64_t count, struct pmu_event *e)
{
	bool pass;

	if (!e)
		return false;

	pass = count >= e->min && count <= e->max;
	if (!pass)
		printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max);

	return pass;
}

static bool verify_counter(pmu_counter_t *cnt)
{
	return verify_event(cnt->count, get_counter_event(cnt));
}

static void check_gp_counter(struct pmu_event *evt)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
	};
	int i;

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);
		measure_one(&cnt);
		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
	}
}

static void check_gp_counters(void)
{
	int i;

	for (i = 0; i < gp_events_size; i++)
		if (pmu_arch_event_is_available(i))
			check_gp_counter(&gp_events[i]);
		else
			printf("GP event '%s' is disabled\n",
			       gp_events[i].name);
}

static void check_fixed_counters(void)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR,
	};
	int i;

	for (i = 0; i < fixed_counters_num; i++) {
		cnt.ctr = fixed_events[i].unit_sel;
		measure_one(&cnt);
		report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i);
	}
}

static struct pmu_event *get_one_event(int idx)
{
	int i;

	if (pmu_arch_event_is_available(idx))
		return &gp_events[idx % gp_events_size];

	for (i = 0; i < gp_events_size; i++) {
		if (pmu_arch_event_is_available(i))
			return &gp_events[i];
	}

	return NULL;
}

static void check_counters_many(void)
{
	struct pmu_event *evt;
	pmu_counter_t cnt[48];
	int i, n;

	for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) {
		evt = get_one_event(i);
		if (!evt)
			continue;

		cnt[n].ctr = MSR_GP_COUNTERx(n);
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel;
		n++;
	}
	for (i = 0; i < fixed_counters_num; i++) {
		cnt[n].ctr = fixed_events[i].unit_sel;
		cnt[n].config = EVNTSEL_OS | EVNTSEL_USR;
		n++;
	}

	assert(n <= ARRAY_SIZE(cnt));
	measure_many(cnt, n);

	for (i = 0; i < n; i++)
		if (!verify_counter(&cnt[i]))
			break;

	report(i == n, "all counters");
}

static uint64_t measure_for_overflow(pmu_counter_t *cnt)
{
	__measure(cnt, 0);
	/*
	 * To generate overflow, i.e. roll over to '0', the initial count just
	 * needs to be preset to the negative expected count.  However, as per
	 * Intel's SDM, the preset count needs to be incremented by 1 to ensure
	 * the overflow interrupt is generated immediately instead of possibly
	 * waiting for the overflow to propagate through the counter.
	 */
	assert(cnt->count > 1);
	return 1 - cnt->count;
}

static void check_counter_overflow(void)
{
	int i;
	uint64_t overflow_preset;
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	overflow_preset = measure_for_overflow(&cnt);

	/* clear status before test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	report_prefix_push("overflow");

	for (i = 0; i < pmu.nr_gp_counters + 1; i++) {
		uint64_t status;
		int idx;

		cnt.count = overflow_preset;
		if (pmu_use_full_writes())
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;

		if (i == pmu.nr_gp_counters) {
			if (!pmu.is_intel)
				break;

			cnt.ctr = fixed_events[0].unit_sel;
			cnt.count = measure_for_overflow(&cnt);
			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
		} else {
			cnt.ctr = MSR_GP_COUNTERx(i);
		}

		if (i % 2)
			cnt.config |= EVNTSEL_INT;
		else
			cnt.config &= ~EVNTSEL_INT;
		idx = event_to_global_idx(&cnt);
		__measure(&cnt, cnt.count);
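		/*
		 * The counter was preset to (1 - expected count), so after
		 * re-running the same workload it should have wrapped and,
		 * with the precise Intel loop, read exactly 1.  AMD counts
		 * are not deterministic, so accept either a value just below
		 * the wrap point or a small post-wrap value.
		 */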
		if (pmu.is_intel)
			report(cnt.count == 1, "cntr-%d", i);
		else
			report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i);

		if (!this_cpu_has_perf_global_status())
			continue;

		status = rdmsr(pmu.msr_global_status);
		report(status & (1ull << idx), "status-%d", i);
		wrmsr(pmu.msr_global_status_clr, status);
		status = rdmsr(pmu.msr_global_status);
		report(!(status & (1ull << idx)), "status clear-%d", i);
		report(check_irq() == (i % 2), "irq-%d", i);
	}

	report_prefix_pop();
}

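/*
 * With a counter mask (CMASK) of 2, the counter only increments on cycles
 * in which at least two instructions retire, so the result must come in
 * well below the raw instruction count.
 */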
static void check_gp_counter_cmask(void)
{
	int instruction_idx = pmu.is_intel ?
			      INTEL_INSTRUCTIONS_IDX :
			      AMD_INSTRUCTIONS_IDX;

	pmu_counter_t cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel /* instructions */,
	};
	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
	measure_one(&cnt);
	report(cnt.count < gp_events[instruction_idx].min, "cmask");
}

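/*
 * RDPMC with bit 31 of ECX set requests the "fast" read mode, which (where
 * supported) returns only the low 32 bits of the counter; bit 30 selects the
 * fixed-counter range.  If fast mode is unsupported, RDPMC raises #GP and
 * the caller reports the check as skipped.
 */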
static void do_rdpmc_fast(void *ptr)
{
	pmu_counter_t *cnt = ptr;
	uint32_t idx = (uint32_t)cnt->idx | (1u << 31);

	if (!is_gp(cnt))
		idx |= 1 << 30;

	cnt->count = rdpmc(idx);
}


static void check_rdpmc(void)
{
	uint64_t val = 0xff0123456789ull;
	bool exc;
	int i;

	report_prefix_push("rdpmc");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		uint64_t x;
		pmu_counter_t cnt = {
			.ctr = MSR_GP_COUNTERx(i),
			.idx = i
		};

		/*
		 * Without full-width writes, only the low 32 bits are writable,
		 * and the value is sign-extended.
		 */
		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
			x = (uint64_t)(int64_t)(int32_t)val;
		else
			x = (uint64_t)(int64_t)val;

		/* Mask according to the number of supported bits */
		x &= (1ull << pmu.gp_counter_width) - 1;

		wrmsr(MSR_GP_COUNTERx(i), val);
		report(rdpmc(i) == x, "cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fast-%d", i);
		else
			report(cnt.count == (u32)val, "fast-%d", i);
	}
	for (i = 0; i < fixed_counters_num; i++) {
		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
		pmu_counter_t cnt = {
			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
			.idx = i
		};

		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fixed fast-%d", i);
		else
			report(cnt.count == (u32)x, "fixed fast-%d", i);
	}

	report_prefix_pop();
}

static void check_running_counter_wrmsr(void)
{
	uint64_t status;
	uint64_t count;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;

	pmu_counter_t evt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  gp_events[instruction_idx].unit_sel,
	};

	report_prefix_push("running counter wrmsr");

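	/*
	 * Zero the counter while it is running; the final count must then be
	 * well below the normal minimum for a single __loop() pass, proving
	 * that the WRMSR took effect on a live counter.
	 */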
	start_event(&evt);
	__loop();
	wrmsr(MSR_GP_COUNTERx(0), 0);
	stop_event(&evt);
	report(evt.count < gp_events[instruction_idx].min, "cntr");

	/* clear status before overflow test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	start_event(&evt);

	count = -1;
	if (pmu_use_full_writes())
		count &= (1ull << pmu.gp_counter_width) - 1;

	wrmsr(MSR_GP_COUNTERx(0), count);

	__loop();
	stop_event(&evt);

	if (this_cpu_has_perf_global_status()) {
		status = rdmsr(pmu.msr_global_status);
		report(status & 1, "status msr bit");
	}

	report_prefix_pop();
}

static void check_emulated_instr(void)
{
	u32 eax, edx, ecx;
	uint64_t status, instr_start, brnch_start;
	uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1;
	unsigned int branch_idx = pmu.is_intel ?
				  INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX;
	unsigned int instruction_idx = pmu.is_intel ?
				       INTEL_INSTRUCTIONS_IDX :
				       AMD_INSTRUCTIONS_IDX;

	pmu_counter_t brnch_cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		/* branch instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel,
	};
	pmu_counter_t instr_cnt = {
		.ctr = MSR_GP_COUNTERx(1),
		/* instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
	};
	report_prefix_push("emulated instruction");

	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	__start_event(&brnch_cnt, 0);
	__start_event(&instr_cnt, 0);

	brnch_start = -KVM_FEP_BRANCHES;
	instr_start = -KVM_FEP_INSNS;
	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);

	if (this_cpu_has_perf_global_ctrl()) {
		eax = BIT(0) | BIT(1);
		ecx = pmu.msr_global_ctl;
		edx = 0;
		kvm_fep_asm("wrmsr");
	} else {
		eax = ecx = edx = 0;
		kvm_fep_asm("nop");
	}

	__stop_event(&brnch_cnt);
	__stop_event(&instr_cnt);

	// Check that the end count - start count matches the expected number
	// of instructions and branches exactly when the counters were enabled
	// precisely via GLOBAL_CTRL, and is at least the expected number
	// otherwise.
	if (this_cpu_has_perf_global_ctrl()) {
		report(instr_cnt.count - instr_start == KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES,
		       "branch count");
	} else {
		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS,
		       "instruction count");
		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES,
		       "branch count");
	}

	if (this_cpu_has_perf_global_status()) {
		// Additionally check that those counters overflowed properly.
		status = rdmsr(pmu.msr_global_status);
		report(status & BIT_ULL(0), "branch counter overflow");
		report(status & BIT_ULL(1), "instruction counter overflow");
	}

	report_prefix_pop();
}

#define XBEGIN_STARTED (~0u)
static void check_tsx_cycles(void)
{
	pmu_counter_t cnt;
	unsigned int i, ret = 0;

	if (!this_cpu_has(X86_FEATURE_RTM))
		return;

	report_prefix_push("TSX cycles");

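	/*
	 * Both configs program event 0x3c (unhalted core cycles) qualified
	 * with EVNTSEL bit 32 (IN_TX); on counter 2, bit 33 (IN_TXCP) is set
	 * as well, which is only supported on general-purpose counter 2.
	 */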
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);

		if (i == 2) {
			/* Transactional cycles committed only on gp counter 2 */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c;
		} else {
			/* Transactional cycles */
			cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c;
		}

		start_event(&cnt);

		asm volatile("xbegin 1f\n\t"
			     "1:\n\t"
			     : "+a" (ret) :: "memory");

		/* Generate a non-canonical #GP to trigger ABORT. */
		if (ret == XBEGIN_STARTED)
			*(int *)NONCANONICAL = 0;

		stop_event(&cnt);

		report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count);
	}

	report_prefix_pop();
}

static void warm_up(void)
{
	int i;

	/*
	 * Since the cycles event is always run as the first event, cold
	 * caches during the first iterations can push the measured cycle
	 * count past the pre-defined upper boundary and cause a false
	 * positive.  To avoid this, run a short warm-up loop before the
	 * real verification.
	 */
	for (i = 0; i < 10; i++)
		loop(0);
}

static void check_counters(void)
{
	if (is_fep_available)
		check_emulated_instr();

	warm_up();
	check_gp_counters();
	check_fixed_counters();
	check_rdpmc();
	check_counters_many();
	check_counter_overflow();
	check_gp_counter_cmask();
	check_running_counter_wrmsr();
	check_tsx_cycles();
}

static void do_unsupported_width_counter_write(void *index)
{
	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
}

static void check_gp_counters_write_width(void)
{
	u64 val_64 = 0xffffff0123456789ull;
	u64 val_32 = val_64 & ((1ull << 32) - 1);
	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
	int i;

	/*
	 * MSR_IA32_PERFCTRn supports 64-bit writes,
	 * but only the lowest 32 bits are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
	}

	/*
	 * MSR_IA32_PMCn supports writing values up to GP counter width,
	 * and only the lowest bits of GP counter width are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PMC0 + i, val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);

		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);

		report(test_for_exception(GP_VECTOR,
			do_unsupported_width_counter_write, &i),
		       "writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
	}
}

/*
 * Per the SDM, reference cycles are currently implemented using the
 * core crystal clock, TSC, or bus clock.  Calibrate to the TSC
 * frequency to set reasonable expectations.
 */
static void set_ref_cycle_expectations(void)
{
	pmu_counter_t cnt = {
		.ctr = MSR_IA32_PERFCTR0,
		.config = EVNTSEL_OS | EVNTSEL_USR |
			  intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel,
	};
	uint64_t tsc_delta;
	uint64_t t0, t1, t2, t3;

	/* Bit 2 enumerates the availability of reference cycles events. */
	if (!pmu.nr_gp_counters || !pmu_arch_event_is_available(2))
		return;

	t0 = fenced_rdtsc();
	start_event(&cnt);
	t1 = fenced_rdtsc();

	/*
	 * This loop has to run long enough to dominate the VM-exit
	 * costs for playing with the PMU MSRs on start and stop.
	 *
	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
	 * the core crystal clock, this function calculated a guest
	 * TSC : ref cycles ratio of around 105 with ECX initialized
	 * to one billion.
	 */
	asm volatile("loop ." : "+c"((int){1000000000ull}));

	t2 = fenced_rdtsc();
	stop_event(&cnt);
	t3 = fenced_rdtsc();

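	/*
	 * Use the average of the inner (t2 - t1) and outer (t3 - t0) windows
	 * as the TSC time spanned by the measured ref-cycles count, splitting
	 * the difference on the MSR-access overhead at start/stop.
	 */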
	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;

	if (!tsc_delta)
		return;

	intel_gp_events[INTEL_REF_CYCLES_IDX].min =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta;
	intel_gp_events[INTEL_REF_CYCLES_IDX].max =
		(intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta;
}

static void check_invalid_rdpmc_gp(void)
{
	uint64_t val;

	report(rdpmc_safe(64, &val) == GP_VECTOR,
	       "Expected #GP on RDPMC(64)");
}

int main(int ac, char **av)
{
	int instruction_idx;
	int branch_idx;
	int branch_miss_idx;

	setup_vm();
	handle_irq(PMI_VECTOR, cnt_overflow);
	buf = malloc(N*64);

	if (this_cpu_has_perf_global_ctrl())
		wrmsr(pmu.msr_global_ctl, 0);

	check_invalid_rdpmc_gp();

	if (pmu.is_intel) {
		if (!pmu.version) {
			report_skip("No Intel Arch PMU is detected!");
			return report_summary();
		}
		gp_events = (struct pmu_event *)intel_gp_events;
		gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]);
		instruction_idx = INTEL_INSTRUCTIONS_IDX;
		branch_idx = INTEL_BRANCHES_IDX;
		branch_miss_idx = INTEL_BRANCH_MISS_IDX;

		/*
		 * For legacy Intel CPUs without clflush/clflushopt support,
		 * there is no way to force an LLC miss, thus set the minimum
		 * value to 0 to avoid false positives.
		 */
		if (!this_cpu_has(X86_FEATURE_CLFLUSH))
			gp_events[INTEL_LLC_MISSES_IDX].min = 0;

		report_prefix_push("Intel");
		set_ref_cycle_expectations();
	} else {
		gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]);
		gp_events = (struct pmu_event *)amd_gp_events;
		instruction_idx = AMD_INSTRUCTIONS_IDX;
		branch_idx = AMD_BRANCHES_IDX;
		branch_miss_idx = AMD_BRANCH_MISS_IDX;
		report_prefix_push("AMD");
	}
	adjust_events_range(gp_events, instruction_idx, branch_idx, branch_miss_idx);

	printf("PMU version: %d\n", pmu.version);
	printf("GP counters: %d\n", pmu.nr_gp_counters);
	printf("GP counter width: %d\n", pmu.gp_counter_width);
	printf("Event Mask length: %d\n", pmu.arch_event_mask_length);
	printf("Arch Events (mask): 0x%x\n", pmu.arch_event_available);
	printf("Fixed counters: %d\n", pmu.nr_fixed_counters);
	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);

	fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events));
	if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events))
		report_info("Fixed counters number %d > defined fixed events %u. "
			    "Please update test case.", pmu.nr_fixed_counters,
			    (unsigned)ARRAY_SIZE(fixed_events));

	apic_write(APIC_LVTPC, PMI_VECTOR);

	check_counters();

	if (pmu_has_full_writes()) {
		pmu.msr_gp_counter_base = MSR_IA32_PMC0;

		report_prefix_push("full-width writes");
		check_counters();
		check_gp_counters_write_width();
		report_prefix_pop();
	}

	if (!pmu.is_intel) {
		report_prefix_push("K7");
		pmu.nr_gp_counters = AMD64_NUM_COUNTERS;
		pmu.msr_gp_counter_base = MSR_K7_PERFCTR0;
		pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0;
		check_counters();
		report_prefix_pop();
	}

	return report_summary();
}