1 2 #include "x86/msr.h" 3 #include "x86/processor.h" 4 #include "x86/pmu.h" 5 #include "x86/apic-defs.h" 6 #include "x86/apic.h" 7 #include "x86/desc.h" 8 #include "x86/isr.h" 9 #include "vmalloc.h" 10 #include "alloc.h" 11 12 #include "libcflat.h" 13 #include <stdint.h> 14 15 #define N 1000000 16 17 #define IBPB_JMP_INSNS 9 18 #define IBPB_JMP_BRANCHES 2 19 20 #if defined(__i386__) || defined(_M_IX86) /* i386 */ 21 #define IBPB_JMP_ASM(_wrmsr) \ 22 "mov $1, %%eax; xor %%edx, %%edx;\n\t" \ 23 "mov $73, %%ecx;\n\t" \ 24 _wrmsr "\n\t" \ 25 "call 1f\n\t" \ 26 "1: pop %%eax\n\t" \ 27 "add $(2f-1b), %%eax\n\t" \ 28 "jmp *%%eax;\n\t" \ 29 "nop;\n\t" \ 30 "2: nop;\n\t" 31 #else /* x86_64 */ 32 #define IBPB_JMP_ASM(_wrmsr) \ 33 "mov $1, %%eax; xor %%edx, %%edx;\n\t" \ 34 "mov $73, %%ecx;\n\t" \ 35 _wrmsr "\n\t" \ 36 "call 1f\n\t" \ 37 "1: pop %%rax\n\t" \ 38 "add $(2f-1b), %%rax\n\t" \ 39 "jmp *%%rax;\n\t" \ 40 "nop;\n\t" \ 41 "2: nop;\n\t" 42 #endif 43 44 /* GLOBAL_CTRL enable + disable + clflush/mfence + IBPB_JMP */ 45 #define EXTRA_INSNS (3 + 3 + 2 + IBPB_JMP_INSNS) 46 #define LOOP_INSNS (N * 10 + EXTRA_INSNS) 47 #define LOOP_BRANCHES (N + IBPB_JMP_BRANCHES) 48 #define LOOP_ASM(_wrmsr1, _clflush, _wrmsr2) \ 49 _wrmsr1 "\n\t" \ 50 "mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t" \ 51 _clflush "\n\t" \ 52 "mfence;\n\t" \ 53 "1: mov (%1), %2; add $64, %1;\n\t" \ 54 "nop; nop; nop; nop; nop; nop; nop;\n\t" \ 55 "loop 1b;\n\t" \ 56 IBPB_JMP_ASM(_wrmsr2) \ 57 "mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t" \ 58 _wrmsr1 "\n\t" 59 60 #define _loop_asm(_wrmsr1, _clflush, _wrmsr2) \ 61 do { \ 62 asm volatile(LOOP_ASM(_wrmsr1, _clflush, _wrmsr2) \ 63 : "=b"(tmp), "=r"(tmp2), "=r"(tmp3) \ 64 : "a"(eax), "d"(edx), "c"(global_ctl), \ 65 "0"(N), "1"(buf) \ 66 : "edi"); \ 67 } while (0) 68 69 /* the number of instructions and branches of the kvm_fep_asm() blob */ 70 #define KVM_FEP_INSNS 22 71 #define KVM_FEP_BRANCHES 5 72 73 /* 74 * KVM_FEP is a magic prefix that forces emulation so 75 * 'KVM_FEP "jne label\n"' just counts as a single instruction. 76 */ 77 #define kvm_fep_asm(_wrmsr) \ 78 do { \ 79 asm volatile( \ 80 _wrmsr "\n\t" \ 81 "mov %%ecx, %%edi;\n\t" \ 82 "mov $0x0, %%eax;\n\t" \ 83 "cmp $0x0, %%eax;\n\t" \ 84 KVM_FEP "jne 1f\n\t" \ 85 KVM_FEP "jne 1f\n\t" \ 86 KVM_FEP "jne 1f\n\t" \ 87 KVM_FEP "jne 1f\n\t" \ 88 KVM_FEP "jne 1f\n\t" \ 89 "mov $0xa, %%eax; cpuid;\n\t" \ 90 "mov $0xa, %%eax; cpuid;\n\t" \ 91 "mov $0xa, %%eax; cpuid;\n\t" \ 92 "mov $0xa, %%eax; cpuid;\n\t" \ 93 "mov $0xa, %%eax; cpuid;\n\t" \ 94 "1: mov %%edi, %%ecx; \n\t" \ 95 "xor %%eax, %%eax; \n\t" \ 96 "xor %%edx, %%edx;\n\t" \ 97 _wrmsr "\n\t" \ 98 : \ 99 : "a"(eax), "d"(edx), "c"(ecx) \ 100 : "ebx", "edi"); \ 101 } while (0) 102 103 typedef struct { 104 uint32_t ctr; 105 uint32_t idx; 106 uint64_t config; 107 uint64_t count; 108 } pmu_counter_t; 109 110 struct pmu_event { 111 const char *name; 112 uint32_t unit_sel; 113 int min; 114 int max; 115 } intel_gp_events[] = { 116 {"core cycles", 0x003c, 1*N, 50*N}, 117 {"instructions", 0x00c0, 10*N, 10.2*N}, 118 {"ref cycles", 0x013c, 1*N, 30*N}, 119 {"llc references", 0x4f2e, 1, 2*N}, 120 {"llc misses", 0x412e, 1, 1*N}, 121 {"branches", 0x00c4, 1*N, 1.1*N}, 122 {"branch misses", 0x00c5, 1, 0.1*N}, 123 }, amd_gp_events[] = { 124 {"core cycles", 0x0076, 1*N, 50*N}, 125 {"instructions", 0x00c0, 10*N, 10.2*N}, 126 {"branches", 0x00c2, 1*N, 1.1*N}, 127 {"branch misses", 0x00c3, 1, 0.1*N}, 128 }, fixed_events[] = { 129 {"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N}, 130 {"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N}, 131 {"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N} 132 }; 133 134 /* 135 * Events index in intel_gp_events[], ensure consistent with 136 * intel_gp_events[]. 137 */ 138 enum { 139 INTEL_INSTRUCTIONS_IDX = 1, 140 INTEL_REF_CYCLES_IDX = 2, 141 INTEL_LLC_MISSES_IDX = 4, 142 INTEL_BRANCHES_IDX = 5, 143 INTEL_BRANCH_MISS_IDX = 6, 144 }; 145 146 /* 147 * Events index in amd_gp_events[], ensure consistent with 148 * amd_gp_events[]. 149 */ 150 enum { 151 AMD_INSTRUCTIONS_IDX = 1, 152 AMD_BRANCHES_IDX = 2, 153 AMD_BRANCH_MISS_IDX = 3, 154 }; 155 156 char *buf; 157 158 static struct pmu_event *gp_events; 159 static unsigned int gp_events_size; 160 static unsigned int fixed_counters_num; 161 162 static int has_ibpb(void) 163 { 164 return this_cpu_has(X86_FEATURE_SPEC_CTRL) || 165 this_cpu_has(X86_FEATURE_AMD_IBPB); 166 } 167 168 static inline void __loop(void) 169 { 170 unsigned long tmp, tmp2, tmp3; 171 u32 global_ctl = 0; 172 u32 eax = 0; 173 u32 edx = 0; 174 175 if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb()) 176 _loop_asm("nop", "clflush (%1)", "wrmsr"); 177 else if (this_cpu_has(X86_FEATURE_CLFLUSH)) 178 _loop_asm("nop", "clflush (%1)", "nop"); 179 else if (has_ibpb()) 180 _loop_asm("nop", "nop", "wrmsr"); 181 else 182 _loop_asm("nop", "nop", "nop"); 183 } 184 185 /* 186 * Enable and disable counters in a whole asm blob to ensure 187 * no other instructions are counted in the window between 188 * counters enabling and really LOOP_ASM code executing. 189 * Thus counters can verify instructions and branches events 190 * against precise counts instead of a rough valid count range. 191 */ 192 static inline void __precise_loop(u64 cntrs) 193 { 194 unsigned long tmp, tmp2, tmp3; 195 u32 global_ctl = pmu.msr_global_ctl; 196 u32 eax = cntrs & (BIT_ULL(32) - 1); 197 u32 edx = cntrs >> 32; 198 199 if (this_cpu_has(X86_FEATURE_CLFLUSH) && has_ibpb()) 200 _loop_asm("wrmsr", "clflush (%1)", "wrmsr"); 201 else if (this_cpu_has(X86_FEATURE_CLFLUSH)) 202 _loop_asm("wrmsr", "clflush (%1)", "nop"); 203 else if (has_ibpb()) 204 _loop_asm("wrmsr", "nop", "wrmsr"); 205 else 206 _loop_asm("wrmsr", "nop", "nop"); 207 } 208 209 static inline void loop(u64 cntrs) 210 { 211 if (!this_cpu_has_perf_global_ctrl()) 212 __loop(); 213 else 214 __precise_loop(cntrs); 215 } 216 217 static void adjust_events_range(struct pmu_event *gp_events, 218 int instruction_idx, int branch_idx, 219 int branch_miss_idx) 220 { 221 /* 222 * If HW supports GLOBAL_CTRL MSR, enabling and disabling PMCs are 223 * moved in __precise_loop(). Thus, instructions and branches events 224 * can be verified against a precise count instead of a rough range. 225 * 226 * Skip the precise checks on AMD, as AMD CPUs count VMRUN as a branch 227 * instruction in guest context, which* leads to intermittent failures 228 * as the counts will vary depending on how many asynchronous VM-Exits 229 * occur while running the measured code, e.g. if the host takes IRQs. 230 */ 231 if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) { 232 gp_events[instruction_idx].min = LOOP_INSNS; 233 gp_events[instruction_idx].max = LOOP_INSNS; 234 gp_events[branch_idx].min = LOOP_BRANCHES; 235 gp_events[branch_idx].max = LOOP_BRANCHES; 236 } 237 238 /* 239 * For CPUs without IBPB support, no way to force to trigger a branch 240 * miss and the measured branch misses is possible to be 0. Thus 241 * overwrite the lower boundary of branch misses event to 0 to avoid 242 * false positive. 243 */ 244 if (!has_ibpb()) 245 gp_events[branch_miss_idx].min = 0; 246 } 247 248 volatile uint64_t irq_received; 249 250 static void cnt_overflow(isr_regs_t *regs) 251 { 252 irq_received++; 253 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 254 apic_write(APIC_EOI, 0); 255 } 256 257 static bool check_irq(void) 258 { 259 int i; 260 irq_received = 0; 261 sti(); 262 for (i = 0; i < 100000 && !irq_received; i++) 263 asm volatile("pause"); 264 cli(); 265 return irq_received; 266 } 267 268 static bool is_gp(pmu_counter_t *evt) 269 { 270 if (!pmu.is_intel) 271 return true; 272 273 return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 || 274 evt->ctr >= MSR_IA32_PMC0; 275 } 276 277 static int event_to_global_idx(pmu_counter_t *cnt) 278 { 279 if (pmu.is_intel) 280 return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base : 281 (MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX)); 282 283 if (pmu.msr_gp_counter_base == MSR_F15H_PERF_CTR0) 284 return (cnt->ctr - pmu.msr_gp_counter_base) / 2; 285 else 286 return cnt->ctr - pmu.msr_gp_counter_base; 287 } 288 289 static struct pmu_event* get_counter_event(pmu_counter_t *cnt) 290 { 291 if (is_gp(cnt)) { 292 int i; 293 294 for (i = 0; i < gp_events_size; i++) 295 if (gp_events[i].unit_sel == (cnt->config & 0xffff)) 296 return &gp_events[i]; 297 } else { 298 unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0; 299 300 if (idx < ARRAY_SIZE(fixed_events)) 301 return &fixed_events[idx]; 302 } 303 304 return (void*)0; 305 } 306 307 static void global_enable(pmu_counter_t *cnt) 308 { 309 if (!this_cpu_has_perf_global_ctrl()) 310 return; 311 312 cnt->idx = event_to_global_idx(cnt); 313 wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx)); 314 } 315 316 static void global_disable(pmu_counter_t *cnt) 317 { 318 if (!this_cpu_has_perf_global_ctrl()) 319 return; 320 321 wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx)); 322 } 323 324 static void __start_event(pmu_counter_t *evt, uint64_t count) 325 { 326 evt->count = count; 327 wrmsr(evt->ctr, evt->count); 328 if (is_gp(evt)) { 329 wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)), 330 evt->config | EVNTSEL_EN); 331 } else { 332 uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL); 333 int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4; 334 uint32_t usrospmi = 0; 335 336 if (evt->config & EVNTSEL_OS) 337 usrospmi |= (1 << 0); 338 if (evt->config & EVNTSEL_USR) 339 usrospmi |= (1 << 1); 340 if (evt->config & EVNTSEL_INT) 341 usrospmi |= (1 << 3); // PMI on overflow 342 ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift); 343 wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl); 344 } 345 apic_write(APIC_LVTPC, PMI_VECTOR); 346 } 347 348 static void start_event(pmu_counter_t *evt) 349 { 350 __start_event(evt, 0); 351 global_enable(evt); 352 } 353 354 static void __stop_event(pmu_counter_t *evt) 355 { 356 if (is_gp(evt)) { 357 wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)), 358 evt->config & ~EVNTSEL_EN); 359 } else { 360 uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL); 361 int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4; 362 wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift)); 363 } 364 evt->count = rdmsr(evt->ctr); 365 } 366 367 static void stop_event(pmu_counter_t *evt) 368 { 369 global_disable(evt); 370 __stop_event(evt); 371 } 372 373 static noinline void measure_many(pmu_counter_t *evt, int count) 374 { 375 int i; 376 u64 cntrs = 0; 377 378 for (i = 0; i < count; i++) { 379 __start_event(&evt[i], 0); 380 cntrs |= BIT_ULL(event_to_global_idx(&evt[i])); 381 } 382 loop(cntrs); 383 for (i = 0; i < count; i++) 384 __stop_event(&evt[i]); 385 } 386 387 static void measure_one(pmu_counter_t *evt) 388 { 389 measure_many(evt, 1); 390 } 391 392 static noinline void __measure(pmu_counter_t *evt, uint64_t count) 393 { 394 u64 cntrs = BIT_ULL(event_to_global_idx(evt)); 395 396 __start_event(evt, count); 397 loop(cntrs); 398 __stop_event(evt); 399 } 400 401 static bool verify_event(uint64_t count, struct pmu_event *e) 402 { 403 bool pass; 404 405 if (!e) 406 return false; 407 408 pass = count >= e->min && count <= e->max; 409 if (!pass) 410 printf("FAIL: %d <= %"PRId64" <= %d\n", e->min, count, e->max); 411 412 return pass; 413 } 414 415 static bool verify_counter(pmu_counter_t *cnt) 416 { 417 return verify_event(cnt->count, get_counter_event(cnt)); 418 } 419 420 static void check_gp_counter(struct pmu_event *evt) 421 { 422 pmu_counter_t cnt = { 423 .config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel, 424 }; 425 int i; 426 427 for (i = 0; i < pmu.nr_gp_counters; i++) { 428 cnt.ctr = MSR_GP_COUNTERx(i); 429 measure_one(&cnt); 430 report(verify_event(cnt.count, evt), "%s-%d", evt->name, i); 431 } 432 } 433 434 static void check_gp_counters(void) 435 { 436 int i; 437 438 for (i = 0; i < gp_events_size; i++) 439 if (pmu_arch_event_is_available(i)) 440 check_gp_counter(&gp_events[i]); 441 else 442 printf("GP event '%s' is disabled\n", 443 gp_events[i].name); 444 } 445 446 static void check_fixed_counters(void) 447 { 448 pmu_counter_t cnt = { 449 .config = EVNTSEL_OS | EVNTSEL_USR, 450 }; 451 int i; 452 453 for (i = 0; i < fixed_counters_num; i++) { 454 cnt.ctr = fixed_events[i].unit_sel; 455 measure_one(&cnt); 456 report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i); 457 } 458 } 459 460 static struct pmu_event *get_one_event(int idx) 461 { 462 int i; 463 464 if (pmu_arch_event_is_available(idx)) 465 return &gp_events[idx % gp_events_size]; 466 467 for (i = 0; i < gp_events_size; i++) { 468 if (pmu_arch_event_is_available(i)) 469 return &gp_events[i]; 470 } 471 472 return NULL; 473 } 474 475 static void check_counters_many(void) 476 { 477 struct pmu_event *evt; 478 pmu_counter_t cnt[48]; 479 int i, n; 480 481 for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) { 482 evt = get_one_event(i); 483 if (!evt) 484 continue; 485 486 cnt[n].ctr = MSR_GP_COUNTERx(n); 487 cnt[n].config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel; 488 n++; 489 } 490 for (i = 0; i < fixed_counters_num; i++) { 491 cnt[n].ctr = fixed_events[i].unit_sel; 492 cnt[n].config = EVNTSEL_OS | EVNTSEL_USR; 493 n++; 494 } 495 496 assert(n <= ARRAY_SIZE(cnt)); 497 measure_many(cnt, n); 498 499 for (i = 0; i < n; i++) 500 if (!verify_counter(&cnt[i])) 501 break; 502 503 report(i == n, "all counters"); 504 } 505 506 static uint64_t measure_for_overflow(pmu_counter_t *cnt) 507 { 508 __measure(cnt, 0); 509 /* 510 * To generate overflow, i.e. roll over to '0', the initial count just 511 * needs to be preset to the negative expected count. However, as per 512 * Intel's SDM, the preset count needs to be incremented by 1 to ensure 513 * the overflow interrupt is generated immediately instead of possibly 514 * waiting for the overflow to propagate through the counter. 515 */ 516 assert(cnt->count > 1); 517 return 1 - cnt->count; 518 } 519 520 static void check_counter_overflow(void) 521 { 522 int i; 523 uint64_t overflow_preset; 524 int instruction_idx = pmu.is_intel ? 525 INTEL_INSTRUCTIONS_IDX : 526 AMD_INSTRUCTIONS_IDX; 527 528 pmu_counter_t cnt = { 529 .ctr = MSR_GP_COUNTERx(0), 530 .config = EVNTSEL_OS | EVNTSEL_USR | 531 gp_events[instruction_idx].unit_sel /* instructions */, 532 }; 533 overflow_preset = measure_for_overflow(&cnt); 534 535 /* clear status before test */ 536 if (this_cpu_has_perf_global_status()) 537 pmu_clear_global_status(); 538 539 report_prefix_push("overflow"); 540 541 for (i = 0; i < pmu.nr_gp_counters + 1; i++) { 542 uint64_t status; 543 int idx; 544 545 cnt.count = overflow_preset; 546 if (pmu_use_full_writes()) 547 cnt.count &= (1ull << pmu.gp_counter_width) - 1; 548 549 if (i == pmu.nr_gp_counters) { 550 if (!pmu.is_intel) 551 break; 552 553 cnt.ctr = fixed_events[0].unit_sel; 554 cnt.count = measure_for_overflow(&cnt); 555 cnt.count &= (1ull << pmu.gp_counter_width) - 1; 556 } else { 557 cnt.ctr = MSR_GP_COUNTERx(i); 558 } 559 560 if (i % 2) 561 cnt.config |= EVNTSEL_INT; 562 else 563 cnt.config &= ~EVNTSEL_INT; 564 idx = event_to_global_idx(&cnt); 565 __measure(&cnt, cnt.count); 566 if (pmu.is_intel) 567 report(cnt.count == 1, "cntr-%d", i); 568 else 569 report(cnt.count == 0xffffffffffff || cnt.count < 7, "cntr-%d", i); 570 571 if (!this_cpu_has_perf_global_status()) 572 continue; 573 574 status = rdmsr(pmu.msr_global_status); 575 report(status & (1ull << idx), "status-%d", i); 576 wrmsr(pmu.msr_global_status_clr, status); 577 status = rdmsr(pmu.msr_global_status); 578 report(!(status & (1ull << idx)), "status clear-%d", i); 579 report(check_irq() == (i % 2), "irq-%d", i); 580 } 581 582 report_prefix_pop(); 583 } 584 585 static void check_gp_counter_cmask(void) 586 { 587 int instruction_idx = pmu.is_intel ? 588 INTEL_INSTRUCTIONS_IDX : 589 AMD_INSTRUCTIONS_IDX; 590 591 pmu_counter_t cnt = { 592 .ctr = MSR_GP_COUNTERx(0), 593 .config = EVNTSEL_OS | EVNTSEL_USR | 594 gp_events[instruction_idx].unit_sel /* instructions */, 595 }; 596 cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT); 597 measure_one(&cnt); 598 report(cnt.count < gp_events[instruction_idx].min, "cmask"); 599 } 600 601 static void do_rdpmc_fast(void *ptr) 602 { 603 pmu_counter_t *cnt = ptr; 604 uint32_t idx = (uint32_t)cnt->idx | (1u << 31); 605 606 if (!is_gp(cnt)) 607 idx |= 1 << 30; 608 609 cnt->count = rdpmc(idx); 610 } 611 612 613 static void check_rdpmc(void) 614 { 615 uint64_t val = 0xff0123456789ull; 616 bool exc; 617 int i; 618 619 report_prefix_push("rdpmc"); 620 621 for (i = 0; i < pmu.nr_gp_counters; i++) { 622 uint64_t x; 623 pmu_counter_t cnt = { 624 .ctr = MSR_GP_COUNTERx(i), 625 .idx = i 626 }; 627 628 /* 629 * Without full-width writes, only the low 32 bits are writable, 630 * and the value is sign-extended. 631 */ 632 if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0) 633 x = (uint64_t)(int64_t)(int32_t)val; 634 else 635 x = (uint64_t)(int64_t)val; 636 637 /* Mask according to the number of supported bits */ 638 x &= (1ull << pmu.gp_counter_width) - 1; 639 640 wrmsr(MSR_GP_COUNTERx(i), val); 641 report(rdpmc(i) == x, "cntr-%d", i); 642 643 exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt); 644 if (exc) 645 report_skip("fast-%d", i); 646 else 647 report(cnt.count == (u32)val, "fast-%d", i); 648 } 649 for (i = 0; i < fixed_counters_num; i++) { 650 uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1); 651 pmu_counter_t cnt = { 652 .ctr = MSR_CORE_PERF_FIXED_CTR0 + i, 653 .idx = i 654 }; 655 656 wrmsr(MSR_PERF_FIXED_CTRx(i), x); 657 report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i); 658 659 exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt); 660 if (exc) 661 report_skip("fixed fast-%d", i); 662 else 663 report(cnt.count == (u32)x, "fixed fast-%d", i); 664 } 665 666 report_prefix_pop(); 667 } 668 669 static void check_running_counter_wrmsr(void) 670 { 671 uint64_t status; 672 uint64_t count; 673 unsigned int instruction_idx = pmu.is_intel ? 674 INTEL_INSTRUCTIONS_IDX : 675 AMD_INSTRUCTIONS_IDX; 676 677 pmu_counter_t evt = { 678 .ctr = MSR_GP_COUNTERx(0), 679 .config = EVNTSEL_OS | EVNTSEL_USR | 680 gp_events[instruction_idx].unit_sel, 681 }; 682 683 report_prefix_push("running counter wrmsr"); 684 685 start_event(&evt); 686 __loop(); 687 wrmsr(MSR_GP_COUNTERx(0), 0); 688 stop_event(&evt); 689 report(evt.count < gp_events[instruction_idx].min, "cntr"); 690 691 /* clear status before overflow test */ 692 if (this_cpu_has_perf_global_status()) 693 pmu_clear_global_status(); 694 695 start_event(&evt); 696 697 count = -1; 698 if (pmu_use_full_writes()) 699 count &= (1ull << pmu.gp_counter_width) - 1; 700 701 wrmsr(MSR_GP_COUNTERx(0), count); 702 703 __loop(); 704 stop_event(&evt); 705 706 if (this_cpu_has_perf_global_status()) { 707 status = rdmsr(pmu.msr_global_status); 708 report(status & 1, "status msr bit"); 709 } 710 711 report_prefix_pop(); 712 } 713 714 static void check_emulated_instr(void) 715 { 716 u32 eax, edx, ecx; 717 uint64_t status, instr_start, brnch_start; 718 uint64_t gp_counter_width = (1ull << pmu.gp_counter_width) - 1; 719 unsigned int branch_idx = pmu.is_intel ? 720 INTEL_BRANCHES_IDX : AMD_BRANCHES_IDX; 721 unsigned int instruction_idx = pmu.is_intel ? 722 INTEL_INSTRUCTIONS_IDX : 723 AMD_INSTRUCTIONS_IDX; 724 725 pmu_counter_t brnch_cnt = { 726 .ctr = MSR_GP_COUNTERx(0), 727 /* branch instructions */ 728 .config = EVNTSEL_OS | EVNTSEL_USR | gp_events[branch_idx].unit_sel, 729 }; 730 pmu_counter_t instr_cnt = { 731 .ctr = MSR_GP_COUNTERx(1), 732 /* instructions */ 733 .config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel, 734 }; 735 report_prefix_push("emulated instruction"); 736 737 if (this_cpu_has_perf_global_status()) 738 pmu_clear_global_status(); 739 740 __start_event(&brnch_cnt, 0); 741 __start_event(&instr_cnt, 0); 742 743 brnch_start = -KVM_FEP_BRANCHES; 744 instr_start = -KVM_FEP_INSNS; 745 wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width); 746 wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width); 747 748 if (this_cpu_has_perf_global_ctrl()) { 749 eax = BIT(0) | BIT(1); 750 ecx = pmu.msr_global_ctl; 751 edx = 0; 752 kvm_fep_asm("wrmsr"); 753 } else { 754 eax = ecx = edx = 0; 755 kvm_fep_asm("nop"); 756 } 757 758 __stop_event(&brnch_cnt); 759 __stop_event(&instr_cnt); 760 761 // Check that the end count - start count is at least the expected 762 // number of instructions and branches. 763 if (this_cpu_has_perf_global_ctrl()) { 764 report(instr_cnt.count - instr_start == KVM_FEP_INSNS, 765 "instruction count"); 766 report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES, 767 "branch count"); 768 } else { 769 report(instr_cnt.count - instr_start >= KVM_FEP_INSNS, 770 "instruction count"); 771 report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES, 772 "branch count"); 773 } 774 775 if (this_cpu_has_perf_global_status()) { 776 // Additionally check that those counters overflowed properly. 777 status = rdmsr(pmu.msr_global_status); 778 report(status & BIT_ULL(0), "branch counter overflow"); 779 report(status & BIT_ULL(1), "instruction counter overflow"); 780 } 781 782 report_prefix_pop(); 783 } 784 785 #define XBEGIN_STARTED (~0u) 786 static void check_tsx_cycles(void) 787 { 788 pmu_counter_t cnt; 789 unsigned int i, ret = 0; 790 791 if (!this_cpu_has(X86_FEATURE_RTM)) 792 return; 793 794 report_prefix_push("TSX cycles"); 795 796 for (i = 0; i < pmu.nr_gp_counters; i++) { 797 cnt.ctr = MSR_GP_COUNTERx(i); 798 799 if (i == 2) { 800 /* Transactional cycles committed only on gp counter 2 */ 801 cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x30000003c; 802 } else { 803 /* Transactional cycles */ 804 cnt.config = EVNTSEL_OS | EVNTSEL_USR | 0x10000003c; 805 } 806 807 start_event(&cnt); 808 809 asm volatile("xbegin 1f\n\t" 810 "1:\n\t" 811 : "+a" (ret) :: "memory"); 812 813 /* Generate a non-canonical #GP to trigger ABORT. */ 814 if (ret == XBEGIN_STARTED) 815 *(int *)NONCANONICAL = 0; 816 817 stop_event(&cnt); 818 819 report(cnt.count > 0, "gp cntr-%d with a value of %" PRId64 "", i, cnt.count); 820 } 821 822 report_prefix_pop(); 823 } 824 825 static void warm_up(void) 826 { 827 int i; 828 829 /* 830 * Since cycles event is always run as the first event, there would be 831 * a warm-up state to warm up the cache, it leads to the measured cycles 832 * value may exceed the pre-defined cycles upper boundary and cause 833 * false positive. To avoid this, introduce an warm-up state before 834 * the real verification. 835 */ 836 for (i = 0; i < 10; i++) 837 loop(0); 838 } 839 840 static void check_counters(void) 841 { 842 if (is_fep_available) 843 check_emulated_instr(); 844 845 warm_up(); 846 check_gp_counters(); 847 check_fixed_counters(); 848 check_rdpmc(); 849 check_counters_many(); 850 check_counter_overflow(); 851 check_gp_counter_cmask(); 852 check_running_counter_wrmsr(); 853 check_tsx_cycles(); 854 } 855 856 static void do_unsupported_width_counter_write(void *index) 857 { 858 wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull); 859 } 860 861 static void check_gp_counters_write_width(void) 862 { 863 u64 val_64 = 0xffffff0123456789ull; 864 u64 val_32 = val_64 & ((1ull << 32) - 1); 865 u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1); 866 int i; 867 868 /* 869 * MSR_IA32_PERFCTRn supports 64-bit writes, 870 * but only the lowest 32 bits are valid. 871 */ 872 for (i = 0; i < pmu.nr_gp_counters; i++) { 873 wrmsr(MSR_IA32_PERFCTR0 + i, val_32); 874 assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32); 875 assert(rdmsr(MSR_IA32_PMC0 + i) == val_32); 876 877 wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width); 878 assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32); 879 assert(rdmsr(MSR_IA32_PMC0 + i) == val_32); 880 881 wrmsr(MSR_IA32_PERFCTR0 + i, val_64); 882 assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32); 883 assert(rdmsr(MSR_IA32_PMC0 + i) == val_32); 884 } 885 886 /* 887 * MSR_IA32_PMCn supports writing values up to GP counter width, 888 * and only the lowest bits of GP counter width are valid. 889 */ 890 for (i = 0; i < pmu.nr_gp_counters; i++) { 891 wrmsr(MSR_IA32_PMC0 + i, val_32); 892 assert(rdmsr(MSR_IA32_PMC0 + i) == val_32); 893 assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32); 894 895 wrmsr(MSR_IA32_PMC0 + i, val_max_width); 896 assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width); 897 assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width); 898 899 report(test_for_exception(GP_VECTOR, 900 do_unsupported_width_counter_write, &i), 901 "writing unsupported width to MSR_IA32_PMC%d raises #GP", i); 902 } 903 } 904 905 /* 906 * Per the SDM, reference cycles are currently implemented using the 907 * core crystal clock, TSC, or bus clock. Calibrate to the TSC 908 * frequency to set reasonable expectations. 909 */ 910 static void set_ref_cycle_expectations(void) 911 { 912 pmu_counter_t cnt = { 913 .ctr = MSR_IA32_PERFCTR0, 914 .config = EVNTSEL_OS | EVNTSEL_USR | 915 intel_gp_events[INTEL_REF_CYCLES_IDX].unit_sel, 916 }; 917 uint64_t tsc_delta; 918 uint64_t t0, t1, t2, t3; 919 920 /* Bit 2 enumerates the availability of reference cycles events. */ 921 if (!pmu.nr_gp_counters || !pmu_arch_event_is_available(2)) 922 return; 923 924 t0 = fenced_rdtsc(); 925 start_event(&cnt); 926 t1 = fenced_rdtsc(); 927 928 /* 929 * This loop has to run long enough to dominate the VM-exit 930 * costs for playing with the PMU MSRs on start and stop. 931 * 932 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times 933 * the core crystal clock, this function calculated a guest 934 * TSC : ref cycles ratio of around 105 with ECX initialized 935 * to one billion. 936 */ 937 asm volatile("loop ." : "+c"((int){1000000000ull})); 938 939 t2 = fenced_rdtsc(); 940 stop_event(&cnt); 941 t3 = fenced_rdtsc(); 942 943 tsc_delta = ((t2 - t1) + (t3 - t0)) / 2; 944 945 if (!tsc_delta) 946 return; 947 948 intel_gp_events[INTEL_REF_CYCLES_IDX].min = 949 (intel_gp_events[INTEL_REF_CYCLES_IDX].min * cnt.count) / tsc_delta; 950 intel_gp_events[INTEL_REF_CYCLES_IDX].max = 951 (intel_gp_events[INTEL_REF_CYCLES_IDX].max * cnt.count) / tsc_delta; 952 } 953 954 static void check_invalid_rdpmc_gp(void) 955 { 956 uint64_t val; 957 958 report(rdpmc_safe(64, &val) == GP_VECTOR, 959 "Expected #GP on RDPMC(64)"); 960 } 961 962 int main(int ac, char **av) 963 { 964 int instruction_idx; 965 int branch_idx; 966 int branch_miss_idx; 967 968 setup_vm(); 969 handle_irq(PMI_VECTOR, cnt_overflow); 970 buf = malloc(N*64); 971 972 if (this_cpu_has_perf_global_ctrl()) 973 wrmsr(pmu.msr_global_ctl, 0); 974 975 check_invalid_rdpmc_gp(); 976 977 if (pmu.is_intel) { 978 if (!pmu.version) { 979 report_skip("No Intel Arch PMU is detected!"); 980 return report_summary(); 981 } 982 gp_events = (struct pmu_event *)intel_gp_events; 983 gp_events_size = sizeof(intel_gp_events)/sizeof(intel_gp_events[0]); 984 instruction_idx = INTEL_INSTRUCTIONS_IDX; 985 branch_idx = INTEL_BRANCHES_IDX; 986 branch_miss_idx = INTEL_BRANCH_MISS_IDX; 987 988 /* 989 * For legacy Intel CPUS without clflush/clflushopt support, 990 * there is no way to force to trigger a LLC miss, thus set 991 * the minimum value to 0 to avoid false positives. 992 */ 993 if (!this_cpu_has(X86_FEATURE_CLFLUSH)) 994 gp_events[INTEL_LLC_MISSES_IDX].min = 0; 995 996 report_prefix_push("Intel"); 997 set_ref_cycle_expectations(); 998 } else { 999 gp_events_size = sizeof(amd_gp_events)/sizeof(amd_gp_events[0]); 1000 gp_events = (struct pmu_event *)amd_gp_events; 1001 instruction_idx = AMD_INSTRUCTIONS_IDX; 1002 branch_idx = AMD_BRANCHES_IDX; 1003 branch_miss_idx = AMD_BRANCH_MISS_IDX; 1004 report_prefix_push("AMD"); 1005 } 1006 adjust_events_range(gp_events, instruction_idx, branch_idx, branch_miss_idx); 1007 1008 printf("PMU version: %d\n", pmu.version); 1009 printf("GP counters: %d\n", pmu.nr_gp_counters); 1010 printf("GP counter width: %d\n", pmu.gp_counter_width); 1011 printf("Event Mask length: %d\n", pmu.arch_event_mask_length); 1012 printf("Arch Events (mask): 0x%x\n", pmu.arch_event_available); 1013 printf("Fixed counters: %d\n", pmu.nr_fixed_counters); 1014 printf("Fixed counter width: %d\n", pmu.fixed_counter_width); 1015 1016 fixed_counters_num = MIN(pmu.nr_fixed_counters, ARRAY_SIZE(fixed_events)); 1017 if (pmu.nr_fixed_counters > ARRAY_SIZE(fixed_events)) 1018 report_info("Fixed counters number %d > defined fixed events %u. " 1019 "Please update test case.", pmu.nr_fixed_counters, 1020 (unsigned)ARRAY_SIZE(fixed_events)); 1021 1022 apic_write(APIC_LVTPC, PMI_VECTOR); 1023 1024 check_counters(); 1025 1026 if (pmu_has_full_writes()) { 1027 pmu.msr_gp_counter_base = MSR_IA32_PMC0; 1028 1029 report_prefix_push("full-width writes"); 1030 check_counters(); 1031 check_gp_counters_write_width(); 1032 report_prefix_pop(); 1033 } 1034 1035 if (!pmu.is_intel) { 1036 report_prefix_push("K7"); 1037 pmu.nr_gp_counters = AMD64_NUM_COUNTERS; 1038 pmu.msr_gp_counter_base = MSR_K7_PERFCTR0; 1039 pmu.msr_gp_event_select_base = MSR_K7_EVNTSEL0; 1040 check_counters(); 1041 report_prefix_pop(); 1042 } 1043 1044 return report_summary(); 1045 } 1046