#include "x86/msr.h"
#include "x86/processor.h"
#include "x86/pmu.h"
#include "x86/apic-defs.h"
#include "x86/apic.h"
#include "x86/desc.h"
#include "x86/isr.h"
#include "alloc.h"

#include "libcflat.h"
#include <stdint.h>

#define N 1000000

// These values match the number of instructions and branches in the
// assembly block in check_emulated_instr().
#define EXPECTED_INSTR 17
#define EXPECTED_BRNCH 5

typedef struct {
	uint32_t ctr;
	uint32_t config;
	uint64_t count;
	int idx;
} pmu_counter_t;

struct pmu_event {
	const char *name;
	uint32_t unit_sel;
	int min;
	int max;
} gp_events[] = {
	{"core cycles", 0x003c, 1*N, 50*N},
	{"instructions", 0x00c0, 10*N, 10.2*N},
	{"ref cycles", 0x013c, 1*N, 30*N},
	{"llc references", 0x4f2e, 1, 2*N},
	{"llc misses", 0x412e, 1, 1*N},
	{"branches", 0x00c4, 1*N, 1.1*N},
	{"branch misses", 0x00c5, 0, 0.1*N},
}, fixed_events[] = {
	{"fixed 1", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
	{"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
	{"fixed 3", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
};

char *buf;

static inline void loop(void)
{
	unsigned long tmp, tmp2, tmp3;

	asm volatile("1: mov (%1), %2; add $64, %1; nop; nop; nop; nop; nop; nop; nop; loop 1b"
			: "=c"(tmp), "=r"(tmp2), "=r"(tmp3): "0"(N), "1"(buf));
}

volatile uint64_t irq_received;

static void cnt_overflow(isr_regs_t *regs)
{
	irq_received++;
	apic_write(APIC_EOI, 0);
}

static bool check_irq(void)
{
	int i;
	irq_received = 0;
	irq_enable();
	for (i = 0; i < 100000 && !irq_received; i++)
		asm volatile("pause");
	irq_disable();
	return irq_received;
}

static bool is_gp(pmu_counter_t *evt)
{
	return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
		evt->ctr >= MSR_IA32_PMC0;
}

static int event_to_global_idx(pmu_counter_t *cnt)
{
	return cnt->ctr - (is_gp(cnt) ? pmu.msr_gp_counter_base :
		(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
}

static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
{
	if (is_gp(cnt)) {
		int i;

		for (i = 0; i < sizeof(gp_events)/sizeof(gp_events[0]); i++)
			if (gp_events[i].unit_sel == (cnt->config & 0xffff))
				return &gp_events[i];
	} else
		return &fixed_events[cnt->ctr - MSR_CORE_PERF_FIXED_CTR0];

	return (void*)0;
}

static void global_enable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	cnt->idx = event_to_global_idx(cnt);
	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) | BIT_ULL(cnt->idx));
}

static void global_disable(pmu_counter_t *cnt)
{
	if (!this_cpu_has_perf_global_ctrl())
		return;

	wrmsr(pmu.msr_global_ctl, rdmsr(pmu.msr_global_ctl) & ~BIT_ULL(cnt->idx));
}

static void __start_event(pmu_counter_t *evt, uint64_t count)
{
	evt->count = count;
	wrmsr(evt->ctr, evt->count);
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config | EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		uint32_t usrospmi = 0;

		if (evt->config & EVNTSEL_OS)
			usrospmi |= (1 << 0);
		if (evt->config & EVNTSEL_USR)
			usrospmi |= (1 << 1);
		if (evt->config & EVNTSEL_INT)
			usrospmi |= (1 << 3); // PMI on overflow
		ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
	}
	global_enable(evt);
	apic_write(APIC_LVTPC, PMI_VECTOR);
}

static void start_event(pmu_counter_t *evt)
{
	__start_event(evt, 0);
}

static void stop_event(pmu_counter_t *evt)
{
	global_disable(evt);
	if (is_gp(evt)) {
		wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
		      evt->config & ~EVNTSEL_EN);
	} else {
		uint32_t ctrl = rdmsr(MSR_CORE_PERF_FIXED_CTR_CTRL);
		int shift = (evt->ctr - MSR_CORE_PERF_FIXED_CTR0) * 4;
		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl & ~(0xf << shift));
	}
	evt->count = rdmsr(evt->ctr);
}

static noinline void measure_many(pmu_counter_t *evt, int count)
{
	int i;
	for (i = 0; i < count; i++)
		start_event(&evt[i]);
	loop();
	for (i = 0; i < count; i++)
		stop_event(&evt[i]);
}

static void measure_one(pmu_counter_t *evt)
{
	measure_many(evt, 1);
}

static noinline void __measure(pmu_counter_t *evt, uint64_t count)
{
	__start_event(evt, count);
	loop();
	stop_event(evt);
}

static bool verify_event(uint64_t count, struct pmu_event *e)
{
	// printf("%d <= %ld <= %d\n", e->min, count, e->max);
	return count >= e->min && count <= e->max;
}

static bool verify_counter(pmu_counter_t *cnt)
{
	return verify_event(cnt->count, get_counter_event(cnt));
}

static void check_gp_counter(struct pmu_event *evt)
{
	pmu_counter_t cnt = {
		.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
	};
	int i;

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		cnt.ctr = MSR_GP_COUNTERx(i);
		measure_one(&cnt);
		report(verify_event(cnt.count, evt), "%s-%d", evt->name, i);
	}
}

static void check_gp_counters(void)
{
	int i;

	for (i = 0; i < sizeof(gp_events)/sizeof(gp_events[0]); i++)
		if (pmu_gp_counter_is_available(i))
			check_gp_counter(&gp_events[i]);
		else
printf("GP event '%s' is disabled\n", 221 gp_events[i].name); 222 } 223 224 static void check_fixed_counters(void) 225 { 226 pmu_counter_t cnt = { 227 .config = EVNTSEL_OS | EVNTSEL_USR, 228 }; 229 int i; 230 231 for (i = 0; i < pmu.nr_fixed_counters; i++) { 232 cnt.ctr = fixed_events[i].unit_sel; 233 measure_one(&cnt); 234 report(verify_event(cnt.count, &fixed_events[i]), "fixed-%d", i); 235 } 236 } 237 238 static void check_counters_many(void) 239 { 240 pmu_counter_t cnt[10]; 241 int i, n; 242 243 for (i = 0, n = 0; n < pmu.nr_gp_counters; i++) { 244 if (!pmu_gp_counter_is_available(i)) 245 continue; 246 247 cnt[n].ctr = MSR_GP_COUNTERx(n); 248 cnt[n].config = EVNTSEL_OS | EVNTSEL_USR | 249 gp_events[i % ARRAY_SIZE(gp_events)].unit_sel; 250 n++; 251 } 252 for (i = 0; i < pmu.nr_fixed_counters; i++) { 253 cnt[n].ctr = fixed_events[i].unit_sel; 254 cnt[n].config = EVNTSEL_OS | EVNTSEL_USR; 255 n++; 256 } 257 258 measure_many(cnt, n); 259 260 for (i = 0; i < n; i++) 261 if (!verify_counter(&cnt[i])) 262 break; 263 264 report(i == n, "all counters"); 265 } 266 267 static uint64_t measure_for_overflow(pmu_counter_t *cnt) 268 { 269 __measure(cnt, 0); 270 /* 271 * To generate overflow, i.e. roll over to '0', the initial count just 272 * needs to be preset to the negative expected count. However, as per 273 * Intel's SDM, the preset count needs to be incremented by 1 to ensure 274 * the overflow interrupt is generated immediately instead of possibly 275 * waiting for the overflow to propagate through the counter. 276 */ 277 assert(cnt->count > 1); 278 return 1 - cnt->count; 279 } 280 281 static void check_counter_overflow(void) 282 { 283 uint64_t overflow_preset; 284 int i; 285 pmu_counter_t cnt = { 286 .ctr = MSR_GP_COUNTERx(0), 287 .config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel /* instructions */, 288 }; 289 overflow_preset = measure_for_overflow(&cnt); 290 291 /* clear status before test */ 292 if (this_cpu_has_perf_global_status()) 293 pmu_clear_global_status(); 294 295 report_prefix_push("overflow"); 296 297 for (i = 0; i < pmu.nr_gp_counters + 1; i++) { 298 uint64_t status; 299 int idx; 300 301 cnt.count = overflow_preset; 302 if (pmu_use_full_writes()) 303 cnt.count &= (1ull << pmu.gp_counter_width) - 1; 304 305 if (i == pmu.nr_gp_counters) { 306 cnt.ctr = fixed_events[0].unit_sel; 307 cnt.count = measure_for_overflow(&cnt); 308 cnt.count &= (1ull << pmu.gp_counter_width) - 1; 309 } else { 310 cnt.ctr = MSR_GP_COUNTERx(i); 311 } 312 313 if (i % 2) 314 cnt.config |= EVNTSEL_INT; 315 else 316 cnt.config &= ~EVNTSEL_INT; 317 idx = event_to_global_idx(&cnt); 318 __measure(&cnt, cnt.count); 319 report(cnt.count == 1, "cntr-%d", i); 320 321 if (!this_cpu_has_perf_global_status()) 322 continue; 323 324 status = rdmsr(pmu.msr_global_status); 325 report(status & (1ull << idx), "status-%d", i); 326 wrmsr(pmu.msr_global_status_clr, status); 327 status = rdmsr(pmu.msr_global_status); 328 report(!(status & (1ull << idx)), "status clear-%d", i); 329 report(check_irq() == (i % 2), "irq-%d", i); 330 } 331 332 report_prefix_pop(); 333 } 334 335 static void check_gp_counter_cmask(void) 336 { 337 pmu_counter_t cnt = { 338 .ctr = MSR_GP_COUNTERx(0), 339 .config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel /* instructions */, 340 }; 341 cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT); 342 measure_one(&cnt); 343 report(cnt.count < gp_events[1].min, "cmask"); 344 } 345 346 static void do_rdpmc_fast(void *ptr) 347 { 348 pmu_counter_t *cnt = ptr; 349 uint32_t idx = (uint32_t)cnt->idx | (1u << 31); 

	if (!is_gp(cnt))
		idx |= 1 << 30;

	cnt->count = rdpmc(idx);
}

static void check_rdpmc(void)
{
	uint64_t val = 0xff0123456789ull;
	bool exc;
	int i;

	report_prefix_push("rdpmc");

	for (i = 0; i < pmu.nr_gp_counters; i++) {
		uint64_t x;
		pmu_counter_t cnt = {
			.ctr = MSR_GP_COUNTERx(i),
			.idx = i
		};

		/*
		 * Without full-width writes, only the low 32 bits are writable,
		 * and the value is sign-extended.
		 */
		if (pmu.msr_gp_counter_base == MSR_IA32_PERFCTR0)
			x = (uint64_t)(int64_t)(int32_t)val;
		else
			x = (uint64_t)(int64_t)val;

		/* Mask according to the number of supported bits */
		x &= (1ull << pmu.gp_counter_width) - 1;

		wrmsr(MSR_GP_COUNTERx(i), val);
		report(rdpmc(i) == x, "cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fast-%d", i);
		else
			report(cnt.count == (u32)val, "fast-%d", i);
	}
	for (i = 0; i < pmu.nr_fixed_counters; i++) {
		uint64_t x = val & ((1ull << pmu.fixed_counter_width) - 1);
		pmu_counter_t cnt = {
			.ctr = MSR_CORE_PERF_FIXED_CTR0 + i,
			.idx = i
		};

		wrmsr(MSR_PERF_FIXED_CTRx(i), x);
		report(rdpmc(i | (1 << 30)) == x, "fixed cntr-%d", i);

		exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
		if (exc)
			report_skip("fixed fast-%d", i);
		else
			report(cnt.count == (u32)x, "fixed fast-%d", i);
	}

	report_prefix_pop();
}

static void check_running_counter_wrmsr(void)
{
	uint64_t status;
	uint64_t count;
	pmu_counter_t evt = {
		.ctr = MSR_GP_COUNTERx(0),
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel,
	};

	report_prefix_push("running counter wrmsr");

	start_event(&evt);
	loop();
	wrmsr(MSR_GP_COUNTERx(0), 0);
	stop_event(&evt);
	report(evt.count < gp_events[1].min, "cntr");

	/* clear status before overflow test */
	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	start_event(&evt);

	count = -1;
	if (pmu_use_full_writes())
		count &= (1ull << pmu.gp_counter_width) - 1;

	wrmsr(MSR_GP_COUNTERx(0), count);

	loop();
	stop_event(&evt);

	if (this_cpu_has_perf_global_status()) {
		status = rdmsr(pmu.msr_global_status);
		report(status & 1, "status msr bit");
	}

	report_prefix_pop();
}

static void check_emulated_instr(void)
{
	uint64_t status, instr_start, brnch_start;
	pmu_counter_t brnch_cnt = {
		.ctr = MSR_GP_COUNTERx(0),
		/* branch instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[5].unit_sel,
	};
	pmu_counter_t instr_cnt = {
		.ctr = MSR_GP_COUNTERx(1),
		/* instructions */
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel,
	};
	report_prefix_push("emulated instruction");

	if (this_cpu_has_perf_global_status())
		pmu_clear_global_status();

	start_event(&brnch_cnt);
	start_event(&instr_cnt);

	brnch_start = -EXPECTED_BRNCH;
	instr_start = -EXPECTED_INSTR;
	wrmsr(MSR_GP_COUNTERx(0), brnch_start);
	wrmsr(MSR_GP_COUNTERx(1), instr_start);
	// KVM_FEP is a magic prefix that forces emulation so
	// 'KVM_FEP "jne label\n"' just counts as a single instruction.
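	//
	// The block below executes 17 instructions (the initial mov and cmp,
	// the five emulated jnes, and five mov+cpuid pairs) and 5 branches
	// (the jnes), which is where EXPECTED_INSTR and EXPECTED_BRNCH
	// come from.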
	asm volatile(
		"mov $0x0, %%eax\n"
		"cmp $0x0, %%eax\n"
		KVM_FEP "jne label\n"
		KVM_FEP "jne label\n"
		KVM_FEP "jne label\n"
		KVM_FEP "jne label\n"
		KVM_FEP "jne label\n"
		"mov $0xa, %%eax\n"
		"cpuid\n"
		"mov $0xa, %%eax\n"
		"cpuid\n"
		"mov $0xa, %%eax\n"
		"cpuid\n"
		"mov $0xa, %%eax\n"
		"cpuid\n"
		"mov $0xa, %%eax\n"
		"cpuid\n"
		"label:\n"
		:
		:
		: "eax", "ebx", "ecx", "edx");

	if (this_cpu_has_perf_global_ctrl())
		wrmsr(pmu.msr_global_ctl, 0);

	stop_event(&brnch_cnt);
	stop_event(&instr_cnt);

	// Check that the end count - start count is at least the expected
	// number of instructions and branches.
	report(instr_cnt.count - instr_start >= EXPECTED_INSTR,
	       "instruction count");
	report(brnch_cnt.count - brnch_start >= EXPECTED_BRNCH,
	       "branch count");
	if (this_cpu_has_perf_global_status()) {
		// Additionally check that those counters overflowed properly.
		status = rdmsr(pmu.msr_global_status);
		report(status & 1, "branch counter overflow");
		report(status & 2, "instruction counter overflow");
	}

	report_prefix_pop();
}

static void check_counters(void)
{
	if (is_fep_available())
		check_emulated_instr();

	check_gp_counters();
	check_fixed_counters();
	check_rdpmc();
	check_counters_many();
	check_counter_overflow();
	check_gp_counter_cmask();
	check_running_counter_wrmsr();
}

static void do_unsupported_width_counter_write(void *index)
{
	wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
}

static void check_gp_counters_write_width(void)
{
	u64 val_64 = 0xffffff0123456789ull;
	u64 val_32 = val_64 & ((1ull << 32) - 1);
	u64 val_max_width = val_64 & ((1ull << pmu.gp_counter_width) - 1);
	int i;

	/*
	 * MSR_IA32_PERFCTRn supports 64-bit writes,
	 * but only the lowest 32 bits are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);

		wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
	}

	/*
	 * MSR_IA32_PMCn supports writing values up to GP counter width,
	 * and only the lowest bits of GP counter width are valid.
	 */
	for (i = 0; i < pmu.nr_gp_counters; i++) {
		wrmsr(MSR_IA32_PMC0 + i, val_32);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);

		wrmsr(MSR_IA32_PMC0 + i, val_max_width);
		assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
		assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);

		report(test_for_exception(GP_VECTOR,
			do_unsupported_width_counter_write, &i),
		       "writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
	}
}

/*
 * Per the SDM, reference cycles are currently implemented using the
 * core crystal clock, TSC, or bus clock. Calibrate to the TSC
 * frequency to set reasonable expectations.
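 *
 * The measured ratio of counted ref cycles to elapsed TSC cycles over a
 * long busy loop is used to rescale the "ref cycles" min/max bounds in
 * gp_events[2].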
 */
static void set_ref_cycle_expectations(void)
{
	pmu_counter_t cnt = {
		.ctr = MSR_IA32_PERFCTR0,
		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[2].unit_sel,
	};
	uint64_t tsc_delta;
	uint64_t t0, t1, t2, t3;

	/* Bit 2 enumerates the availability of reference cycles events. */
	if (!pmu.nr_gp_counters || !pmu_gp_counter_is_available(2))
		return;

	if (this_cpu_has_perf_global_ctrl())
		wrmsr(pmu.msr_global_ctl, 0);

	t0 = fenced_rdtsc();
	start_event(&cnt);
	t1 = fenced_rdtsc();

	/*
	 * This loop has to run long enough to dominate the VM-exit
	 * costs for playing with the PMU MSRs on start and stop.
	 *
	 * On a 2.6GHz Ice Lake, with the TSC frequency at 104 times
	 * the core crystal clock, this function calculated a guest
	 * TSC : ref cycles ratio of around 105 with ECX initialized
	 * to one billion.
	 */
	asm volatile("loop ." : "+c"((int){1000000000ull}));

	t2 = fenced_rdtsc();
	stop_event(&cnt);
	t3 = fenced_rdtsc();

	tsc_delta = ((t2 - t1) + (t3 - t0)) / 2;

	if (!tsc_delta)
		return;

	gp_events[2].min = (gp_events[2].min * cnt.count) / tsc_delta;
	gp_events[2].max = (gp_events[2].max * cnt.count) / tsc_delta;
}

static void check_invalid_rdpmc_gp(void)
{
	uint64_t val;

	report(rdpmc_safe(64, &val) == GP_VECTOR,
	       "Expected #GP on RDPMC(64)");
}

int main(int ac, char **av)
{
	setup_vm();
	handle_irq(PMI_VECTOR, cnt_overflow);
	buf = malloc(N*64);

	check_invalid_rdpmc_gp();

	if (!pmu.version) {
		report_skip("No Intel Arch PMU is detected!");
		return report_summary();
	}

	set_ref_cycle_expectations();

	printf("PMU version: %d\n", pmu.version);
	printf("GP counters: %d\n", pmu.nr_gp_counters);
	printf("GP counter width: %d\n", pmu.gp_counter_width);
	printf("Mask length: %d\n", pmu.gp_counter_mask_length);
	printf("Fixed counters: %d\n", pmu.nr_fixed_counters);
	printf("Fixed counter width: %d\n", pmu.fixed_counter_width);

	apic_write(APIC_LVTPC, PMI_VECTOR);

	check_counters();

	if (pmu_has_full_writes()) {
		pmu.msr_gp_counter_base = MSR_IA32_PMC0;

		report_prefix_push("full-width writes");
		check_counters();
		check_gp_counters_write_width();
		report_prefix_pop();
	}

	return report_summary();
}
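
/*
 * Usage sketch (assuming a standard kvm-unit-tests checkout; exact runner
 * flags depend on the host setup):
 *
 *   ./configure && make
 *   ./x86/run x86/pmu.flat -cpu host
 *
 * A vPMU must be exposed to the guest (e.g. KVM's enable_pmu module
 * parameter left enabled) for the counter checks to be exercised.
 */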