// SPDX-License-Identifier: GPL-2.0-only
/*
 * Zhaoxin PMU; like Intel Architectural PerfMon-v2
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/nmi.h>

#include <asm/cpufeature.h>
#include <asm/hardirq.h>
#include <asm/apic.h>
#include <asm/msr.h>

#include "../perf_event.h"

/*
 * Zhaoxin PerfMon, used on zxc and later.
 */
static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {

        [PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
        [PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
        [PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
        [PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
        [PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
};

static struct event_constraint zxc_event_constraints[] __read_mostly = {

        FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
        EVENT_CONSTRAINT_END
};

static struct event_constraint zxd_event_constraints[] __read_mostly = {

        FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
        FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
        FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
        EVENT_CONSTRAINT_END
};

static __initconst const u64 zxd_hw_cache_event_ids
                [PERF_COUNT_HW_CACHE_MAX]
                [PERF_COUNT_HW_CACHE_OP_MAX]
                [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
[C(L1D)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0042,
                [C(RESULT_MISS)] = 0x0538,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0043,
                [C(RESULT_MISS)] = 0x0562,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(L1I)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0300,
                [C(RESULT_MISS)] = 0x0301,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x030a,
                [C(RESULT_MISS)] = 0x030b,
        },
},
[C(LL)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(DTLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0042,
                [C(RESULT_MISS)] = 0x052c,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0043,
                [C(RESULT_MISS)] = 0x0530,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x0564,
                [C(RESULT_MISS)] = 0x0565,
        },
},
[C(ITLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x00c0,
                [C(RESULT_MISS)] = 0x0534,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(BPU)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0700,
                [C(RESULT_MISS)] = 0x0709,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(NODE)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
};

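/*
 * Hardware cache event table for ZXE cores. The layout matches the ZXD
 * table above; the two differ mainly in their L1D, DTLB, LL and BPU
 * event encodings.
 */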
static __initconst const u64 zxe_hw_cache_event_ids
                [PERF_COUNT_HW_CACHE_MAX]
                [PERF_COUNT_HW_CACHE_OP_MAX]
                [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
[C(L1D)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0568,
                [C(RESULT_MISS)] = 0x054b,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0669,
                [C(RESULT_MISS)] = 0x0562,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(L1I)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0300,
                [C(RESULT_MISS)] = 0x0301,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x030a,
                [C(RESULT_MISS)] = 0x030b,
        },
},
[C(LL)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0,
                [C(RESULT_MISS)] = 0x0,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0,
                [C(RESULT_MISS)] = 0x0,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x0,
                [C(RESULT_MISS)] = 0x0,
        },
},
[C(DTLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0568,
                [C(RESULT_MISS)] = 0x052c,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = 0x0669,
                [C(RESULT_MISS)] = 0x0530,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = 0x0564,
                [C(RESULT_MISS)] = 0x0565,
        },
},
[C(ITLB)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x00c0,
                [C(RESULT_MISS)] = 0x0534,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(BPU)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = 0x0028,
                [C(RESULT_MISS)] = 0x0029,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
[C(NODE)] = {
        [C(OP_READ)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_WRITE)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
        [C(OP_PREFETCH)] = {
                [C(RESULT_ACCESS)] = -1,
                [C(RESULT_MISS)] = -1,
        },
},
};

static void zhaoxin_pmu_disable_all(void)
{
        wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}

static void zhaoxin_pmu_enable_all(int added)
{
        wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
}

static inline u64 zhaoxin_pmu_get_status(void)
{
        u64 status;

        rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status);

        return status;
}

static inline void zhaoxin_pmu_ack_status(u64 ack)
{
        wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static inline void zxc_pmu_ack_status(u64 ack)
{
        /*
         * ZXC needs global control enabled in order to clear status bits.
         */
        zhaoxin_pmu_enable_all(0);
        zhaoxin_pmu_ack_status(ack);
        zhaoxin_pmu_disable_all();
}

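/*
 * Fixed counters are programmed through MSR_ARCH_PERFMON_FIXED_CTR_CTRL:
 * each fixed counter owns a 4-bit control field at bit position idx * 4
 * (bit 0: ring-0 counting, bit 1: ring-3 counting, bit 3: PMI on overflow).
 */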
static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
{
        int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
        u64 ctrl_val, mask;

        mask = 0xfULL << (idx * 4);

        rdmsrq(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        wrmsrq(hwc->config_base, ctrl_val);
}

static void zhaoxin_pmu_disable_event(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;

        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                zhaoxin_pmu_disable_fixed(hwc);
                return;
        }

        x86_pmu_disable_event(event);
}

static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
{
        int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;

        /*
         * Enable IRQ generation (0x8),
         * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
         * if requested:
         */
        bits = 0x8ULL;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
                bits |= 0x2;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
                bits |= 0x1;

        bits <<= (idx * 4);
        mask = 0xfULL << (idx * 4);

        rdmsrq(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        ctrl_val |= bits;
        wrmsrq(hwc->config_base, ctrl_val);
}

static void zhaoxin_pmu_enable_event(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;

        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                zhaoxin_pmu_enable_fixed(hwc);
                return;
        }

        __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
{
        struct perf_sample_data data;
        struct cpu_hw_events *cpuc;
        int handled = 0;
        u64 status;
        int bit;

        cpuc = this_cpu_ptr(&cpu_hw_events);
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        zhaoxin_pmu_disable_all();
        status = zhaoxin_pmu_get_status();
        if (!status)
                goto done;

again:
        if (x86_pmu.enabled_ack)
                zxc_pmu_ack_status(status);
        else
                zhaoxin_pmu_ack_status(status);

        inc_irq_stat(apic_perf_irqs);

        /*
         * CondChgd bit 63 doesn't mean any overflow status. Ignore
         * and clear the bit.
         */
        if (__test_and_clear_bit(63, (unsigned long *)&status)) {
                if (!status)
                        goto done;
        }

        for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_event *event = cpuc->events[bit];

                handled++;

                if (!test_bit(bit, cpuc->active_mask))
                        continue;

                x86_perf_event_update(event);
                perf_sample_data_init(&data, 0, event->hw.last_period);

                if (!x86_perf_event_set_period(event))
                        continue;

                perf_event_overflow(event, &data, regs);
        }

        /*
         * Repeat if there is more work to be done:
         */
        status = zhaoxin_pmu_get_status();
        if (status)
                goto again;

done:
        zhaoxin_pmu_enable_all(0);
        return handled;
}

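/* Map a generic perf hw_event id to the Zhaoxin raw event code. */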
static u64 zhaoxin_pmu_event_map(int hw_event)
{
        return zx_pmon_event_map[hw_event];
}

static struct event_constraint *
zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                              struct perf_event *event)
{
        struct event_constraint *c;

        if (x86_pmu.event_constraints) {
                for_each_event_constraint(c, x86_pmu.event_constraints) {
                        if ((event->hw.config & c->cmask) == c->code)
                                return c;
                }
        }

        return &unconstrained;
}

PMU_FORMAT_ATTR(event, "config:0-7");
PMU_FORMAT_ATTR(umask, "config:8-15");
PMU_FORMAT_ATTR(edge, "config:18");
PMU_FORMAT_ATTR(inv, "config:23");
PMU_FORMAT_ATTR(cmask, "config:24-31");

static struct attribute *zx_arch_formats_attr[] = {
        &format_attr_event.attr,
        &format_attr_umask.attr,
        &format_attr_edge.attr,
        &format_attr_inv.attr,
        &format_attr_cmask.attr,
        NULL,
};

static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
{
        u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);

        return x86_event_sysfs_show(page, config, event);
}

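/*
 * Zhaoxin PMU description: reuses the generic x86 hw_config and event
 * scheduling helpers, with Zhaoxin-specific enable/disable paths, overflow
 * handling and event constraints.
 */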
static const struct x86_pmu zhaoxin_pmu __initconst = {
        .name                   = "zhaoxin",
        .handle_irq             = zhaoxin_pmu_handle_irq,
        .disable_all            = zhaoxin_pmu_disable_all,
        .enable_all             = zhaoxin_pmu_enable_all,
        .enable                 = zhaoxin_pmu_enable_event,
        .disable                = zhaoxin_pmu_disable_event,
        .hw_config              = x86_pmu_hw_config,
        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map              = zhaoxin_pmu_event_map,
        .max_events             = ARRAY_SIZE(zx_pmon_event_map),
        .apic                   = 1,
        /*
         * For zxd/zxe, reads and writes of the PMCx MSRs are 48 bits wide.
         */
        .max_period             = (1ULL << 47) - 1,
        .get_event_constraints  = zhaoxin_get_event_constraints,

        .format_attrs           = zx_arch_formats_attr,
        .events_sysfs_show      = zhaoxin_event_sysfs_show,
};

static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
        { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
        { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
        { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
        { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
        { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
        { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
        { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
};

static __init void zhaoxin_arch_events_quirk(void)
{
        int bit;

        /* Disable events that CPUID reports as not present. */
        for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
                zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
                pr_warn("CPUID marked event: \'%s\' unavailable\n",
                        zx_arch_events_map[bit].name);
        }
}

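/*
 * Probe the PMU via CPUID leaf 0xa; only architectural PerfMon version 2
 * is accepted. Family/model/stepping then selects the ZXC, ZXD or ZXE
 * specific event tables and constraints.
 */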
__init int zhaoxin_pmu_init(void)
{
        union cpuid10_edx edx;
        union cpuid10_eax eax;
        union cpuid10_ebx ebx;
        struct event_constraint *c;
        unsigned int unused;
        int version;

        pr_info("Welcome to zhaoxin pmu!\n");

        /*
         * Check whether the Architectural PerfMon supports
         * hw_event or not.
         */
        cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);

        if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
                return -ENODEV;

        version = eax.split.version_id;
        if (version != 2)
                return -ENODEV;

        x86_pmu = zhaoxin_pmu;
        pr_info("Version check pass!\n");

        x86_pmu.version                 = version;
        x86_pmu.cntr_mask64             = GENMASK_ULL(eax.split.num_counters - 1, 0);
        x86_pmu.cntval_bits             = eax.split.bit_width;
        x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
        x86_pmu.events_maskl            = ebx.full;
        x86_pmu.events_mask_len         = eax.split.mask_length;

        x86_pmu.fixed_cntr_mask64       = GENMASK_ULL(edx.split.num_counters_fixed - 1, 0);
        x86_add_quirk(zhaoxin_arch_events_quirk);

        switch (boot_cpu_data.x86) {
        case 0x06:
                /*
                 * Support Zhaoxin CPUs from the ZXC series on; exclude the Nano series via FMS.
                 * Nano FMS: Family=6, Model=F, Stepping=[0-A][C-D]
                 * ZXC FMS: Family=6, Model=F, Stepping=E-F OR Family=6, Model=0x19, Stepping=0-3
                 */
                if ((boot_cpu_data.x86_model == 0x0f && boot_cpu_data.x86_stepping >= 0x0e) ||
                        boot_cpu_data.x86_model == 0x19) {

                        x86_pmu.max_period = x86_pmu.cntval_mask >> 1;

                        /* Clearing status works only if the global control is enabled on zxc. */
                        x86_pmu.enabled_ack = 1;

                        x86_pmu.event_constraints = zxc_event_constraints;
                        zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
                        zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
                        zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
                        zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;

                        pr_cont("ZXC events, ");
                        break;
                }
                return -ENODEV;

        case 0x07:
                zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
                        X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);

                zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
                        X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);

                switch (boot_cpu_data.x86_model) {
                case 0x1b:
                        memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
                               sizeof(hw_cache_event_ids));

                        x86_pmu.event_constraints = zxd_event_constraints;

                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;

                        pr_cont("ZXD events, ");
                        break;
                case 0x3b:
                        memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
                               sizeof(hw_cache_event_ids));

                        x86_pmu.event_constraints = zxd_event_constraints;

                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
                        zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;

                        pr_cont("ZXE events, ");
                        break;
                default:
                        return -ENODEV;
                }
                break;

        default:
                return -ENODEV;
        }

        x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
        x86_pmu.intel_ctrl |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED;

        if (x86_pmu.event_constraints) {
                for_each_event_constraint(c, x86_pmu.event_constraints) {
                        c->idxmsk64 |= x86_pmu.cntr_mask64;
                        c->weight += x86_pmu_num_counters(NULL);
                }
        }

        return 0;
}