// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/slab.h>

#include <asm/cpu.h>
#include <asm/msr.h>

#include "mce_amd.h"

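/*
 * Per-family decoders for the legacy (pre-SMCA) MCA banks 0-2; filled in
 * by mce_amd_init() based on the boot CPU family.
 */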
static struct amd_decoder_ops fam_ops;

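/*
 * Mask for the extended error code (XEC) in MCA_STATUS; widened per family
 * in mce_amd_init().
 */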
static u8 xec_mask = 0xf;

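/*
 * Optional hook, registered by an EDAC memory-controller driver via
 * amd_register_ecc_decoder(), for decoding DRAM ECC errors on a given node.
 */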
static void (*decode_dram_ecc)(int node_id, struct mce *m);

void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };

static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",		/* xec = 0xd */
74 "Microcode Patch Buffer", /* xec = 010 */
75 "uop queue",
76 "insn buffer",
77 "predecode buffer",
78 "fetch address FIFO",
79 "dispatch uop queue"
80 };
81
82 static const char * const f15h_mc2_mce_desc[] = {
83 "Fill ECC error on data fills", /* xec = 0x4 */
84 "Fill parity error on insn fills",
85 "Prefetcher request FIFO parity error",
86 "PRQ address parity error",
87 "PRQ data parity error",
88 "WCC Tag ECC error",
89 "WCC Data ECC error",
90 "WCB Data parity error",
91 "VB Data ECC or parity error",
92 "L2 Tag ECC error", /* xec = 0x10 */
93 "Hard L2 Tag ECC error",
94 "Multiple hits on L2 tag",
95 "XAB parity error",
96 "PRB address parity error"
97 };
98
99 static const char * const mc4_mce_desc[] = {
100 "DRAM ECC error detected on the NB",
101 "CRC error detected on HT link",
102 "Link-defined sync error packets detected on HT link",
103 "HT Master abort",
104 "HT Target abort",
105 "Invalid GART PTE entry during GART table walk",
106 "Unsupported atomic RMW received from an IO link",
107 "Watchdog timeout due to lack of progress",
108 "DRAM ECC error detected on the NB",
109 "SVM DMA Exclusion Vector error",
110 "HT data error detected on link",
111 "Protocol error (link, L3, probe filter)",
112 "NB internal arrays parity error",
113 "DRAM addr/ctl signals parity error",
114 "IO link transmission error",
115 "L3 data cache ECC error", /* xec = 0x1c */
116 "L3 cache tag error",
117 "L3 LRU parity bits error",
118 "ECC Error in the Probe Filter directory"
119 };
120
121 static const char * const mc5_mce_desc[] = {
122 "CPU Watchdog timer expire",
123 "Wakeup array dest tag",
124 "AG payload array",
125 "EX payload array",
126 "IDRF array",
127 "Retire dispatch queue",
128 "Mapper checkpoint array",
129 "Physical register file EX0 port",
130 "Physical register file EX1 port",
131 "Physical register file AG0 port",
132 "Physical register file AG1 port",
133 "Flag register file",
134 "DE error occurred",
135 "Retire status queue"
136 };
137
138 static const char * const mc6_mce_desc[] = {
139 "Hardware Assertion",
140 "Free List",
141 "Physical Register File",
142 "Retire Queue",
143 "Scheduler table",
144 "Status Register File",
145 };
146
f12h_mc0_mce(u16 ec,u8 xec)147 static bool f12h_mc0_mce(u16 ec, u8 xec)
148 {
149 bool ret = false;
150
151 if (MEM_ERROR(ec)) {
152 u8 ll = LL(ec);
153 ret = true;
154
155 if (ll == LL_L2)
156 pr_cont("during L1 linefill from L2.\n");
157 else if (ll == LL_L1)
158 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
159 else
160 ret = false;
161 }
162 return ret;
163 }
164
f10h_mc0_mce(u16 ec,u8 xec)165 static bool f10h_mc0_mce(u16 ec, u8 xec)
166 {
167 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
168 pr_cont("during data scrub.\n");
169 return true;
170 }
171 return f12h_mc0_mce(ec, xec);
172 }
173
k8_mc0_mce(u16 ec,u8 xec)174 static bool k8_mc0_mce(u16 ec, u8 xec)
175 {
176 if (BUS_ERROR(ec)) {
177 pr_cont("during system linefill.\n");
178 return true;
179 }
180
181 return f10h_mc0_mce(ec, xec);
182 }
183
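/* cat_mc0_mce()/cat_mc1_mce() serve the small-core ("cat") families 0x14 and 0x16; see mce_amd_init(). */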
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}

static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops.mc0_mce(ec, xec))
		;
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}

static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}

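/*
 * f15h_mc1_mce_desc[] has no entries for xec 0xb, 0xc, 0xe and 0xf, hence
 * the index offsets in the switch below.
 */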
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops.mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}

static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

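/*
 * f15h_mc2_mce_desc[] starts at xec 0x4 and has no entries for xec 0xd-0xf,
 * hence the index offsets used for memory errors below.
 */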
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}

static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN) ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb" : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
			((r4 == R4_GEN) ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD) ? "Hit" :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}

static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops.mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}

static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}

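/*
 * Bank 4 is the northbridge. DRAM ECC errors (xec 0x0 or 0x8) are also passed
 * to the DRAM ECC decoder registered via amd_register_ecc_decoder(), if any.
 */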
static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = topology_amd_node_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}

static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}

static void decode_mc6_mce(struct mce *m)
{
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC6 Error: ");

	if (xec > 0x5)
		goto wrong_mc6_mce;

	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
	return;

wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

static const char * const smca_long_names[] = {
	[SMCA_LS ... SMCA_LS_V2] = "Load Store Unit",
	[SMCA_IF] = "Instruction Fetch Unit",
	[SMCA_L2_CACHE] = "L2 Cache",
	[SMCA_DE] = "Decode Unit",
	[SMCA_RESERVED] = "Reserved",
	[SMCA_EX] = "Execution Unit",
	[SMCA_FP] = "Floating Point Unit",
	[SMCA_L3_CACHE] = "L3 Cache",
	[SMCA_CS ... SMCA_CS_V2] = "Coherent Slave",
	[SMCA_PIE] = "Power, Interrupts, etc.",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC] = "Unified Memory Controller",
	[SMCA_UMC_V2] = "Unified Memory Controller v2",
	[SMCA_PB] = "Parameter Block",
	[SMCA_PSP ... SMCA_PSP_V2] = "Platform Security Processor",
	[SMCA_SMU ... SMCA_SMU_V2] = "System Management Unit",
	[SMCA_MP5] = "Microprocessor 5 Unit",
	[SMCA_MPDMA] = "MPDMA Unit",
	[SMCA_NBIO] = "Northbridge IO Unit",
	[SMCA_PCIE ... SMCA_PCIE_V2] = "PCI Express Unit",
	[SMCA_XGMI_PCS] = "Ext Global Memory Interconnect PCS Unit",
	[SMCA_NBIF] = "NBIF Unit",
	[SMCA_SHUB] = "System Hub Unit",
	[SMCA_SATA] = "SATA Unit",
	[SMCA_USB] = "USB Unit",
	[SMCA_GMI_PCS] = "Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY] = "Ext Global Memory Interconnect PHY Unit",
	[SMCA_WAFL_PHY] = "WAFL PHY Unit",
	[SMCA_GMI_PHY] = "Global Memory Interconnect PHY Unit",
};

static const char *smca_get_long_name(enum smca_bank_types t)
{
	if (t >= N_SMCA_BANK_TYPES)
		return NULL;

	return smca_long_names[t];
}

/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
	u8 xec = XEC(m->status, xec_mask);

	if (bank_type >= N_SMCA_BANK_TYPES)
		return;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);

	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
	    xec == 0 && decode_dram_ecc)
		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
}

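/* Decode the architectural error-code fields in MCA_STATUS[15:0]. */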
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

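/* Classify severity from the UC/PCC/RIPV/Deferred status bits. */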
static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error, no action required.";

	return "Corrected error, no action required.";
}

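/*
 * Notifier callback on the MCE decode chain: print a human-readable summary
 * of the error and dispatch to the per-bank (or SMCA) decoders above.
 */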
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct mce_hw_err *err = to_mce_hw_err(m);
	unsigned int fam = x86_family(m->cpuid);
	u32 mca_config_lo = 0, dummy;
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
		((m->status & MCI_STATUS_UC) ? "UE" :
		 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
		((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);

		if (mca_config_lo & MCI_CONFIG_MCAX)
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* UECC (bit 45) and CECC (bit 46) are reported together. */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV) {
			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
				char frutext[17];

				frutext[16] = '\0';
				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);

				pr_emerg(HW_ERR "FRU Text: %s", frutext);
			}
		}

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call = amd_decode_mce,
	.priority = MCE_PRIO_EDAC,
};

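/*
 * Register on the MCE decode chain for AMD/Hygon bare metal only. SMCA-capable
 * CPUs are handled via decode_smca_error(); older families get per-family
 * fam_ops handlers selected below.
 */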
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON)
		return -ENODEV;

	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		xec_mask = 0x3f;
		goto out;
	}

	switch (c->x86) {
	case 0xf:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops.mc0_mce = f10h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops.mc0_mce = f12h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops.mc0_mce = f15h_mc0_mce;
		fam_ops.mc1_mce = f15h_mc1_mce;
		fam_ops.mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
	case 0x18:
		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
		return -EINVAL;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		return -EINVAL;
	}

out:
	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif