xref: /linux/drivers/edac/mce_amd.c (revision ab93e0dd72c37d378dd936f031ffb83ff2bd87ce)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 #include <asm/msr.h>
7 
8 #include "mce_amd.h"
9 
10 static struct amd_decoder_ops fam_ops;
11 
12 static u8 xec_mask	 = 0xf;
13 
14 static void (*decode_dram_ecc)(int node_id, struct mce *m);
15 
amd_register_ecc_decoder(void (* f)(int,struct mce *))16 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
17 {
18 	decode_dram_ecc = f;
19 }
20 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
21 
amd_unregister_ecc_decoder(void (* f)(int,struct mce *))22 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
23 {
24 	if (decode_dram_ecc) {
25 		WARN_ON(decode_dram_ecc != f);
26 
27 		decode_dram_ecc = NULL;
28 	}
29 }
30 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
31 
32 /*
33  * string representation for the different MCA reported error types, see F3x48
34  * or MSR0000_0411.
35  */
36 
/* Transaction type strings. */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
39 
/* Cache level strings. */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
42 
/* Memory transaction type strings. */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
47 
/* Participating processor strings; exported for use outside this file. */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
51 
/* Request timeout strings. */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
54 
/* Memory or I/O strings. */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
57 
/* Internal error type strings. */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
60 
/*
 * F15h MC1 error descriptions. f15h_mc1_mce() maps the extended error code
 * onto this table: xec 0x0-0xa index it directly, xec 0xd uses index 11 and
 * xec 0x10-0x15 use indices 12-17.
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",	/* xec = 0x0 */
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",	/* xec = 0xa */
	[11] = "PFB valid bit parity error",		/* xec = 0xd */
	[12] = "Microcode Patch Buffer",		/* xec = 0x10 */
	[13] = "uop queue",
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",			/* xec = 0x15 */
};
81 
/*
 * F15h MC2 memory error descriptions. f15h_mc2_mce() maps xec 0x4-0xc onto
 * indices 0-8 and xec 0x10-0x14 onto indices 9-13.
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",		/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",		/* xec = 0xc */
	[9]  = "L2 Tag ECC error",			/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",		/* xec = 0x14 */
};
98 
/*
 * MC4 error descriptions. decode_mc4_mce() indexes entries 0-14 directly
 * with xec 0x0-0xe and maps xec 0x1c-0x1f onto entries 15-18.
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",	/* xec = 0x0 */
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",	/* xec = 0x8 */
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",		/* xec = 0xe */
	[15] = "L3 data cache ECC error",		/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory", /* xec = 0x1f */
};
120 
/* MC5 error descriptions, indexed directly by xec (0x0-0xd). */
static const char * const mc5_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred",
	[0xd] = "Retire status queue",
};
137 
/* MC6 error descriptions, indexed directly by xec (0x0-0x5). */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};
146 
f12h_mc0_mce(u16 ec,u8 xec)147 static bool f12h_mc0_mce(u16 ec, u8 xec)
148 {
149 	bool ret = false;
150 
151 	if (MEM_ERROR(ec)) {
152 		u8 ll = LL(ec);
153 		ret = true;
154 
155 		if (ll == LL_L2)
156 			pr_cont("during L1 linefill from L2.\n");
157 		else if (ll == LL_L1)
158 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
159 		else
160 			ret = false;
161 	}
162 	return ret;
163 }
164 
f10h_mc0_mce(u16 ec,u8 xec)165 static bool f10h_mc0_mce(u16 ec, u8 xec)
166 {
167 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
168 		pr_cont("during data scrub.\n");
169 		return true;
170 	}
171 	return f12h_mc0_mce(ec, xec);
172 }
173 
k8_mc0_mce(u16 ec,u8 xec)174 static bool k8_mc0_mce(u16 ec, u8 xec)
175 {
176 	if (BUS_ERROR(ec)) {
177 		pr_cont("during system linefill.\n");
178 		return true;
179 	}
180 
181 	return f10h_mc0_mce(ec, xec);
182 }
183 
cat_mc0_mce(u16 ec,u8 xec)184 static bool cat_mc0_mce(u16 ec, u8 xec)
185 {
186 	u8 r4	 = R4(ec);
187 	bool ret = true;
188 
189 	if (MEM_ERROR(ec)) {
190 
191 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
192 			return false;
193 
194 		switch (r4) {
195 		case R4_DRD:
196 		case R4_DWR:
197 			pr_cont("Data/Tag parity error due to %s.\n",
198 				(r4 == R4_DRD ? "load/hw prf" : "store"));
199 			break;
200 		case R4_EVICT:
201 			pr_cont("Copyback parity error on a tag miss.\n");
202 			break;
203 		case R4_SNOOP:
204 			pr_cont("Tag parity error during snoop.\n");
205 			break;
206 		default:
207 			ret = false;
208 		}
209 	} else if (BUS_ERROR(ec)) {
210 
211 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
212 			return false;
213 
214 		pr_cont("System read data error on a ");
215 
216 		switch (r4) {
217 		case R4_RD:
218 			pr_cont("TLB reload.\n");
219 			break;
220 		case R4_DWR:
221 			pr_cont("store.\n");
222 			break;
223 		case R4_DRD:
224 			pr_cont("load.\n");
225 			break;
226 		default:
227 			ret = false;
228 		}
229 	} else {
230 		ret = false;
231 	}
232 
233 	return ret;
234 }
235 
f15h_mc0_mce(u16 ec,u8 xec)236 static bool f15h_mc0_mce(u16 ec, u8 xec)
237 {
238 	bool ret = true;
239 
240 	if (MEM_ERROR(ec)) {
241 
242 		switch (xec) {
243 		case 0x0:
244 			pr_cont("Data Array access error.\n");
245 			break;
246 
247 		case 0x1:
248 			pr_cont("UC error during a linefill from L2/NB.\n");
249 			break;
250 
251 		case 0x2:
252 		case 0x11:
253 			pr_cont("STQ access error.\n");
254 			break;
255 
256 		case 0x3:
257 			pr_cont("SCB access error.\n");
258 			break;
259 
260 		case 0x10:
261 			pr_cont("Tag error.\n");
262 			break;
263 
264 		case 0x12:
265 			pr_cont("LDQ access error.\n");
266 			break;
267 
268 		default:
269 			ret = false;
270 		}
271 	} else if (BUS_ERROR(ec)) {
272 
273 		if (!xec)
274 			pr_cont("System Read Data Error.\n");
275 		else
276 			pr_cont(" Internal error condition type %d.\n", xec);
277 	} else if (INT_ERROR(ec)) {
278 		if (xec <= 0x1f)
279 			pr_cont("Hardware Assert.\n");
280 		else
281 			ret = false;
282 
283 	} else
284 		ret = false;
285 
286 	return ret;
287 }
288 
decode_mc0_mce(struct mce * m)289 static void decode_mc0_mce(struct mce *m)
290 {
291 	u16 ec = EC(m->status);
292 	u8 xec = XEC(m->status, xec_mask);
293 
294 	pr_emerg(HW_ERR "MC0 Error: ");
295 
296 	/* TLB error signatures are the same across families */
297 	if (TLB_ERROR(ec)) {
298 		if (TT(ec) == TT_DATA) {
299 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
300 				((xec == 2) ? "locked miss"
301 					    : (xec ? "multimatch" : "parity")));
302 			return;
303 		}
304 	} else if (fam_ops.mc0_mce(ec, xec))
305 		;
306 	else
307 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
308 }
309 
k8_mc1_mce(u16 ec,u8 xec)310 static bool k8_mc1_mce(u16 ec, u8 xec)
311 {
312 	u8 ll	 = LL(ec);
313 	bool ret = true;
314 
315 	if (!MEM_ERROR(ec))
316 		return false;
317 
318 	if (ll == 0x2)
319 		pr_cont("during a linefill from L2.\n");
320 	else if (ll == 0x1) {
321 		switch (R4(ec)) {
322 		case R4_IRD:
323 			pr_cont("Parity error during data load.\n");
324 			break;
325 
326 		case R4_EVICT:
327 			pr_cont("Copyback Parity/Victim error.\n");
328 			break;
329 
330 		case R4_SNOOP:
331 			pr_cont("Tag Snoop error.\n");
332 			break;
333 
334 		default:
335 			ret = false;
336 			break;
337 		}
338 	} else
339 		ret = false;
340 
341 	return ret;
342 }
343 
cat_mc1_mce(u16 ec,u8 xec)344 static bool cat_mc1_mce(u16 ec, u8 xec)
345 {
346 	u8 r4    = R4(ec);
347 	bool ret = true;
348 
349 	if (!MEM_ERROR(ec))
350 		return false;
351 
352 	if (TT(ec) != TT_INSTR)
353 		return false;
354 
355 	if (r4 == R4_IRD)
356 		pr_cont("Data/tag array parity error for a tag hit.\n");
357 	else if (r4 == R4_SNOOP)
358 		pr_cont("Tag error during snoop/victimization.\n");
359 	else if (xec == 0x0)
360 		pr_cont("Tag parity error from victim castout.\n");
361 	else if (xec == 0x2)
362 		pr_cont("Microcode patch RAM parity error.\n");
363 	else
364 		ret = false;
365 
366 	return ret;
367 }
368 
f15h_mc1_mce(u16 ec,u8 xec)369 static bool f15h_mc1_mce(u16 ec, u8 xec)
370 {
371 	bool ret = true;
372 
373 	if (!MEM_ERROR(ec))
374 		return false;
375 
376 	switch (xec) {
377 	case 0x0 ... 0xa:
378 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
379 		break;
380 
381 	case 0xd:
382 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
383 		break;
384 
385 	case 0x10:
386 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
387 		break;
388 
389 	case 0x11 ... 0x15:
390 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
391 		break;
392 
393 	default:
394 		ret = false;
395 	}
396 	return ret;
397 }
398 
decode_mc1_mce(struct mce * m)399 static void decode_mc1_mce(struct mce *m)
400 {
401 	u16 ec = EC(m->status);
402 	u8 xec = XEC(m->status, xec_mask);
403 
404 	pr_emerg(HW_ERR "MC1 Error: ");
405 
406 	if (TLB_ERROR(ec))
407 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
408 			(xec ? "multimatch" : "parity error"));
409 	else if (BUS_ERROR(ec)) {
410 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
411 
412 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
413 	} else if (INT_ERROR(ec)) {
414 		if (xec <= 0x3f)
415 			pr_cont("Hardware Assert.\n");
416 		else
417 			goto wrong_mc1_mce;
418 	} else if (fam_ops.mc1_mce(ec, xec))
419 		;
420 	else
421 		goto wrong_mc1_mce;
422 
423 	return;
424 
425 wrong_mc1_mce:
426 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
427 }
428 
k8_mc2_mce(u16 ec,u8 xec)429 static bool k8_mc2_mce(u16 ec, u8 xec)
430 {
431 	bool ret = true;
432 
433 	if (xec == 0x1)
434 		pr_cont(" in the write data buffers.\n");
435 	else if (xec == 0x3)
436 		pr_cont(" in the victim data buffers.\n");
437 	else if (xec == 0x2 && MEM_ERROR(ec))
438 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
439 	else if (xec == 0x0) {
440 		if (TLB_ERROR(ec))
441 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
442 				TT_MSG(ec));
443 		else if (BUS_ERROR(ec))
444 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
445 				R4_MSG(ec), PP_MSG(ec));
446 		else if (MEM_ERROR(ec)) {
447 			u8 r4 = R4(ec);
448 
449 			if (r4 >= 0x7)
450 				pr_cont(": %s error during data copyback.\n",
451 					R4_MSG(ec));
452 			else if (r4 <= 0x1)
453 				pr_cont(": %s parity/ECC error during data "
454 					"access from L2.\n", R4_MSG(ec));
455 			else
456 				ret = false;
457 		} else
458 			ret = false;
459 	} else
460 		ret = false;
461 
462 	return ret;
463 }
464 
f15h_mc2_mce(u16 ec,u8 xec)465 static bool f15h_mc2_mce(u16 ec, u8 xec)
466 {
467 	bool ret = true;
468 
469 	if (TLB_ERROR(ec)) {
470 		if (xec == 0x0)
471 			pr_cont("Data parity TLB read error.\n");
472 		else if (xec == 0x1)
473 			pr_cont("Poison data provided for TLB fill.\n");
474 		else
475 			ret = false;
476 	} else if (BUS_ERROR(ec)) {
477 		if (xec > 2)
478 			ret = false;
479 
480 		pr_cont("Error during attempted NB data read.\n");
481 	} else if (MEM_ERROR(ec)) {
482 		switch (xec) {
483 		case 0x4 ... 0xc:
484 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
485 			break;
486 
487 		case 0x10 ... 0x14:
488 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
489 			break;
490 
491 		default:
492 			ret = false;
493 		}
494 	} else if (INT_ERROR(ec)) {
495 		if (xec <= 0x3f)
496 			pr_cont("Hardware Assert.\n");
497 		else
498 			ret = false;
499 	}
500 
501 	return ret;
502 }
503 
f16h_mc2_mce(u16 ec,u8 xec)504 static bool f16h_mc2_mce(u16 ec, u8 xec)
505 {
506 	u8 r4 = R4(ec);
507 
508 	if (!MEM_ERROR(ec))
509 		return false;
510 
511 	switch (xec) {
512 	case 0x04 ... 0x05:
513 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
514 		break;
515 
516 	case 0x09 ... 0x0b:
517 	case 0x0d ... 0x0f:
518 		pr_cont("ECC error in L2 tag (%s).\n",
519 			((r4 == R4_GEN)   ? "BankReq" :
520 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
521 		break;
522 
523 	case 0x10 ... 0x19:
524 	case 0x1b:
525 		pr_cont("ECC error in L2 data array (%s).\n",
526 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
527 			((r4 == R4_GEN)   ? "Attr" :
528 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
529 		break;
530 
531 	case 0x1c ... 0x1d:
532 	case 0x1f:
533 		pr_cont("Parity error in L2 attribute bits (%s).\n",
534 			((r4 == R4_RD)  ? "Hit"  :
535 			((r4 == R4_GEN) ? "Attr" : "Fill")));
536 		break;
537 
538 	default:
539 		return false;
540 	}
541 
542 	return true;
543 }
544 
decode_mc2_mce(struct mce * m)545 static void decode_mc2_mce(struct mce *m)
546 {
547 	u16 ec = EC(m->status);
548 	u8 xec = XEC(m->status, xec_mask);
549 
550 	pr_emerg(HW_ERR "MC2 Error: ");
551 
552 	if (!fam_ops.mc2_mce(ec, xec))
553 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
554 }
555 
decode_mc3_mce(struct mce * m)556 static void decode_mc3_mce(struct mce *m)
557 {
558 	u16 ec = EC(m->status);
559 	u8 xec = XEC(m->status, xec_mask);
560 
561 	if (boot_cpu_data.x86 >= 0x14) {
562 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
563 			 " please report on LKML.\n");
564 		return;
565 	}
566 
567 	pr_emerg(HW_ERR "MC3 Error");
568 
569 	if (xec == 0x0) {
570 		u8 r4 = R4(ec);
571 
572 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
573 			goto wrong_mc3_mce;
574 
575 		pr_cont(" during %s.\n", R4_MSG(ec));
576 	} else
577 		goto wrong_mc3_mce;
578 
579 	return;
580 
581  wrong_mc3_mce:
582 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
583 }
584 
decode_mc4_mce(struct mce * m)585 static void decode_mc4_mce(struct mce *m)
586 {
587 	unsigned int fam = x86_family(m->cpuid);
588 	int node_id = topology_amd_node_id(m->extcpu);
589 	u16 ec = EC(m->status);
590 	u8 xec = XEC(m->status, 0x1f);
591 	u8 offset = 0;
592 
593 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
594 
595 	switch (xec) {
596 	case 0x0 ... 0xe:
597 
598 		/* special handling for DRAM ECCs */
599 		if (xec == 0x0 || xec == 0x8) {
600 			/* no ECCs on F11h */
601 			if (fam == 0x11)
602 				goto wrong_mc4_mce;
603 
604 			pr_cont("%s.\n", mc4_mce_desc[xec]);
605 
606 			if (decode_dram_ecc)
607 				decode_dram_ecc(node_id, m);
608 			return;
609 		}
610 		break;
611 
612 	case 0xf:
613 		if (TLB_ERROR(ec))
614 			pr_cont("GART Table Walk data error.\n");
615 		else if (BUS_ERROR(ec))
616 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
617 		else
618 			goto wrong_mc4_mce;
619 		return;
620 
621 	case 0x19:
622 		if (fam == 0x15 || fam == 0x16)
623 			pr_cont("Compute Unit Data Error.\n");
624 		else
625 			goto wrong_mc4_mce;
626 		return;
627 
628 	case 0x1c ... 0x1f:
629 		offset = 13;
630 		break;
631 
632 	default:
633 		goto wrong_mc4_mce;
634 	}
635 
636 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
637 	return;
638 
639  wrong_mc4_mce:
640 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
641 }
642 
decode_mc5_mce(struct mce * m)643 static void decode_mc5_mce(struct mce *m)
644 {
645 	unsigned int fam = x86_family(m->cpuid);
646 	u16 ec = EC(m->status);
647 	u8 xec = XEC(m->status, xec_mask);
648 
649 	if (fam == 0xf || fam == 0x11)
650 		goto wrong_mc5_mce;
651 
652 	pr_emerg(HW_ERR "MC5 Error: ");
653 
654 	if (INT_ERROR(ec)) {
655 		if (xec <= 0x1f) {
656 			pr_cont("Hardware Assert.\n");
657 			return;
658 		} else
659 			goto wrong_mc5_mce;
660 	}
661 
662 	if (xec == 0x0 || xec == 0xc)
663 		pr_cont("%s.\n", mc5_mce_desc[xec]);
664 	else if (xec <= 0xd)
665 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
666 	else
667 		goto wrong_mc5_mce;
668 
669 	return;
670 
671  wrong_mc5_mce:
672 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
673 }
674 
decode_mc6_mce(struct mce * m)675 static void decode_mc6_mce(struct mce *m)
676 {
677 	u8 xec = XEC(m->status, xec_mask);
678 
679 	pr_emerg(HW_ERR "MC6 Error: ");
680 
681 	if (xec > 0x5)
682 		goto wrong_mc6_mce;
683 
684 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
685 	return;
686 
687  wrong_mc6_mce:
688 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
689 }
690 
691 static const char * const smca_long_names[] = {
692 	[SMCA_LS ... SMCA_LS_V2]	= "Load Store Unit",
693 	[SMCA_IF]			= "Instruction Fetch Unit",
694 	[SMCA_L2_CACHE]			= "L2 Cache",
695 	[SMCA_DE]			= "Decode Unit",
696 	[SMCA_RESERVED]			= "Reserved",
697 	[SMCA_EX]			= "Execution Unit",
698 	[SMCA_FP]			= "Floating Point Unit",
699 	[SMCA_L3_CACHE]			= "L3 Cache",
700 	[SMCA_CS ... SMCA_CS_V2]	= "Coherent Slave",
701 	[SMCA_PIE]			= "Power, Interrupts, etc.",
702 
703 	/* UMC v2 is separate because both of them can exist in a single system. */
704 	[SMCA_UMC]			= "Unified Memory Controller",
705 	[SMCA_UMC_V2]			= "Unified Memory Controller v2",
706 	[SMCA_PB]			= "Parameter Block",
707 	[SMCA_PSP ... SMCA_PSP_V2]	= "Platform Security Processor",
708 	[SMCA_SMU ... SMCA_SMU_V2]	= "System Management Unit",
709 	[SMCA_MP5]			= "Microprocessor 5 Unit",
710 	[SMCA_MPDMA]			= "MPDMA Unit",
711 	[SMCA_NBIO]			= "Northbridge IO Unit",
712 	[SMCA_PCIE ... SMCA_PCIE_V2]	= "PCI Express Unit",
713 	[SMCA_XGMI_PCS]			= "Ext Global Memory Interconnect PCS Unit",
714 	[SMCA_NBIF]			= "NBIF Unit",
715 	[SMCA_SHUB]			= "System Hub Unit",
716 	[SMCA_SATA]			= "SATA Unit",
717 	[SMCA_USB]			= "USB Unit",
718 	[SMCA_GMI_PCS]			= "Global Memory Interconnect PCS Unit",
719 	[SMCA_XGMI_PHY]			= "Ext Global Memory Interconnect PHY Unit",
720 	[SMCA_WAFL_PHY]			= "WAFL PHY Unit",
721 	[SMCA_GMI_PHY]			= "Global Memory Interconnect PHY Unit",
722 };
723 
smca_get_long_name(enum smca_bank_types t)724 static const char *smca_get_long_name(enum smca_bank_types t)
725 {
726 	if (t >= N_SMCA_BANK_TYPES)
727 		return NULL;
728 
729 	return smca_long_names[t];
730 }
731 
732 /* Decode errors according to Scalable MCA specification */
decode_smca_error(struct mce * m)733 static void decode_smca_error(struct mce *m)
734 {
735 	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
736 	u8 xec = XEC(m->status, xec_mask);
737 
738 	if (bank_type >= N_SMCA_BANK_TYPES)
739 		return;
740 
741 	if (bank_type == SMCA_RESERVED) {
742 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
743 		return;
744 	}
745 
746 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);
747 
748 	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
749 	    xec == 0 && decode_dram_ecc)
750 		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
751 }
752 
/*
 * Print the generic fields of error code @ec.
 *
 * Internal errors only get their internal error type. For everything else
 * the cache level is printed, followed by either the mem/io field (bus
 * errors) or the transaction type, the memory transaction type (memory and
 * bus errors), and the participating processor plus timeout state (bus
 * errors only).
 */
static inline void amd_decode_err_code(u16 ec)
{
	bool bus = BUS_ERROR(ec);

	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (bus)
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (bus || MEM_ERROR(ec))
		pr_cont(", mem-tx: %s", R4_MSG(ec));

	if (bus)
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));

	pr_cont("\n");
}
776 
decode_error_status(struct mce * m)777 static const char *decode_error_status(struct mce *m)
778 {
779 	if (m->status & MCI_STATUS_UC) {
780 		if (m->status & MCI_STATUS_PCC)
781 			return "System Fatal error.";
782 		if (m->mcgstatus & MCG_STATUS_RIPV)
783 			return "Uncorrected, software restartable error.";
784 		return "Uncorrected, software containable error.";
785 	}
786 
787 	if (m->status & MCI_STATUS_DEFERRED)
788 		return "Deferred error, no action required.";
789 
790 	return "Corrected error, no action required.";
791 }
792 
793 static int
amd_decode_mce(struct notifier_block * nb,unsigned long val,void * data)794 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
795 {
796 	struct mce *m = (struct mce *)data;
797 	struct mce_hw_err *err = to_mce_hw_err(m);
798 	unsigned int fam = x86_family(m->cpuid);
799 	u32 mca_config_lo = 0, dummy;
800 	int ecc;
801 
802 	if (m->kflags & MCE_HANDLED_CEC)
803 		return NOTIFY_DONE;
804 
805 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
806 
807 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
808 		m->extcpu,
809 		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
810 		m->bank,
811 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
812 		((m->status & MCI_STATUS_UC)	? "UE"	  :
813 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
814 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
815 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
816 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
817 
818 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
819 		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);
820 
821 		if (mca_config_lo & MCI_CONFIG_MCAX)
822 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
823 
824 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
825 	}
826 
827 	/* do the two bits[14:13] together */
828 	ecc = (m->status >> 45) & 0x3;
829 	if (ecc)
830 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
831 
832 	if (fam >= 0x15) {
833 		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
834 
835 		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
836 		if (fam != 0x15 || m->bank != 4)
837 			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
838 	}
839 
840 	if (fam >= 0x17)
841 		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
842 
843 	pr_cont("]: 0x%016llx\n", m->status);
844 
845 	if (m->status & MCI_STATUS_ADDRV)
846 		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
847 
848 	if (m->ppin)
849 		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);
850 
851 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
852 		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
853 
854 		if (m->status & MCI_STATUS_SYNDV) {
855 			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
856 			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
857 				char frutext[17];
858 
859 				frutext[16] = '\0';
860 				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
861 				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);
862 
863 				pr_emerg(HW_ERR "FRU Text: %s", frutext);
864 			}
865 		}
866 
867 		pr_cont("\n");
868 
869 		decode_smca_error(m);
870 		goto err_code;
871 	}
872 
873 	if (m->tsc)
874 		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
875 
876 	/* Doesn't matter which member to test. */
877 	if (!fam_ops.mc0_mce)
878 		goto err_code;
879 
880 	switch (m->bank) {
881 	case 0:
882 		decode_mc0_mce(m);
883 		break;
884 
885 	case 1:
886 		decode_mc1_mce(m);
887 		break;
888 
889 	case 2:
890 		decode_mc2_mce(m);
891 		break;
892 
893 	case 3:
894 		decode_mc3_mce(m);
895 		break;
896 
897 	case 4:
898 		decode_mc4_mce(m);
899 		break;
900 
901 	case 5:
902 		decode_mc5_mce(m);
903 		break;
904 
905 	case 6:
906 		decode_mc6_mce(m);
907 		break;
908 
909 	default:
910 		break;
911 	}
912 
913  err_code:
914 	amd_decode_err_code(m->status & 0xffff);
915 
916 	m->kflags |= MCE_HANDLED_EDAC;
917 	return NOTIFY_OK;
918 }
919 
920 static struct notifier_block amd_mce_dec_nb = {
921 	.notifier_call	= amd_decode_mce,
922 	.priority	= MCE_PRIO_EDAC,
923 };
924 
mce_amd_init(void)925 static int __init mce_amd_init(void)
926 {
927 	struct cpuinfo_x86 *c = &boot_cpu_data;
928 
929 	if (c->x86_vendor != X86_VENDOR_AMD &&
930 	    c->x86_vendor != X86_VENDOR_HYGON)
931 		return -ENODEV;
932 
933 	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
934 		return -ENODEV;
935 
936 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
937 		xec_mask = 0x3f;
938 		goto out;
939 	}
940 
941 	switch (c->x86) {
942 	case 0xf:
943 		fam_ops.mc0_mce = k8_mc0_mce;
944 		fam_ops.mc1_mce = k8_mc1_mce;
945 		fam_ops.mc2_mce = k8_mc2_mce;
946 		break;
947 
948 	case 0x10:
949 		fam_ops.mc0_mce = f10h_mc0_mce;
950 		fam_ops.mc1_mce = k8_mc1_mce;
951 		fam_ops.mc2_mce = k8_mc2_mce;
952 		break;
953 
954 	case 0x11:
955 		fam_ops.mc0_mce = k8_mc0_mce;
956 		fam_ops.mc1_mce = k8_mc1_mce;
957 		fam_ops.mc2_mce = k8_mc2_mce;
958 		break;
959 
960 	case 0x12:
961 		fam_ops.mc0_mce = f12h_mc0_mce;
962 		fam_ops.mc1_mce = k8_mc1_mce;
963 		fam_ops.mc2_mce = k8_mc2_mce;
964 		break;
965 
966 	case 0x14:
967 		fam_ops.mc0_mce = cat_mc0_mce;
968 		fam_ops.mc1_mce = cat_mc1_mce;
969 		fam_ops.mc2_mce = k8_mc2_mce;
970 		break;
971 
972 	case 0x15:
973 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
974 
975 		fam_ops.mc0_mce = f15h_mc0_mce;
976 		fam_ops.mc1_mce = f15h_mc1_mce;
977 		fam_ops.mc2_mce = f15h_mc2_mce;
978 		break;
979 
980 	case 0x16:
981 		xec_mask = 0x1f;
982 		fam_ops.mc0_mce = cat_mc0_mce;
983 		fam_ops.mc1_mce = cat_mc1_mce;
984 		fam_ops.mc2_mce = f16h_mc2_mce;
985 		break;
986 
987 	case 0x17:
988 	case 0x18:
989 		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
990 		return -EINVAL;
991 
992 	default:
993 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
994 		return -EINVAL;
995 	}
996 
997 out:
998 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
999 
1000 	mce_register_decode_chain(&amd_mce_dec_nb);
1001 
1002 	return 0;
1003 }
1004 early_initcall(mce_amd_init);
1005 
#ifdef MODULE
/* Module unload: take the decoder off the MCE decode chain again. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}
module_exit(mce_amd_exit);

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
#endif
1017