xref: /linux/drivers/edac/versalnet_edac.c (revision 1834703b8426c92211fd92a0e552fd4ae84dcb71)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * AMD Versal NET memory controller driver
4  * Copyright (C) 2025 Advanced Micro Devices, Inc.
5  */
6 
7 #include <linux/cdx/edac_cdx_pcol.h>
8 #include <linux/edac.h>
9 #include <linux/module.h>
10 #include <linux/of_device.h>
11 #include <linux/ras.h>
12 #include <linux/remoteproc.h>
13 #include <linux/rpmsg.h>
14 #include <linux/sizes.h>
15 #include <ras/ras_event.h>
16 
17 #include "edac_module.h"
18 
/* Granularity of reported error in bytes */
#define MC5_ERR_GRAIN			1
/* Size in bytes of the MCDI MC_CMD_EDAC_GET_DDR_CONFIG request buffer */
#define MC_GET_DDR_CONFIG_IN_LEN	4

/* Correctable/uncorrectable interrupt bits in the ISR register */
#define MC5_IRQ_CE_MASK			GENMASK(18, 15)
#define MC5_IRQ_UE_MASK			GENMASK(14, 11)

/* 6-bit bit-position fields packed into the ADEC registers */
#define MC5_RANK_1_MASK			GENMASK(11, 6)
#define MASK_24				GENMASK(29, 24)
#define MASK_0				GENMASK(5, 0)

#define MC5_LRANK_1_MASK		GENMASK(11, 6)
#define MC5_LRANK_2_MASK		GENMASK(17, 12)
#define MC5_BANK1_MASK			GENMASK(11, 6)
#define MC5_GRP_0_MASK			GENMASK(17, 12)
#define MC5_GRP_1_MASK			GENMASK(23, 18)

/* Number of row bits carried in the high word of the error log */
#define MC5_REGHI_ROW			7
#define MC5_EACHBIT			1
/* Error type values carried in struct ecc_status.error_type */
#define MC5_ERR_TYPE_CE			0
#define MC5_ERR_TYPE_UE			1
#define MC5_HIGH_MEM_EN			BIT(20)
#define MC5_MEM_MASK			GENMASK(19, 0)
/* Interleave block sizes used when reconstructing the physical address */
#define MC5_X16_BASE			256
#define MC5_X16_ECC			32
#define MC5_X16_SIZE			(MC5_X16_BASE + MC5_X16_ECC)
#define MC5_X32_SIZE			576
#define MC5_HIMEM_BASE			(256 * SZ_1M)
#define MC5_ILC_HIMEM_EN		BIT(28)
#define MC5_ILC_MEM			GENMASK(27, 0)
#define MC5_INTERLEAVE_SEL		GENMASK(3, 0)
/* Fields of the per-controller CONF register */
#define MC5_BUS_WIDTH_MASK		GENMASK(19, 18)
#define MC5_NUM_CHANS_MASK		BIT(17)
#define MC5_RANK_MASK			GENMASK(15, 14)

/* Dword indices into an incoming rpmsg error message */
#define ERROR_LEVEL			2
#define ERROR_ID			3
#define TOTAL_ERR_LENGTH		5
#define MSG_ERR_OFFSET			8
#define MSG_ERR_LENGTH			9
#define ERROR_DATA			10
/* First byte of an MCDI response; distinguishes it from error events */
#define MCDI_RESPONSE			0xFF

#define REG_MAX				152
#define ADEC_MAX			152
#define NUM_CONTROLLERS			8
#define REGS_PER_CONTROLLER		19
#define ADEC_NUM			19
#define BUFFER_SZ			80

/* Encodings of MC5_BUS_WIDTH_MASK in the CONF register */
#define XDDR5_BUS_WIDTH_64		0
#define XDDR5_BUS_WIDTH_32		1
#define XDDR5_BUS_WIDTH_16		2

#define MC_NAME_LEN			32
74 
75 /**
76  * struct ecc_error_info - ECC error log information.
77  * @burstpos:		Burst position.
78  * @lrank:		Logical Rank number.
79  * @rank:		Rank number.
80  * @group:		Group number.
81  * @bank:		Bank number.
82  * @col:		Column number.
83  * @row:		Row number.
84  * @rowhi:		Row number higher bits.
85  * @i:			Combined ECC error vector containing encoded values of burst position,
86  *			rank, bank, column, and row information.
87  */
88 union ecc_error_info {
89 	struct {
90 		u32 burstpos:3;
91 		u32 lrank:4;
92 		u32 rank:2;
93 		u32 group:3;
94 		u32 bank:2;
95 		u32 col:11;
96 		u32 row:7;
97 		u32 rowhi;
98 	};
99 	u64 i;
100 } __packed;
101 
/*
 * Row and column bit positions in the address decoder (ADEC) registers.
 * One 32-bit register packs five 6-bit bit-position fields; the two
 * anonymous structs are row- and column-flavoured views of the same bits.
 */
union row_col_mapping {
	struct {
		u32 row0:6;
		u32 row1:6;
		u32 row2:6;
		u32 row3:6;
		u32 row4:6;
		u32 reserved:2;
	};
	struct {
		u32 col1:6;
		u32 col2:6;
		u32 col3:6;
		u32 col4:6;
		u32 col5:6;
		u32 reservedcol:2;
	};
	u32 i;
} __packed;
122 
123 /**
124  * struct ecc_status - ECC status information to report.
125  * @ceinfo:	Correctable errors.
126  * @ueinfo:	Uncorrected errors.
127  * @channel:	Channel number.
128  * @error_type:	Error type.
129  */
130 struct ecc_status {
131 	union ecc_error_info ceinfo[2];
132 	union ecc_error_info ueinfo[2];
133 	u8 channel;
134 	u8 error_type;
135 };
136 
137 /**
138  * struct mc_priv - DDR memory controller private instance data.
139  * @message:		Buffer for framing the event specific info.
140  * @stat:		ECC status information.
141  * @error_id:		The error id.
142  * @error_level:	The error level.
143  * @dwidth:		Width of data bus excluding ECC bits.
144  * @part_len:		The support of the message received.
145  * @regs:		The registers sent on the rpmsg.
146  * @adec:		Address decode registers.
147  * @mci:		Memory controller interface.
148  * @ept:		rpmsg endpoint.
149  * @mcdi:		The mcdi handle.
150  */
151 struct mc_priv {
152 	char message[256];
153 	struct ecc_status stat;
154 	u32 error_id;
155 	u32 error_level;
156 	u32 dwidth;
157 	u32 part_len;
158 	u32 regs[REG_MAX];
159 	u32 adec[ADEC_MAX];
160 	struct mem_ctl_info *mci[NUM_CONTROLLERS];
161 	struct rpmsg_endpoint *ept;
162 	struct cdx_mcdi *mcdi;
163 };
164 
165 /*
166  * Address decoder (ADEC) registers to match the order in which the register
167  * information is received from the firmware.
168  */
169 enum adec_info {
170 	CONF = 0,
171 	ADEC0,
172 	ADEC1,
173 	ADEC2,
174 	ADEC3,
175 	ADEC4,
176 	ADEC5,
177 	ADEC6,
178 	ADEC7,
179 	ADEC8,
180 	ADEC9,
181 	ADEC10,
182 	ADEC11,
183 	ADEC12,
184 	ADEC13,
185 	ADEC14,
186 	ADEC15,
187 	ADEC16,
188 	ADECILC,
189 };
190 
191 enum reg_info {
192 	ISR = 0,
193 	IMR,
194 	ECCR0_ERR_STATUS,
195 	ECCR0_ADDR_LO,
196 	ECCR0_ADDR_HI,
197 	ECCR0_DATA_LO,
198 	ECCR0_DATA_HI,
199 	ECCR0_PAR,
200 	ECCR1_ERR_STATUS,
201 	ECCR1_ADDR_LO,
202 	ECCR1_ADDR_HI,
203 	ECCR1_DATA_LO,
204 	ECCR1_DATA_HI,
205 	ECCR1_PAR,
206 	XMPU_ERR,
207 	XMPU_ERR_ADDR_L0,
208 	XMPU_ERR_ADDR_HI,
209 	XMPU_ERR_AXI_ID,
210 	ADEC_CHK_ERR_LOG,
211 };
212 
get_ddr_info(u32 * error_data,struct mc_priv * priv)213 static bool get_ddr_info(u32 *error_data, struct mc_priv *priv)
214 {
215 	u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr;
216 	struct ecc_status *p;
217 
218 	isr = error_data[ISR];
219 
220 	if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK)))
221 		return false;
222 
223 	eccr0_val = error_data[ECCR0_ERR_STATUS];
224 	eccr1_val = error_data[ECCR1_ERR_STATUS];
225 
226 	if (!eccr0_val && !eccr1_val)
227 		return false;
228 
229 	p = &priv->stat;
230 
231 	if (!eccr0_val)
232 		p->channel = 1;
233 	else
234 		p->channel = 0;
235 
236 	reglo = error_data[ECCR0_ADDR_LO];
237 	reghi = error_data[ECCR0_ADDR_HI];
238 	if (isr & MC5_IRQ_CE_MASK)
239 		p->ceinfo[0].i = reglo | (u64)reghi << 32;
240 	else if (isr & MC5_IRQ_UE_MASK)
241 		p->ueinfo[0].i = reglo | (u64)reghi << 32;
242 
243 	parity = error_data[ECCR0_PAR];
244 	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
245 		 reghi, reglo, parity);
246 
247 	reglo = error_data[ECCR1_ADDR_LO];
248 	reghi = error_data[ECCR1_ADDR_HI];
249 	if (isr & MC5_IRQ_CE_MASK)
250 		p->ceinfo[1].i = reglo | (u64)reghi << 32;
251 	else if (isr & MC5_IRQ_UE_MASK)
252 		p->ueinfo[1].i = reglo | (u64)reghi << 32;
253 
254 	parity = error_data[ECCR1_PAR];
255 	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
256 		 reghi, reglo, parity);
257 
258 	return true;
259 }
260 
261 /**
262  * convert_to_physical - Convert @error_data to a physical address.
263  * @priv:	DDR memory controller private instance data.
264  * @pinf:	ECC error info structure.
265  * @controller:	Controller number of the MC5
266  * @error_data:	the DDRMC5 ADEC address decoder register data
267  *
268  * Return: physical address of the DDR memory.
269  */
convert_to_physical(struct mc_priv * priv,union ecc_error_info pinf,int controller,int * error_data)270 static unsigned long convert_to_physical(struct mc_priv *priv,
271 					 union ecc_error_info pinf,
272 					 int controller, int *error_data)
273 {
274 	u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset;
275 	u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base;
276 	unsigned long err_addr = 0, addr;
277 	union row_col_mapping cols;
278 	union row_col_mapping rows;
279 	u32 col_bit_0;
280 
281 	row = pinf.rowhi << MC5_REGHI_ROW | pinf.row;
282 	offset = controller * ADEC_NUM;
283 
284 	reg = error_data[ADEC6];
285 	rows.i = reg;
286 	err_addr |= (row & BIT(0)) << rows.row0;
287 	row >>= MC5_EACHBIT;
288 	err_addr |= (row & BIT(0)) << rows.row1;
289 	row >>= MC5_EACHBIT;
290 	err_addr |= (row & BIT(0)) << rows.row2;
291 	row >>= MC5_EACHBIT;
292 	err_addr |= (row & BIT(0)) << rows.row3;
293 	row >>= MC5_EACHBIT;
294 	err_addr |= (row & BIT(0)) << rows.row4;
295 	row >>= MC5_EACHBIT;
296 
297 	reg = error_data[ADEC7];
298 	rows.i = reg;
299 	err_addr |= (row & BIT(0)) << rows.row0;
300 	row >>= MC5_EACHBIT;
301 	err_addr |= (row & BIT(0)) << rows.row1;
302 	row >>= MC5_EACHBIT;
303 	err_addr |= (row & BIT(0)) << rows.row2;
304 	row >>= MC5_EACHBIT;
305 	err_addr |= (row & BIT(0)) << rows.row3;
306 	row >>= MC5_EACHBIT;
307 	err_addr |= (row & BIT(0)) << rows.row4;
308 	row >>= MC5_EACHBIT;
309 
310 	reg = error_data[ADEC8];
311 	rows.i = reg;
312 	err_addr |= (row & BIT(0)) << rows.row0;
313 	row >>= MC5_EACHBIT;
314 	err_addr |= (row & BIT(0)) << rows.row1;
315 	row >>= MC5_EACHBIT;
316 	err_addr |= (row & BIT(0)) << rows.row2;
317 	row >>= MC5_EACHBIT;
318 	err_addr |= (row & BIT(0)) << rows.row3;
319 	row >>= MC5_EACHBIT;
320 	err_addr |= (row & BIT(0)) << rows.row4;
321 
322 	reg = error_data[ADEC9];
323 	rows.i = reg;
324 
325 	err_addr |= (row & BIT(0)) << rows.row0;
326 	row >>= MC5_EACHBIT;
327 	err_addr |= (row & BIT(0)) << rows.row1;
328 	row >>= MC5_EACHBIT;
329 	err_addr |= (row & BIT(0)) << rows.row2;
330 	row >>= MC5_EACHBIT;
331 
332 	col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]);
333 	pinf.col >>= 1;
334 	err_addr |= (pinf.col & 1) << col_bit_0;
335 
336 	cols.i = error_data[ADEC10];
337 	err_addr |= (pinf.col & 1) << cols.col1;
338 	pinf.col >>= 1;
339 	err_addr |= (pinf.col & 1) << cols.col2;
340 	pinf.col >>= 1;
341 	err_addr |= (pinf.col & 1) << cols.col3;
342 	pinf.col >>= 1;
343 	err_addr |= (pinf.col & 1) << cols.col4;
344 	pinf.col >>= 1;
345 	err_addr |= (pinf.col & 1) << cols.col5;
346 	pinf.col >>= 1;
347 
348 	cols.i = error_data[ADEC11];
349 	err_addr |= (pinf.col & 1) << cols.col1;
350 	pinf.col >>= 1;
351 	err_addr |= (pinf.col & 1) << cols.col2;
352 	pinf.col >>= 1;
353 	err_addr |= (pinf.col & 1) << cols.col3;
354 	pinf.col >>= 1;
355 	err_addr |= (pinf.col & 1) << cols.col4;
356 	pinf.col >>= 1;
357 	err_addr |= (pinf.col & 1) << cols.col5;
358 	pinf.col >>= 1;
359 
360 	reg = error_data[ADEC12];
361 	err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0);
362 	pinf.bank >>= MC5_EACHBIT;
363 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg);
364 	pinf.bank >>= MC5_EACHBIT;
365 
366 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg);
367 	pinf.group >>= MC5_EACHBIT;
368 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg);
369 	pinf.group >>= MC5_EACHBIT;
370 	err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg);
371 	pinf.group >>= MC5_EACHBIT;
372 
373 	reg = error_data[ADEC4];
374 	err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0);
375 	pinf.rank >>= MC5_EACHBIT;
376 	err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg);
377 	pinf.rank >>= MC5_EACHBIT;
378 
379 	reg = error_data[ADEC5];
380 	err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0);
381 	pinf.lrank >>= MC5_EACHBIT;
382 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg);
383 	pinf.lrank >>= MC5_EACHBIT;
384 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg);
385 	pinf.lrank >>= MC5_EACHBIT;
386 	err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg);
387 	pinf.lrank >>= MC5_EACHBIT;
388 
389 	high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE;
390 	interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL;
391 
392 	high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK;
393 	low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK;
394 	reg = priv->adec[ADEC14 + offset];
395 	ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN);
396 	ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M;
397 	if (ilc_himem_en)
398 		ilc_base_ctrl_add = ilcmem_base - high_mem_offset;
399 	else
400 		ilc_base_ctrl_add = ilcmem_base - low_mem_offset;
401 
402 	if (priv->dwidth == DEV_X16) {
403 		blk = err_addr / MC5_X16_SIZE;
404 		rsh_req_addr = (blk << 8) + ilc_base_ctrl_add;
405 		err_addr = rsh_req_addr * interleave * 2;
406 	} else {
407 		blk = err_addr / MC5_X32_SIZE;
408 		rsh_req_addr = (blk << 9) + ilc_base_ctrl_add;
409 		err_addr = rsh_req_addr * interleave * 2;
410 	}
411 
412 	if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base)
413 		addr = err_addr - high_mem_offset;
414 	else
415 		addr = err_addr - low_mem_offset;
416 
417 	return addr;
418 }
419 
420 /**
421  * handle_error - Handle errors.
422  * @priv:	DDR memory controller private instance data.
423  * @stat:	ECC status structure.
424  * @ctl_num:	Controller number of the MC5
425  * @error_data:	the MC5 ADEC address decoder register data
426  *
427  * Handles ECC correctable and uncorrectable errors.
428  */
handle_error(struct mc_priv * priv,struct ecc_status * stat,int ctl_num,int * error_data)429 static void handle_error(struct mc_priv  *priv, struct ecc_status *stat,
430 			 int ctl_num, int *error_data)
431 {
432 	union ecc_error_info pinf;
433 	struct mem_ctl_info *mci;
434 	unsigned long pa;
435 	phys_addr_t pfn;
436 	int err;
437 
438 	if (WARN_ON_ONCE(ctl_num >= NUM_CONTROLLERS))
439 		return;
440 
441 	mci = priv->mci[ctl_num];
442 
443 	if (stat->error_type == MC5_ERR_TYPE_CE) {
444 		pinf = stat->ceinfo[stat->channel];
445 		snprintf(priv->message, sizeof(priv->message),
446 			 "Error type:%s Controller %d Addr at %lx\n",
447 			 "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
448 
449 		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
450 				     1, 0, 0, 0, 0, 0, -1,
451 				     priv->message, "");
452 	}
453 
454 	if (stat->error_type == MC5_ERR_TYPE_UE) {
455 		pinf = stat->ueinfo[stat->channel];
456 		snprintf(priv->message, sizeof(priv->message),
457 			 "Error type:%s controller %d Addr at %lx\n",
458 			 "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
459 
460 		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
461 				     1, 0, 0, 0, 0, 0, -1,
462 				     priv->message, "");
463 		pa = convert_to_physical(priv, pinf, ctl_num, error_data);
464 		pfn = PHYS_PFN(pa);
465 
466 		if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
467 			err = memory_failure(pfn, MF_ACTION_REQUIRED);
468 			if (err)
469 				edac_dbg(2, "memory_failure() error: %d", err);
470 			else
471 				edac_dbg(2, "Poison page at PA 0x%lx\n", pa);
472 		}
473 	}
474 }
475 
mc_init(struct mem_ctl_info * mci,struct device * dev)476 static void mc_init(struct mem_ctl_info *mci, struct device *dev)
477 {
478 	struct mc_priv *priv = mci->pvt_info;
479 	struct csrow_info *csi;
480 	struct dimm_info *dimm;
481 	u32 row;
482 	int ch;
483 
484 	/* Initialize controller capabilities and configuration */
485 	mci->mtype_cap = MEM_FLAG_DDR5;
486 	mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
487 	mci->scrub_cap = SCRUB_HW_SRC;
488 	mci->scrub_mode = SCRUB_NONE;
489 
490 	mci->edac_cap = EDAC_FLAG_SECDED;
491 	mci->ctl_name = "VersalNET DDR5";
492 	mci->dev_name = dev_name(dev);
493 	mci->mod_name = "versalnet_edac";
494 
495 	edac_op_state = EDAC_OPSTATE_INT;
496 
497 	for (row = 0; row < mci->nr_csrows; row++) {
498 		csi = mci->csrows[row];
499 		for (ch = 0; ch < csi->nr_channels; ch++) {
500 			dimm = csi->channels[ch]->dimm;
501 			dimm->edac_mode = EDAC_SECDED;
502 			dimm->mtype = MEM_DDR5;
503 			dimm->grain = MC5_ERR_GRAIN;
504 			dimm->dtype = priv->dwidth;
505 		}
506 	}
507 }
508 
509 #define to_mci(k) container_of(k, struct mem_ctl_info, dev)
510 
/* All MCDI commands use the same fixed timeout. */
static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
{
	return MCDI_RPC_TIMEOUT;
}
515 
/*
 * mcdi_request - Send one MCDI command (header + payload) over rpmsg.
 * @cdx:	MCDI instance holding the rpmsg endpoint.
 * @hdr:	command header.
 * @hdr_len:	header length in bytes.
 * @sdu:	command payload.
 * @sdu_len:	payload length in bytes.
 *
 * Allocation failure is silently dropped: the op returns void and the MCDI
 * core recovers via its command timeout.
 */
static void mcdi_request(struct cdx_mcdi *cdx,
			 const struct cdx_dword *hdr, size_t hdr_len,
			 const struct cdx_dword *sdu, size_t sdu_len)
{
	void *send_buf;
	int ret;

	/* The buffer is fully overwritten below, so kmalloc() suffices. */
	send_buf = kmalloc(hdr_len + sdu_len, GFP_KERNEL);
	if (!send_buf)
		return;

	memcpy(send_buf, hdr, hdr_len);
	memcpy(send_buf + hdr_len, sdu, sdu_len);

	ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len);
	if (ret)
		dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret);

	kfree(send_buf);
}
536 
/* MCDI transport ops: route firmware commands through our rpmsg endpoint. */
static const struct cdx_mcdi_ops mcdi_ops = {
	.mcdi_rpc_timeout = mcdi_rpc_timeout,
	.mcdi_request = mcdi_request,
};
541 
/*
 * get_ddr_config - Fetch the ADEC register block for one controller.
 * @index:	controller index passed to the firmware.
 * @buffer:	destination for ADEC_NUM dwords of configuration.
 * @amd_mcdi:	MCDI handle used for the RPC.
 *
 * On failure (or a short response) @buffer is left untouched; the caller
 * allocated it zeroed.
 */
static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi)
{
	size_t outlen;
	int ret;

	MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN);
	MCDI_DECLARE_BUF(outbuf, BUFFER_SZ);

	MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index);

	ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf),
			   outbuf, sizeof(outbuf), &outlen);
	/*
	 * Only consume the response when it carries at least the ADEC_NUM
	 * dwords we are about to copy; a short response must not be read
	 * past its valid payload.
	 */
	if (!ret && outlen >= (ADEC_NUM * 4))
		memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG),
		       (ADEC_NUM * 4));
}
558 
setup_mcdi(struct mc_priv * mc_priv)559 static int setup_mcdi(struct mc_priv *mc_priv)
560 {
561 	struct cdx_mcdi *amd_mcdi;
562 	int ret, i;
563 
564 	amd_mcdi = kzalloc_obj(*amd_mcdi);
565 	if (!amd_mcdi)
566 		return -ENOMEM;
567 
568 	amd_mcdi->mcdi_ops = &mcdi_ops;
569 	ret = cdx_mcdi_init(amd_mcdi);
570 	if (ret) {
571 		kfree(amd_mcdi);
572 		return ret;
573 	}
574 
575 	amd_mcdi->ept = mc_priv->ept;
576 	mc_priv->mcdi = amd_mcdi;
577 
578 	for (i = 0; i < NUM_CONTROLLERS; i++)
579 		get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi);
580 
581 	return 0;
582 }
583 
/* Vendor GUID passed to log_non_standard_event() for non-DDR error records. */
static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2,
						 0xb8, 0xb4, 0x45, 0x56, 0x2e,
						 0x8c, 0x5b, 0xec);
587 
/*
 * rpmsg_cb - Receive MCDI responses and error events from the remote CPU.
 * @rpdev:	rpmsg device; drvdata holds our struct mc_priv.
 * @data:	received message.
 * @len:	message length in bytes.
 * @priv:	unused endpoint private data.
 * @src:	unused source address.
 *
 * Error messages can arrive split in two stretches; register data is
 * accumulated in mc_priv->regs[] until TOTAL_ERR_LENGTH dwords are present,
 * then the error is decoded (DDR errors) or logged (everything else).
 */
static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
		    int len, void *priv, u32 src)
{
	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
	const guid_t *sec_type = &guid_null;
	u32 length, offset, error_id;
	u32 *result = (u32 *)data;
	struct ecc_status *p;
	int i, sec_sev;
	const char *err_str;
	u32 *adec_data;

	if (*(u8 *)data == MCDI_RESPONSE) {
		cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
		return 0;
	}

	/* The fixed header (dwords up to ERROR_DATA) must be present. */
	if (len < ERROR_DATA * 4)
		return -EINVAL;

	sec_sev = result[ERROR_LEVEL];
	error_id = result[ERROR_ID];
	length = result[MSG_ERR_LENGTH];
	offset = result[MSG_ERR_OFFSET];

	/*
	 * offset/length originate from the remote processor. Validate them
	 * before indexing the fixed-size regs[] array and the received
	 * buffer, so a malformed message cannot cause an out-of-bounds
	 * access.
	 */
	if (length > REG_MAX || offset > REG_MAX - length)
		return -EINVAL;
	if ((u32)len / 4 < ERROR_DATA + length)
		return -EINVAL;

	/*
	 * The data can come in two stretches. Construct the regs from two
	 * messages. The offset indicates the offset from which the data is to
	 * be taken.
	 */
	for (i = 0; i < length; i++)
		mc_priv->regs[offset + i] = result[ERROR_DATA + i];

	if (result[TOTAL_ERR_LENGTH] > length) {
		/* Partial message: wait until all stretches have arrived. */
		if (!mc_priv->part_len)
			mc_priv->part_len = length;
		else
			mc_priv->part_len += length;

		if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
			return 0;
		mc_priv->part_len = 0;
	}

	mc_priv->error_id = error_id;
	mc_priv->error_level = result[ERROR_LEVEL];

	switch (error_id) {
	case 5:		err_str = "General Software Non-Correctable error"; break;
	case 6:		err_str = "CFU error"; break;
	case 7:		err_str = "CFRAME error"; break;
	case 10:	err_str = "DDRMC Microblaze Correctable ECC error"; break;
	case 11:	err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
	case 15:	err_str = "MMCM error"; break;
	case 16:	err_str = "HNICX Correctable error"; break;
	case 17:	err_str = "HNICX Non-Correctable error"; break;

	/* DDR CE/UE events are decoded per controller, not logged as text. */
	case 18:
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_CE;
		for (i = 0; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;
	case 19:
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_UE;
		for (i = 0; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;

	case 21:	err_str = "GT Non-Correctable error"; break;
	case 22:	err_str = "PL Sysmon Correctable error"; break;
	case 23:	err_str = "PL Sysmon Non-Correctable error"; break;
	case 111:	err_str = "LPX unexpected dfx activation error"; break;
	case 114:	err_str = "INT_LPD Non-Correctable error"; break;
	case 116:	err_str = "INT_OCM Non-Correctable error"; break;
	case 117:	err_str = "INT_FPD Correctable error"; break;
	case 118:	err_str = "INT_FPD Non-Correctable error"; break;
	case 120:	err_str = "INT_IOU Non-Correctable error"; break;
	case 123:	err_str = "err_int_irq from APU GIC Distributor"; break;
	case 124:	err_str = "fault_int_irq from APU GIC Distribute"; break;
	case 132 ... 139: err_str = "FPX SPLITTER error"; break;
	case 140:	err_str = "APU Cluster 0 error"; break;
	case 141:	err_str = "APU Cluster 1 error"; break;
	case 142:	err_str = "APU Cluster 2 error"; break;
	case 143:	err_str = "APU Cluster 3 error"; break;
	case 145:	err_str = "WWDT1 LPX error"; break;
	case 147:	err_str = "IPI error"; break;
	case 152 ... 153: err_str = "AFIFS error"; break;
	case 154 ... 155: err_str = "LPX glitch error"; break;
	case 185 ... 186: err_str = "FPX AFIFS error"; break;
	case 195 ... 199: err_str = "AFIFM error"; break;
	case 108:	err_str = "PSM Correctable error"; break;
	case 59:	err_str = "PMC correctable error"; break;
	case 60:	err_str = "PMC Un correctable error"; break;
	case 43 ... 47:	err_str = "PMC Sysmon error"; break;
	case 163 ... 184: err_str = "RPU error"; break;
	case 148:	err_str = "OCM0 correctable error"; break;
	case 149:	err_str = "OCM1 correctable error"; break;
	case 150:	err_str = "OCM0 Un-correctable error"; break;
	case 151:	err_str = "OCM1 Un-correctable error"; break;
	case 189:	err_str = "PSX_CMN_3 PD block consolidated error"; break;
	case 191:	err_str = "FPD_INT_WRAP PD block consolidated error"; break;
	case 232:	err_str = "CRAM Un-Correctable error"; break;
	default:	err_str = "VERSAL_EDAC_ERR_ID: %d"; break;
	}

	snprintf(mc_priv->message,
		 sizeof(mc_priv->message),
		 "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);

	/* Convert to bytes, clamped to the size of the staging buffer. */
	length = result[TOTAL_ERR_LENGTH] * 4;
	if (length > sizeof(mc_priv->regs))
		length = sizeof(mc_priv->regs);
	log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
			       sec_sev, (void *)&mc_priv->regs, length);

	return 0;
}
716 
/*
 * rpmsg channel match table. mc_probe() stashes the driver instance pointer
 * in driver_data so rpmsg_probe() can retrieve it.
 */
static struct rpmsg_device_id amd_rpmsg_id_table[] = {
	{ .name = "error_ipc" },
	{ },
};
MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
722 
rpmsg_probe(struct rpmsg_device * rpdev)723 static int rpmsg_probe(struct rpmsg_device *rpdev)
724 {
725 	struct rpmsg_channel_info chinfo;
726 	struct mc_priv *pg;
727 
728 	pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
729 	chinfo.src = RPMSG_ADDR_ANY;
730 	chinfo.dst = rpdev->dst;
731 	strscpy(chinfo.name, amd_rpmsg_id_table[0].name,
732 		strlen(amd_rpmsg_id_table[0].name));
733 
734 	pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
735 	if (!pg->ept)
736 		return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
737 				     chinfo.name);
738 
739 	dev_set_drvdata(&rpdev->dev, pg);
740 
741 	return 0;
742 }
743 
/* Tear down the endpoint created in rpmsg_probe(). */
static void rpmsg_remove(struct rpmsg_device *rpdev)
{
	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);

	rpmsg_destroy_ept(mc_priv->ept);
	dev_set_drvdata(&rpdev->dev, NULL);
}
751 
/* rpmsg driver registered from mc_probe(); rpmsg_cb handles all traffic. */
static struct rpmsg_driver amd_rpmsg_driver = {
	.drv.name = KBUILD_MODNAME,
	.probe = rpmsg_probe,
	.remove = rpmsg_remove,
	.callback = rpmsg_cb,
	.id_table = amd_rpmsg_id_table,
};
759 
/* Release callback for the devices kzalloc'ed in init_one_mc(). */
static void versal_edac_release(struct device *dev)
{
	kfree(dev);
}
764 
remove_one_mc(struct mc_priv * priv,int i)765 static void remove_one_mc(struct mc_priv *priv, int i)
766 {
767 	struct mem_ctl_info *mci;
768 
769 	mci = priv->mci[i];
770 	device_unregister(mci->pdev);
771 	edac_mc_del_mc(mci->pdev);
772 	edac_mc_free(mci);
773 }
774 
init_one_mc(struct mc_priv * priv,struct platform_device * pdev,int i)775 static int init_one_mc(struct mc_priv *priv, struct platform_device *pdev, int i)
776 {
777 	u32 num_chans, rank, dwidth, config;
778 	struct edac_mc_layer layers[2];
779 	struct mem_ctl_info *mci;
780 	struct device *dev;
781 	enum dev_type dt;
782 	char *name;
783 	int rc;
784 
785 	config = priv->adec[CONF + i * ADEC_NUM];
786 	num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
787 	rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
788 	dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
789 
790 	switch (dwidth) {
791 	case XDDR5_BUS_WIDTH_16:
792 		dt = DEV_X16;
793 		break;
794 	case XDDR5_BUS_WIDTH_32:
795 		dt = DEV_X32;
796 		break;
797 	case XDDR5_BUS_WIDTH_64:
798 		dt = DEV_X64;
799 		break;
800 	default:
801 		dt = DEV_UNKNOWN;
802 	}
803 
804 	if (dt == DEV_UNKNOWN)
805 		return 0;
806 
807 	/* Find the first enabled device and register that one. */
808 	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
809 	layers[0].size = rank;
810 	layers[0].is_virt_csrow = true;
811 	layers[1].type = EDAC_MC_LAYER_CHANNEL;
812 	layers[1].size = num_chans;
813 	layers[1].is_virt_csrow = false;
814 
815 	rc = -ENOMEM;
816 	name = kzalloc(MC_NAME_LEN, GFP_KERNEL);
817 	if (!name)
818 		return rc;
819 
820 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
821 	if (!dev)
822 		goto err_name_free;
823 
824 	mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, sizeof(struct mc_priv));
825 	if (!mci) {
826 		edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
827 		goto err_dev_free;
828 	}
829 
830 	sprintf(name, "versal-net-ddrmc5-edac-%d", i);
831 
832 	dev->init_name = name;
833 	dev->release = versal_edac_release;
834 
835 	rc = device_register(dev);
836 	if (rc)
837 		goto err_mc_free;
838 
839 	mci->pdev = dev;
840 	mc_init(mci, dev);
841 
842 	rc = edac_mc_add_mc(mci);
843 	if (rc) {
844 		edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
845 		goto err_unreg;
846 	}
847 
848 	priv->mci[i] = mci;
849 	priv->dwidth = dt;
850 
851 	platform_set_drvdata(pdev, priv);
852 
853 	return 0;
854 
855 err_unreg:
856 	device_unregister(mci->pdev);
857 err_mc_free:
858 	edac_mc_free(mci);
859 err_dev_free:
860 	kfree(dev);
861 err_name_free:
862 	kfree(name);
863 
864 	return rc;
865 }
866 
init_versalnet(struct mc_priv * priv,struct platform_device * pdev)867 static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
868 {
869 	int rc, i;
870 
871 	for (i = 0; i < NUM_CONTROLLERS; i++) {
872 		rc = init_one_mc(priv, pdev, i);
873 		if (rc) {
874 			while (i--)
875 				remove_one_mc(priv, i);
876 
877 			return rc;
878 		}
879 	}
880 	return 0;
881 }
882 
remove_versalnet(struct mc_priv * priv)883 static void remove_versalnet(struct mc_priv *priv)
884 {
885 	for (int i = 0; i < NUM_CONTROLLERS; i++)
886 		remove_one_mc(priv, i);
887 }
888 
/*
 * mc_probe - Platform probe: attach the R5 remote processor, register the
 * rpmsg driver, create the MCDI handle and bring up all controllers.
 * @pdev:	platform device.
 *
 * Returns 0 on success or a negative errno (-EPROBE_DEFER while the remote
 * processor is not yet available).
 */
static int mc_probe(struct platform_device *pdev)
{
	struct mc_priv *priv;
	struct rproc *rp;
	int rc;

	struct device_node *r5_core_node __free(device_node) =
		of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
	if (!r5_core_node) {
		dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
		return -EINVAL;
	}

	rp = rproc_get_by_phandle(r5_core_node->phandle);
	if (!rp)
		return -EPROBE_DEFER;

	rc = rproc_boot(rp);
	if (rc) {
		dev_err(&pdev->dev, "Failed to attach to remote processor\n");
		goto err_rproc_boot;
	}

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv) {
		rc = -ENOMEM;
		goto err_alloc;
	}

	/* Hand the instance to rpmsg_probe() via the id table. */
	amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;

	rc = register_rpmsg_driver(&amd_rpmsg_driver);
	if (rc) {
		edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
		goto err_alloc;
	}

	rc = setup_mcdi(priv);
	if (rc)
		goto err_unreg;

	priv->mcdi->r5_rproc = rp;

	rc = init_versalnet(priv, pdev);
	if (rc)
		goto err_init;

	return 0;

	/* Error unwinding mirrors the setup order above. */
err_init:
	cdx_mcdi_finish(priv->mcdi);
	kfree(priv->mcdi);

err_unreg:
	unregister_rpmsg_driver(&amd_rpmsg_driver);

err_alloc:
	rproc_shutdown(rp);

err_rproc_boot:
	rproc_put(rp);

	return rc;
}
953 
/* Platform remove: reverse of mc_probe()'s successful setup. */
static void mc_remove(struct platform_device *pdev)
{
	struct mc_priv *priv = platform_get_drvdata(pdev);

	unregister_rpmsg_driver(&amd_rpmsg_driver);
	remove_versalnet(priv);
	rproc_shutdown(priv->mcdi->r5_rproc);
	cdx_mcdi_finish(priv->mcdi);
	kfree(priv->mcdi);
}
964 
/* Device-tree match table. */
static const struct of_device_id amd_edac_match[] = {
	{ .compatible = "xlnx,versal-net-ddrmc5", },
	{}
};
MODULE_DEVICE_TABLE(of, amd_edac_match);

static struct platform_driver amd_ddr_edac_mc_driver = {
	.driver = {
		.name = "versal-net-edac",
		.of_match_table = amd_edac_match,
	},
	.probe = mc_probe,
	.remove = mc_remove,
};

module_platform_driver(amd_ddr_edac_mc_driver);

MODULE_AUTHOR("AMD Inc");
MODULE_DESCRIPTION("Versal NET EDAC driver");
MODULE_LICENSE("GPL");
985