1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * AMD Versal NET memory controller driver
4 * Copyright (C) 2025 Advanced Micro Devices, Inc.
5 */
6
7 #include <linux/cdx/edac_cdx_pcol.h>
8 #include <linux/edac.h>
9 #include <linux/module.h>
10 #include <linux/of_device.h>
11 #include <linux/ras.h>
12 #include <linux/remoteproc.h>
13 #include <linux/rpmsg.h>
14 #include <linux/sizes.h>
15 #include <ras/ras_event.h>
16
17 #include "edac_module.h"
18
19 /* Granularity of reported error in bytes */
20 #define MC5_ERR_GRAIN 1
21 #define MC_GET_DDR_CONFIG_IN_LEN 4
22
23 #define MC5_IRQ_CE_MASK GENMASK(18, 15)
24 #define MC5_IRQ_UE_MASK GENMASK(14, 11)
25
26 #define MC5_RANK_1_MASK GENMASK(11, 6)
27 #define MASK_24 GENMASK(29, 24)
28 #define MASK_0 GENMASK(5, 0)
29
30 #define MC5_LRANK_1_MASK GENMASK(11, 6)
31 #define MC5_LRANK_2_MASK GENMASK(17, 12)
32 #define MC5_BANK1_MASK GENMASK(11, 6)
33 #define MC5_GRP_0_MASK GENMASK(17, 12)
34 #define MC5_GRP_1_MASK GENMASK(23, 18)
35
36 #define MC5_REGHI_ROW 7
37 #define MC5_EACHBIT 1
38 #define MC5_ERR_TYPE_CE 0
39 #define MC5_ERR_TYPE_UE 1
40 #define MC5_HIGH_MEM_EN BIT(20)
41 #define MC5_MEM_MASK GENMASK(19, 0)
42 #define MC5_X16_BASE 256
43 #define MC5_X16_ECC 32
44 #define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC)
45 #define MC5_X32_SIZE 576
46 #define MC5_HIMEM_BASE (256 * SZ_1M)
47 #define MC5_ILC_HIMEM_EN BIT(28)
48 #define MC5_ILC_MEM GENMASK(27, 0)
49 #define MC5_INTERLEAVE_SEL GENMASK(3, 0)
50 #define MC5_BUS_WIDTH_MASK GENMASK(19, 18)
51 #define MC5_NUM_CHANS_MASK BIT(17)
52 #define MC5_RANK_MASK GENMASK(15, 14)
53
54 #define ERROR_LEVEL 2
55 #define ERROR_ID 3
56 #define TOTAL_ERR_LENGTH 5
57 #define MSG_ERR_OFFSET 8
58 #define MSG_ERR_LENGTH 9
59 #define ERROR_DATA 10
60 #define MCDI_RESPONSE 0xFF
61
62 #define REG_MAX 152
63 #define ADEC_MAX 152
64 #define NUM_CONTROLLERS 8
65 #define REGS_PER_CONTROLLER 19
66 #define ADEC_NUM 19
67 #define BUFFER_SZ 80
68
69 #define XDDR5_BUS_WIDTH_64 0
70 #define XDDR5_BUS_WIDTH_32 1
71 #define XDDR5_BUS_WIDTH_16 2
72
73 #define MC_NAME_LEN 32
74
/**
 * union ecc_error_info - ECC error log information.
 * @burstpos: Burst position.
 * @lrank: Logical Rank number.
 * @rank: Rank number.
 * @group: Group number.
 * @bank: Bank number.
 * @col: Column number.
 * @row: Row number (low bits; the upper bits are in @rowhi).
 * @rowhi: Row number higher bits.
 * @i: Combined ECC error vector containing encoded values of burst position,
 * rank, bank, column, and row information.
 *
 * The bitfield view decodes the 64-bit value assembled from the
 * ECCRx_ADDR_LO/ECCRx_ADDR_HI register pair.
 */
union ecc_error_info {
	struct {
		u32 burstpos:3;
		u32 lrank:4;
		u32 rank:2;
		u32 group:3;
		u32 bank:2;
		u32 col:11;
		u32 row:7;
		u32 rowhi;
	};
	u64 i;	/* raw view: ADDR_LO | (u64)ADDR_HI << 32 */
} __packed;
101
/*
 * Row and column bit positions in the address decoder (ADEC) registers.
 * Each 6-bit field holds the destination bit position for one row (or
 * column) bit of the error log; one ADEC register describes either rows
 * or columns, so the two structs share the same 32-bit storage.
 */
union row_col_mapping {
	struct {
		u32 row0:6;
		u32 row1:6;
		u32 row2:6;
		u32 row3:6;
		u32 row4:6;
		u32 reserved:2;
	};
	struct {
		u32 col1:6;
		u32 col2:6;
		u32 col3:6;
		u32 col4:6;
		u32 col5:6;
		u32 reservedcol:2;
	};
	u32 i;	/* raw register value */
} __packed;
122
/**
 * struct ecc_status - ECC status information to report.
 * @ceinfo: Correctable errors, one entry per channel.
 * @ueinfo: Uncorrected errors, one entry per channel.
 * @channel: Channel number that latched the error.
 * @error_type: Error type (MC5_ERR_TYPE_CE or MC5_ERR_TYPE_UE).
 */
struct ecc_status {
	union ecc_error_info ceinfo[2];
	union ecc_error_info ueinfo[2];
	u8 channel;
	u8 error_type;
};
136
/**
 * struct mc_priv - DDR memory controller private instance data.
 * @message: Buffer for framing the event specific info.
 * @stat: ECC status information.
 * @error_id: The error id.
 * @error_level: The error level.
 * @dwidth: Width of data bus excluding ECC bits.
 * @part_len: Number of message words received so far when an error record
 *            arrives split across multiple rpmsg messages.
 * @regs: The registers sent on the rpmsg.
 * @adec: Address decode registers.
 * @mci: Memory controller interface, one per DDR controller.
 * @ept: rpmsg endpoint.
 * @mcdi: The mcdi handle.
 */
struct mc_priv {
	char message[256];
	struct ecc_status stat;
	u32 error_id;
	u32 error_level;
	u32 dwidth;
	u32 part_len;
	u32 regs[REG_MAX];
	u32 adec[ADEC_MAX];
	struct mem_ctl_info *mci[NUM_CONTROLLERS];
	struct rpmsg_endpoint *ept;
	struct cdx_mcdi *mcdi;
};
164
/*
 * Address decoder (ADEC) registers to match the order in which the register
 * information is received from the firmware. CONF carries the bus width,
 * channel and rank configuration; ADEC0..ADEC16 carry region offsets and
 * row/column/bank/rank bit mappings; ADECILC carries interleave control.
 */
enum adec_info {
	CONF = 0,
	ADEC0,
	ADEC1,
	ADEC2,
	ADEC3,
	ADEC4,
	ADEC5,
	ADEC6,
	ADEC7,
	ADEC8,
	ADEC9,
	ADEC10,
	ADEC11,
	ADEC12,
	ADEC13,
	ADEC14,
	ADEC15,
	ADEC16,
	ADECILC,
};
190
/* Per-controller error log registers, in the order the firmware sends them. */
enum reg_info {
	ISR = 0,		/* Interrupt status (CE/UE bits) */
	IMR,			/* Interrupt mask */
	ECCR0_ERR_STATUS,	/* Channel 0 error status */
	ECCR0_ADDR_LO,
	ECCR0_ADDR_HI,
	ECCR0_DATA_LO,
	ECCR0_DATA_HI,
	ECCR0_PAR,
	ECCR1_ERR_STATUS,	/* Channel 1 error status */
	ECCR1_ADDR_LO,
	ECCR1_ADDR_HI,
	ECCR1_DATA_LO,
	ECCR1_DATA_HI,
	ECCR1_PAR,
	XMPU_ERR,
	XMPU_ERR_ADDR_L0,
	XMPU_ERR_ADDR_HI,
	XMPU_ERR_AXI_ID,
	ADEC_CHK_ERR_LOG,
};
212
/**
 * get_ddr_info - Parse one controller's error log registers.
 * @error_data: Register snapshot for one controller, indexed by enum reg_info.
 * @priv: DDR memory controller private instance data.
 *
 * Check the interrupt status for a latched CE/UE and, if present, capture
 * the per-channel error address words into priv->stat.
 *
 * Return: true if this controller logged a CE or UE, false otherwise.
 */
static bool get_ddr_info(u32 *error_data, struct mc_priv *priv)
{
	u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr;
	struct ecc_status *p;

	isr = error_data[ISR];

	/* Nothing to report unless a CE or UE interrupt bit is set. */
	if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK)))
		return false;

	eccr0_val = error_data[ECCR0_ERR_STATUS];
	eccr1_val = error_data[ECCR1_ERR_STATUS];

	/* No per-channel error status latched: nothing to decode. */
	if (!eccr0_val && !eccr1_val)
		return false;

	p = &priv->stat;

	/* The channel whose ECC status register latched picks the report. */
	if (!eccr0_val)
		p->channel = 1;
	else
		p->channel = 0;

	/* Channel 0 error address (split across two 32-bit registers). */
	reglo = error_data[ECCR0_ADDR_LO];
	reghi = error_data[ECCR0_ADDR_HI];
	if (isr & MC5_IRQ_CE_MASK)
		p->ceinfo[0].i = reglo | (u64)reghi << 32;
	else if (isr & MC5_IRQ_UE_MASK)
		p->ueinfo[0].i = reglo | (u64)reghi << 32;

	parity = error_data[ECCR0_PAR];
	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
		 reghi, reglo, parity);

	/* Channel 1 error address. */
	reglo = error_data[ECCR1_ADDR_LO];
	reghi = error_data[ECCR1_ADDR_HI];
	if (isr & MC5_IRQ_CE_MASK)
		p->ceinfo[1].i = reglo | (u64)reghi << 32;
	else if (isr & MC5_IRQ_UE_MASK)
		p->ueinfo[1].i = reglo | (u64)reghi << 32;

	parity = error_data[ECCR1_PAR];
	edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
		 reghi, reglo, parity);

	return true;
}
260
261 /**
262 * convert_to_physical - Convert @error_data to a physical address.
263 * @priv: DDR memory controller private instance data.
264 * @pinf: ECC error info structure.
265 * @controller: Controller number of the MC5
266 * @error_data: the DDRMC5 ADEC address decoder register data
267 *
268 * Return: physical address of the DDR memory.
269 */
convert_to_physical(struct mc_priv * priv,union ecc_error_info pinf,int controller,int * error_data)270 static unsigned long convert_to_physical(struct mc_priv *priv,
271 union ecc_error_info pinf,
272 int controller, int *error_data)
273 {
274 u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset;
275 u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base;
276 unsigned long err_addr = 0, addr;
277 union row_col_mapping cols;
278 union row_col_mapping rows;
279 u32 col_bit_0;
280
281 row = pinf.rowhi << MC5_REGHI_ROW | pinf.row;
282 offset = controller * ADEC_NUM;
283
284 reg = error_data[ADEC6];
285 rows.i = reg;
286 err_addr |= (row & BIT(0)) << rows.row0;
287 row >>= MC5_EACHBIT;
288 err_addr |= (row & BIT(0)) << rows.row1;
289 row >>= MC5_EACHBIT;
290 err_addr |= (row & BIT(0)) << rows.row2;
291 row >>= MC5_EACHBIT;
292 err_addr |= (row & BIT(0)) << rows.row3;
293 row >>= MC5_EACHBIT;
294 err_addr |= (row & BIT(0)) << rows.row4;
295 row >>= MC5_EACHBIT;
296
297 reg = error_data[ADEC7];
298 rows.i = reg;
299 err_addr |= (row & BIT(0)) << rows.row0;
300 row >>= MC5_EACHBIT;
301 err_addr |= (row & BIT(0)) << rows.row1;
302 row >>= MC5_EACHBIT;
303 err_addr |= (row & BIT(0)) << rows.row2;
304 row >>= MC5_EACHBIT;
305 err_addr |= (row & BIT(0)) << rows.row3;
306 row >>= MC5_EACHBIT;
307 err_addr |= (row & BIT(0)) << rows.row4;
308 row >>= MC5_EACHBIT;
309
310 reg = error_data[ADEC8];
311 rows.i = reg;
312 err_addr |= (row & BIT(0)) << rows.row0;
313 row >>= MC5_EACHBIT;
314 err_addr |= (row & BIT(0)) << rows.row1;
315 row >>= MC5_EACHBIT;
316 err_addr |= (row & BIT(0)) << rows.row2;
317 row >>= MC5_EACHBIT;
318 err_addr |= (row & BIT(0)) << rows.row3;
319 row >>= MC5_EACHBIT;
320 err_addr |= (row & BIT(0)) << rows.row4;
321
322 reg = error_data[ADEC9];
323 rows.i = reg;
324
325 err_addr |= (row & BIT(0)) << rows.row0;
326 row >>= MC5_EACHBIT;
327 err_addr |= (row & BIT(0)) << rows.row1;
328 row >>= MC5_EACHBIT;
329 err_addr |= (row & BIT(0)) << rows.row2;
330 row >>= MC5_EACHBIT;
331
332 col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]);
333 pinf.col >>= 1;
334 err_addr |= (pinf.col & 1) << col_bit_0;
335
336 cols.i = error_data[ADEC10];
337 err_addr |= (pinf.col & 1) << cols.col1;
338 pinf.col >>= 1;
339 err_addr |= (pinf.col & 1) << cols.col2;
340 pinf.col >>= 1;
341 err_addr |= (pinf.col & 1) << cols.col3;
342 pinf.col >>= 1;
343 err_addr |= (pinf.col & 1) << cols.col4;
344 pinf.col >>= 1;
345 err_addr |= (pinf.col & 1) << cols.col5;
346 pinf.col >>= 1;
347
348 cols.i = error_data[ADEC11];
349 err_addr |= (pinf.col & 1) << cols.col1;
350 pinf.col >>= 1;
351 err_addr |= (pinf.col & 1) << cols.col2;
352 pinf.col >>= 1;
353 err_addr |= (pinf.col & 1) << cols.col3;
354 pinf.col >>= 1;
355 err_addr |= (pinf.col & 1) << cols.col4;
356 pinf.col >>= 1;
357 err_addr |= (pinf.col & 1) << cols.col5;
358 pinf.col >>= 1;
359
360 reg = error_data[ADEC12];
361 err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0);
362 pinf.bank >>= MC5_EACHBIT;
363 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg);
364 pinf.bank >>= MC5_EACHBIT;
365
366 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg);
367 pinf.group >>= MC5_EACHBIT;
368 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg);
369 pinf.group >>= MC5_EACHBIT;
370 err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg);
371 pinf.group >>= MC5_EACHBIT;
372
373 reg = error_data[ADEC4];
374 err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0);
375 pinf.rank >>= MC5_EACHBIT;
376 err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg);
377 pinf.rank >>= MC5_EACHBIT;
378
379 reg = error_data[ADEC5];
380 err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0);
381 pinf.lrank >>= MC5_EACHBIT;
382 err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg);
383 pinf.lrank >>= MC5_EACHBIT;
384 err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg);
385 pinf.lrank >>= MC5_EACHBIT;
386 err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg);
387 pinf.lrank >>= MC5_EACHBIT;
388
389 high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE;
390 interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL;
391
392 high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK;
393 low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK;
394 reg = priv->adec[ADEC14 + offset];
395 ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN);
396 ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M;
397 if (ilc_himem_en)
398 ilc_base_ctrl_add = ilcmem_base - high_mem_offset;
399 else
400 ilc_base_ctrl_add = ilcmem_base - low_mem_offset;
401
402 if (priv->dwidth == DEV_X16) {
403 blk = err_addr / MC5_X16_SIZE;
404 rsh_req_addr = (blk << 8) + ilc_base_ctrl_add;
405 err_addr = rsh_req_addr * interleave * 2;
406 } else {
407 blk = err_addr / MC5_X32_SIZE;
408 rsh_req_addr = (blk << 9) + ilc_base_ctrl_add;
409 err_addr = rsh_req_addr * interleave * 2;
410 }
411
412 if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base)
413 addr = err_addr - high_mem_offset;
414 else
415 addr = err_addr - low_mem_offset;
416
417 return addr;
418 }
419
420 /**
421 * handle_error - Handle errors.
422 * @priv: DDR memory controller private instance data.
423 * @stat: ECC status structure.
424 * @ctl_num: Controller number of the MC5
425 * @error_data: the MC5 ADEC address decoder register data
426 *
427 * Handles ECC correctable and uncorrectable errors.
428 */
handle_error(struct mc_priv * priv,struct ecc_status * stat,int ctl_num,int * error_data)429 static void handle_error(struct mc_priv *priv, struct ecc_status *stat,
430 int ctl_num, int *error_data)
431 {
432 union ecc_error_info pinf;
433 struct mem_ctl_info *mci;
434 unsigned long pa;
435 phys_addr_t pfn;
436 int err;
437
438 if (WARN_ON_ONCE(ctl_num >= NUM_CONTROLLERS))
439 return;
440
441 mci = priv->mci[ctl_num];
442
443 if (stat->error_type == MC5_ERR_TYPE_CE) {
444 pinf = stat->ceinfo[stat->channel];
445 snprintf(priv->message, sizeof(priv->message),
446 "Error type:%s Controller %d Addr at %lx\n",
447 "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
448
449 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
450 1, 0, 0, 0, 0, 0, -1,
451 priv->message, "");
452 }
453
454 if (stat->error_type == MC5_ERR_TYPE_UE) {
455 pinf = stat->ueinfo[stat->channel];
456 snprintf(priv->message, sizeof(priv->message),
457 "Error type:%s controller %d Addr at %lx\n",
458 "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
459
460 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
461 1, 0, 0, 0, 0, 0, -1,
462 priv->message, "");
463 pa = convert_to_physical(priv, pinf, ctl_num, error_data);
464 pfn = PHYS_PFN(pa);
465
466 if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
467 err = memory_failure(pfn, MF_ACTION_REQUIRED);
468 if (err)
469 edac_dbg(2, "memory_failure() error: %d", err);
470 else
471 edac_dbg(2, "Poison page at PA 0x%lx\n", pa);
472 }
473 }
474 }
475
mc_init(struct mem_ctl_info * mci,struct device * dev)476 static void mc_init(struct mem_ctl_info *mci, struct device *dev)
477 {
478 struct mc_priv *priv = mci->pvt_info;
479 struct csrow_info *csi;
480 struct dimm_info *dimm;
481 u32 row;
482 int ch;
483
484 /* Initialize controller capabilities and configuration */
485 mci->mtype_cap = MEM_FLAG_DDR5;
486 mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
487 mci->scrub_cap = SCRUB_HW_SRC;
488 mci->scrub_mode = SCRUB_NONE;
489
490 mci->edac_cap = EDAC_FLAG_SECDED;
491 mci->ctl_name = "VersalNET DDR5";
492 mci->dev_name = dev_name(dev);
493 mci->mod_name = "versalnet_edac";
494
495 edac_op_state = EDAC_OPSTATE_INT;
496
497 for (row = 0; row < mci->nr_csrows; row++) {
498 csi = mci->csrows[row];
499 for (ch = 0; ch < csi->nr_channels; ch++) {
500 dimm = csi->channels[ch]->dimm;
501 dimm->edac_mode = EDAC_SECDED;
502 dimm->mtype = MEM_DDR5;
503 dimm->grain = MC5_ERR_GRAIN;
504 dimm->dtype = priv->dwidth;
505 }
506 }
507 }
508
509 #define to_mci(k) container_of(k, struct mem_ctl_info, dev)
510
/* Fixed RPC timeout for every MCDI command, regardless of @cmd. */
static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
{
	return MCDI_RPC_TIMEOUT;
}
515
/**
 * mcdi_request - Transmit an MCDI command over the rpmsg endpoint.
 * @cdx: MCDI handle carrying the endpoint to send on.
 * @hdr: MCDI header words.
 * @hdr_len: Header length in bytes.
 * @sdu: MCDI payload words.
 * @sdu_len: Payload length in bytes.
 *
 * Header and payload are concatenated into one buffer; allocation or
 * transmit failures are logged but not propagated (the MCDI core will
 * time the command out).
 */
static void mcdi_request(struct cdx_mcdi *cdx,
			 const struct cdx_dword *hdr, size_t hdr_len,
			 const struct cdx_dword *sdu, size_t sdu_len)
{
	size_t total = hdr_len + sdu_len;
	u8 *msg;
	int rc;

	msg = kzalloc(total, GFP_KERNEL);
	if (!msg)
		return;

	memcpy(msg, hdr, hdr_len);
	memcpy(msg + hdr_len, sdu, sdu_len);

	rc = rpmsg_send(cdx->ept, msg, total);
	if (rc)
		dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", rc);

	kfree(msg);
}
536
/* Callbacks the CDX MCDI core uses to time out and transmit commands. */
static const struct cdx_mcdi_ops mcdi_ops = {
	.mcdi_rpc_timeout = mcdi_rpc_timeout,
	.mcdi_request = mcdi_request,
};
541
/**
 * get_ddr_config - Fetch one controller's ADEC configuration via MCDI.
 * @index: Controller index to query.
 * @buffer: Destination for ADEC_NUM 32-bit configuration words.
 * @amd_mcdi: MCDI handle to issue the RPC on.
 *
 * On RPC failure @buffer is left untouched.
 */
static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi)
{
	size_t outlen;
	int ret;

	MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN);
	MCDI_DECLARE_BUF(outbuf, BUFFER_SZ);

	MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index);

	ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf),
			   outbuf, sizeof(outbuf), &outlen);
	if (!ret)
		memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG),
		       (ADEC_NUM * 4));
}
558
/**
 * setup_mcdi - Allocate the MCDI handle and pull the ADEC configuration.
 * @mc_priv: DDR memory controller private instance data.
 *
 * Requires mc_priv->ept to be valid (created in rpmsg_probe()).
 *
 * Return: 0 on success, negative error code otherwise.
 */
static int setup_mcdi(struct mc_priv *mc_priv)
{
	struct cdx_mcdi *amd_mcdi;
	int ret, i;

	amd_mcdi = kzalloc_obj(*amd_mcdi);
	if (!amd_mcdi)
		return -ENOMEM;

	amd_mcdi->mcdi_ops = &mcdi_ops;
	ret = cdx_mcdi_init(amd_mcdi);
	if (ret) {
		kfree(amd_mcdi);
		return ret;
	}

	/* MCDI commands travel over the same endpoint as error records. */
	amd_mcdi->ept = mc_priv->ept;
	mc_priv->mcdi = amd_mcdi;

	/* Cache every controller's address decoder configuration. */
	for (i = 0; i < NUM_CONTROLLERS; i++)
		get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi);

	return 0;
}
583
/* Section-type GUID attached to non-standard RAS event records. */
static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2,
						   0xb8, 0xb4, 0x45, 0x56, 0x2e,
						   0x8c, 0x5b, 0xec);
587
/**
 * rpmsg_cb - Receive callback for firmware messages.
 * @rpdev: rpmsg channel device.
 * @data: Message payload.
 * @len: Payload length in bytes.
 * @priv: Endpoint private data (unused).
 * @src: Source address of the message.
 *
 * A message is either an MCDI response (forwarded to the CDX MCDI layer)
 * or an error record. Error records may arrive split in two; the register
 * dump is reassembled into mc_priv->regs before decoding. DDR CE/UE
 * records (ids 18/19) are decoded per controller, everything else is
 * logged as a non-standard RAS event.
 *
 * Return: 0 always.
 */
static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
		    int len, void *priv, u32 src)
{
	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
	const guid_t *sec_type = &guid_null;
	u32 length, offset, error_id;
	u32 *result = (u32 *)data;
	struct ecc_status *p;
	int i, j, k, sec_sev;
	const char *err_str;
	u32 *adec_data;

	/* MCDI responses are handled by the MCDI layer, not here. */
	if (*(u8 *)data == MCDI_RESPONSE) {
		cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
		return 0;
	}

	sec_sev = result[ERROR_LEVEL];
	error_id = result[ERROR_ID];
	length = result[MSG_ERR_LENGTH];
	offset = result[MSG_ERR_OFFSET];

	/*
	 * length/offset come from the firmware and index the fixed-size
	 * regs[] array; drop any record that would write past it.
	 */
	if (length > REG_MAX || offset > REG_MAX - length)
		return 0;

	/*
	 * The data can come in two stretches. Construct the regs from two
	 * messages. The offset indicates the offset from which the data is to
	 * be taken.
	 */
	for (i = 0 ; i < length; i++) {
		k = offset + i;
		j = ERROR_DATA + i;
		mc_priv->regs[k] = result[j];
	}

	/* Wait for the remaining stretch of a split record. */
	if (result[TOTAL_ERR_LENGTH] > length) {
		if (!mc_priv->part_len)
			mc_priv->part_len = length;
		else
			mc_priv->part_len += length;

		if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
			return 0;
		mc_priv->part_len = 0;
	}

	mc_priv->error_id = error_id;
	mc_priv->error_level = result[ERROR_LEVEL];

	switch (error_id) {
	case 5: err_str = "General Software Non-Correctable error"; break;
	case 6: err_str = "CFU error"; break;
	case 7: err_str = "CFRAME error"; break;
	case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break;
	case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
	case 15: err_str = "MMCM error"; break;
	case 16: err_str = "HNICX Correctable error"; break;
	case 17: err_str = "HNICX Non-Correctable error"; break;

	case 18:
		/* DDR correctable error: decode each controller's log. */
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_CE;
		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;
	case 19:
		/* DDR uncorrectable error. */
		p = &mc_priv->stat;
		memset(p, 0, sizeof(struct ecc_status));
		p->error_type = MC5_ERR_TYPE_UE;
		for (i = 0 ; i < NUM_CONTROLLERS; i++) {
			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
				adec_data = mc_priv->adec + ADEC_NUM * i;
				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
			}
		}
		return 0;

	case 21: err_str = "GT Non-Correctable error"; break;
	case 22: err_str = "PL Sysmon Correctable error"; break;
	case 23: err_str = "PL Sysmon Non-Correctable error"; break;
	case 111: err_str = "LPX unexpected dfx activation error"; break;
	case 114: err_str = "INT_LPD Non-Correctable error"; break;
	case 116: err_str = "INT_OCM Non-Correctable error"; break;
	case 117: err_str = "INT_FPD Correctable error"; break;
	case 118: err_str = "INT_FPD Non-Correctable error"; break;
	case 120: err_str = "INT_IOU Non-Correctable error"; break;
	case 123: err_str = "err_int_irq from APU GIC Distributor"; break;
	case 124: err_str = "fault_int_irq from APU GIC Distribute"; break;
	case 132 ... 139: err_str = "FPX SPLITTER error"; break;
	case 140: err_str = "APU Cluster 0 error"; break;
	case 141: err_str = "APU Cluster 1 error"; break;
	case 142: err_str = "APU Cluster 2 error"; break;
	case 143: err_str = "APU Cluster 3 error"; break;
	case 145: err_str = "WWDT1 LPX error"; break;
	case 147: err_str = "IPI error"; break;
	case 152 ... 153: err_str = "AFIFS error"; break;
	case 154 ... 155: err_str = "LPX glitch error"; break;
	case 185 ... 186: err_str = "FPX AFIFS error"; break;
	case 195 ... 199: err_str = "AFIFM error"; break;
	case 108: err_str = "PSM Correctable error"; break;
	case 59: err_str = "PMC correctable error"; break;
	case 60: err_str = "PMC Un correctable error"; break;
	case 43 ... 47: err_str = "PMC Sysmon error"; break;
	case 163 ... 184: err_str = "RPU error"; break;
	case 148: err_str = "OCM0 correctable error"; break;
	case 149: err_str = "OCM1 correctable error"; break;
	case 150: err_str = "OCM0 Un-correctable error"; break;
	case 151: err_str = "OCM1 Un-correctable error"; break;
	case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break;
	case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break;
	case 232: err_str = "CRAM Un-Correctable error"; break;
	default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break;
	}

	snprintf(mc_priv->message,
		 sizeof(mc_priv->message),
		 "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);

	/* Convert to bytes */
	length = result[TOTAL_ERR_LENGTH] * 4;
	log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
			       sec_sev, (void *)&mc_priv->regs, length);

	return 0;
}
716
/*
 * Deliberately non-const: mc_probe() stashes the driver's private data in
 * driver_data before registering the rpmsg driver, and rpmsg_probe() reads
 * it back.
 */
static struct rpmsg_device_id amd_rpmsg_id_table[] = {
	{ .name = "error_ipc" },
	{ },
};
MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
722
rpmsg_probe(struct rpmsg_device * rpdev)723 static int rpmsg_probe(struct rpmsg_device *rpdev)
724 {
725 struct rpmsg_channel_info chinfo;
726 struct mc_priv *pg;
727
728 pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
729 chinfo.src = RPMSG_ADDR_ANY;
730 chinfo.dst = rpdev->dst;
731 strscpy(chinfo.name, amd_rpmsg_id_table[0].name,
732 strlen(amd_rpmsg_id_table[0].name));
733
734 pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
735 if (!pg->ept)
736 return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
737 chinfo.name);
738
739 dev_set_drvdata(&rpdev->dev, pg);
740
741 return 0;
742 }
743
rpmsg_remove(struct rpmsg_device * rpdev)744 static void rpmsg_remove(struct rpmsg_device *rpdev)
745 {
746 struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
747
748 rpmsg_destroy_ept(mc_priv->ept);
749 dev_set_drvdata(&rpdev->dev, NULL);
750 }
751
/* rpmsg driver carrying both the probe path and the receive callback. */
static struct rpmsg_driver amd_rpmsg_driver = {
	.drv.name = KBUILD_MODNAME,
	.probe = rpmsg_probe,
	.remove = rpmsg_remove,
	.callback = rpmsg_cb,
	.id_table = amd_rpmsg_id_table,
};
759
/* Release callback: frees the per-controller device allocated in init_one_mc(). */
static void versal_edac_release(struct device *dev)
{
	kfree(dev);
}
764
remove_one_mc(struct mc_priv * priv,int i)765 static void remove_one_mc(struct mc_priv *priv, int i)
766 {
767 struct mem_ctl_info *mci;
768
769 mci = priv->mci[i];
770 device_unregister(mci->pdev);
771 edac_mc_del_mc(mci->pdev);
772 edac_mc_free(mci);
773 }
774
init_one_mc(struct mc_priv * priv,struct platform_device * pdev,int i)775 static int init_one_mc(struct mc_priv *priv, struct platform_device *pdev, int i)
776 {
777 u32 num_chans, rank, dwidth, config;
778 struct edac_mc_layer layers[2];
779 struct mem_ctl_info *mci;
780 struct device *dev;
781 enum dev_type dt;
782 char *name;
783 int rc;
784
785 config = priv->adec[CONF + i * ADEC_NUM];
786 num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
787 rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
788 dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
789
790 switch (dwidth) {
791 case XDDR5_BUS_WIDTH_16:
792 dt = DEV_X16;
793 break;
794 case XDDR5_BUS_WIDTH_32:
795 dt = DEV_X32;
796 break;
797 case XDDR5_BUS_WIDTH_64:
798 dt = DEV_X64;
799 break;
800 default:
801 dt = DEV_UNKNOWN;
802 }
803
804 if (dt == DEV_UNKNOWN)
805 return 0;
806
807 /* Find the first enabled device and register that one. */
808 layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
809 layers[0].size = rank;
810 layers[0].is_virt_csrow = true;
811 layers[1].type = EDAC_MC_LAYER_CHANNEL;
812 layers[1].size = num_chans;
813 layers[1].is_virt_csrow = false;
814
815 rc = -ENOMEM;
816 name = kzalloc(MC_NAME_LEN, GFP_KERNEL);
817 if (!name)
818 return rc;
819
820 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
821 if (!dev)
822 goto err_name_free;
823
824 mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, sizeof(struct mc_priv));
825 if (!mci) {
826 edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
827 goto err_dev_free;
828 }
829
830 sprintf(name, "versal-net-ddrmc5-edac-%d", i);
831
832 dev->init_name = name;
833 dev->release = versal_edac_release;
834
835 rc = device_register(dev);
836 if (rc)
837 goto err_mc_free;
838
839 mci->pdev = dev;
840 mc_init(mci, dev);
841
842 rc = edac_mc_add_mc(mci);
843 if (rc) {
844 edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
845 goto err_unreg;
846 }
847
848 priv->mci[i] = mci;
849 priv->dwidth = dt;
850
851 platform_set_drvdata(pdev, priv);
852
853 return 0;
854
855 err_unreg:
856 device_unregister(mci->pdev);
857 err_mc_free:
858 edac_mc_free(mci);
859 err_dev_free:
860 kfree(dev);
861 err_name_free:
862 kfree(name);
863
864 return rc;
865 }
866
init_versalnet(struct mc_priv * priv,struct platform_device * pdev)867 static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
868 {
869 int rc, i;
870
871 for (i = 0; i < NUM_CONTROLLERS; i++) {
872 rc = init_one_mc(priv, pdev, i);
873 if (rc) {
874 while (i--)
875 remove_one_mc(priv, i);
876
877 return rc;
878 }
879 }
880 return 0;
881 }
882
remove_versalnet(struct mc_priv * priv)883 static void remove_versalnet(struct mc_priv *priv)
884 {
885 for (int i = 0; i < NUM_CONTROLLERS; i++)
886 remove_one_mc(priv, i);
887 }
888
/**
 * mc_probe - Attach to the RPU firmware and register all controllers.
 * @pdev: Platform device being probed.
 *
 * Boots the remote processor referenced by the "amd,rproc" phandle,
 * registers the rpmsg driver that receives error records, fetches the
 * ADEC configuration over MCDI and registers the EDAC controllers.
 *
 * Return: 0 on success, negative error code otherwise.
 */
static int mc_probe(struct platform_device *pdev)
{
	struct mc_priv *priv;
	struct rproc *rp;
	int rc;

	struct device_node *r5_core_node __free(device_node) =
		of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
	if (!r5_core_node) {
		dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
		return -EINVAL;
	}

	/* Defer until the remoteproc core has registered the R5 core. */
	rp = rproc_get_by_phandle(r5_core_node->phandle);
	if (!rp)
		return -EPROBE_DEFER;

	rc = rproc_boot(rp);
	if (rc) {
		dev_err(&pdev->dev, "Failed to attach to remote processor\n");
		goto err_rproc_boot;
	}

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv) {
		rc = -ENOMEM;
		goto err_alloc;
	}

	/* Hand priv to rpmsg_probe() via the id table's driver_data. */
	amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;

	rc = register_rpmsg_driver(&amd_rpmsg_driver);
	if (rc) {
		edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
		goto err_alloc;
	}

	rc = setup_mcdi(priv);
	if (rc)
		goto err_unreg;

	priv->mcdi->r5_rproc = rp;

	rc = init_versalnet(priv, pdev);
	if (rc)
		goto err_init;

	return 0;

err_init:
	cdx_mcdi_finish(priv->mcdi);
	kfree(priv->mcdi);

err_unreg:
	unregister_rpmsg_driver(&amd_rpmsg_driver);

err_alloc:
	rproc_shutdown(rp);

err_rproc_boot:
	rproc_put(rp);

	return rc;
}
953
/* Undo mc_probe(): stop rpmsg traffic first, then tear everything down. */
static void mc_remove(struct platform_device *pdev)
{
	struct mc_priv *priv = platform_get_drvdata(pdev);

	/* No more callbacks may run once the rpmsg driver is gone. */
	unregister_rpmsg_driver(&amd_rpmsg_driver);
	remove_versalnet(priv);
	rproc_shutdown(priv->mcdi->r5_rproc);
	cdx_mcdi_finish(priv->mcdi);
	kfree(priv->mcdi);
}
964
/* Device tree match table. */
static const struct of_device_id amd_edac_match[] = {
	{ .compatible = "xlnx,versal-net-ddrmc5", },
	{}
};
MODULE_DEVICE_TABLE(of, amd_edac_match);
970
/* Platform driver glue. */
static struct platform_driver amd_ddr_edac_mc_driver = {
	.driver = {
		.name = "versal-net-edac",
		.of_match_table = amd_edac_match,
	},
	.probe = mc_probe,
	.remove = mc_remove,
};

module_platform_driver(amd_ddr_edac_mc_driver);
981
982 MODULE_AUTHOR("AMD Inc");
983 MODULE_DESCRIPTION("Versal NET EDAC driver");
984 MODULE_LICENSE("GPL");
985