xref: /linux/drivers/accel/qaic/qaic_ras.c (revision 260f6f4fda93c8485c8037865c941b42b9cba5d2)
// SPDX-License-Identifier: GPL-2.0-only

/* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. */
/* Copyright (c) 2022-2024 Qualcomm Innovation Center, Inc. All rights reserved. */
/* Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. */

#include <asm/byteorder.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/mhi.h>

#include "qaic.h"
#include "qaic_ras.h"

#define MAGIC		0x55AA
#define VERSION		0x2
#define HDR_SZ		12
#define NUM_TEMP_LVL	3
#define POWER_BREAK	BIT(0)

enum msg_type {
	MSG_PUSH, /* async push from device */
	MSG_REQ,  /* sync request to device */
	MSG_RESP, /* sync response from device */
};

enum err_type {
	CE,	/* correctable error */
	UE,	/* uncorrectable error */
	UE_NF,	/* uncorrectable error that is non-fatal, expect a disruption */
	ERR_TYPE_MAX,
};

static const char * const err_type_str[] = {
	[CE]    = "Correctable",
	[UE]    = "Uncorrectable",
	[UE_NF] = "Uncorrectable Non-Fatal",
};

static const char * const err_class_str[] = {
	[CE]    = "Warning",
	[UE]    = "Fatal",
	[UE_NF] = "Warning",
};

enum err_source {
	SOC_MEM,
	PCIE,
	DDR,
	SYS_BUS1,
	SYS_BUS2,
	NSP_MEM,
	TSENS,
};

static const char * const err_src_str[TSENS + 1] = {
	[SOC_MEM]	= "SoC Memory",
	[PCIE]		= "PCIE",
	[DDR]		= "DDR",
	[SYS_BUS1]	= "System Bus source 1",
	[SYS_BUS2]	= "System Bus source 2",
	[NSP_MEM]	= "NSP Memory",
	[TSENS]		= "Temperature Sensors",
};

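/*
 * Wire format of a RAS message pushed by the device over MHI. All multi-byte
 * fields are little-endian on the wire and are converted to CPU byte order by
 * ras_msg_to_cpu(). The trailing 64 bytes carry a syndrome whose layout
 * depends on the reported error source.
 */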
struct ras_data {
	/* header start */
	/* Magic number to validate the message */
	u16 magic;
	/* RAS version number */
	u16 ver;
	u32 seq_num;
	/* RAS message type */
	u8  type;
	u8  id;
	/* Size of RAS message without the header in bytes */
	u16 len;
	/* header end */
	s32 result;
	/*
	 * Error source
	 * 0 : SoC Memory
	 * 1 : PCIE
	 * 2 : DDR
	 * 3 : System Bus source 1
	 * 4 : System Bus source 2
	 * 5 : NSP Memory
	 * 6 : Temperature Sensors
	 */
	u32 source;
	/*
	 * Stores the error type, there are three types of errors in RAS
	 * 0 : correctable error (CE)
	 * 1 : uncorrectable error (UE)
	 * 2 : uncorrectable error that is non-fatal (UE_NF)
	 */
	u32 err_type;
	u32 err_threshold;
	u32 ce_count;
	u32 ue_count;
	u32 intr_num;
	/* Data specific to error source */
	u8  syndrome[64];
} __packed;

struct soc_mem_syndrome {
	u64 error_address[8];
} __packed;

struct nsp_mem_syndrome {
	u32 error_address[8];
	u8 nsp_id;
} __packed;

struct ddr_syndrome {
	u32 count;
	u32 irq_status;
	u32 data_31_0[2];
	u32 data_63_32[2];
	u32 data_95_64[2];
	u32 data_127_96[2];
	u32 addr_lsb;
	u16 addr_msb;
	u16 parity_bits;
	u16 instance;
	u16 err_type;
} __packed;

struct tsens_syndrome {
	u32 threshold_type;
	s32 temp;
} __packed;

struct sysbus1_syndrome {
	u32 slave;
	u32 err_type;
	u16 addr[8];
	u8  instance;
} __packed;

struct sysbus2_syndrome {
	u32 lsb3;
	u32 msb3;
	u32 lsb2;
	u32 msb2;
	u32 ext_id;
	u16 path;
	u16 op_type;
	u16 len;
	u16 redirect;
	u8  valid;
	u8  word_error;
	u8  non_secure;
	u8  opc;
	u8  error_code;
	u8  trans_type;
	u8  addr_space;
	u8  instance;
} __packed;

struct pcie_syndrome {
	/* CE info */
	u32 bad_tlp;
	u32 bad_dllp;
	u32 replay_rollover;
	u32 replay_timeout;
	u32 rx_err;
	u32 internal_ce_count;
	/* UE_NF info */
	u32 fc_timeout;
	u32 poison_tlp;
	u32 ecrc_err;
	u32 unsupported_req;
	u32 completer_abort;
	u32 completion_timeout;
	/* UE info */
	u32 addr;
	u8  index;
	/*
	 * Flag to indicate specific PCIe events
	 * BIT(0): Power break (low power)
	 * BIT(1) to BIT(7): Reserved
	 */
	u8 flag;
} __packed;

static const char * const threshold_type_str[NUM_TEMP_LVL] = {
	[0] = "lower",
	[1] = "upper",
	[2] = "critical",
};

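/*
 * Convert a received RAS message, including its source-specific syndrome,
 * from little-endian to CPU byte order in place.
 */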
static void ras_msg_to_cpu(struct ras_data *msg)
{
	struct sysbus1_syndrome *sysbus1_syndrome = (struct sysbus1_syndrome *)&msg->syndrome[0];
	struct sysbus2_syndrome *sysbus2_syndrome = (struct sysbus2_syndrome *)&msg->syndrome[0];
	struct soc_mem_syndrome *soc_syndrome = (struct soc_mem_syndrome *)&msg->syndrome[0];
	struct nsp_mem_syndrome *nsp_syndrome = (struct nsp_mem_syndrome *)&msg->syndrome[0];
	struct tsens_syndrome *tsens_syndrome = (struct tsens_syndrome *)&msg->syndrome[0];
	struct pcie_syndrome *pcie_syndrome = (struct pcie_syndrome *)&msg->syndrome[0];
	struct ddr_syndrome *ddr_syndrome = (struct ddr_syndrome *)&msg->syndrome[0];
	int i;

	le16_to_cpus(&msg->magic);
	le16_to_cpus(&msg->ver);
	le32_to_cpus(&msg->seq_num);
	le16_to_cpus(&msg->len);
	le32_to_cpus(&msg->result);
	le32_to_cpus(&msg->source);
	le32_to_cpus(&msg->err_type);
	le32_to_cpus(&msg->err_threshold);
	le32_to_cpus(&msg->ce_count);
	le32_to_cpus(&msg->ue_count);
	le32_to_cpus(&msg->intr_num);

	switch (msg->source) {
	case SOC_MEM:
		for (i = 0; i < 8; i++)
			le64_to_cpus(&soc_syndrome->error_address[i]);
		break;
	case PCIE:
		le32_to_cpus(&pcie_syndrome->bad_tlp);
		le32_to_cpus(&pcie_syndrome->bad_dllp);
		le32_to_cpus(&pcie_syndrome->replay_rollover);
		le32_to_cpus(&pcie_syndrome->replay_timeout);
		le32_to_cpus(&pcie_syndrome->rx_err);
		le32_to_cpus(&pcie_syndrome->internal_ce_count);
		le32_to_cpus(&pcie_syndrome->fc_timeout);
		le32_to_cpus(&pcie_syndrome->poison_tlp);
		le32_to_cpus(&pcie_syndrome->ecrc_err);
		le32_to_cpus(&pcie_syndrome->unsupported_req);
		le32_to_cpus(&pcie_syndrome->completer_abort);
		le32_to_cpus(&pcie_syndrome->completion_timeout);
		le32_to_cpus(&pcie_syndrome->addr);
		break;
	case DDR:
		le16_to_cpus(&ddr_syndrome->instance);
		le16_to_cpus(&ddr_syndrome->err_type);
		le32_to_cpus(&ddr_syndrome->count);
		le32_to_cpus(&ddr_syndrome->irq_status);
		le32_to_cpus(&ddr_syndrome->data_31_0[0]);
		le32_to_cpus(&ddr_syndrome->data_31_0[1]);
		le32_to_cpus(&ddr_syndrome->data_63_32[0]);
		le32_to_cpus(&ddr_syndrome->data_63_32[1]);
		le32_to_cpus(&ddr_syndrome->data_95_64[0]);
		le32_to_cpus(&ddr_syndrome->data_95_64[1]);
		le32_to_cpus(&ddr_syndrome->data_127_96[0]);
		le32_to_cpus(&ddr_syndrome->data_127_96[1]);
		le16_to_cpus(&ddr_syndrome->parity_bits);
		le16_to_cpus(&ddr_syndrome->addr_msb);
		le32_to_cpus(&ddr_syndrome->addr_lsb);
		break;
	case SYS_BUS1:
		le32_to_cpus(&sysbus1_syndrome->slave);
		le32_to_cpus(&sysbus1_syndrome->err_type);
		for (i = 0; i < 8; i++)
			le16_to_cpus(&sysbus1_syndrome->addr[i]);
		break;
	case SYS_BUS2:
		le16_to_cpus(&sysbus2_syndrome->op_type);
		le16_to_cpus(&sysbus2_syndrome->len);
		le16_to_cpus(&sysbus2_syndrome->redirect);
		le16_to_cpus(&sysbus2_syndrome->path);
		le32_to_cpus(&sysbus2_syndrome->ext_id);
		le32_to_cpus(&sysbus2_syndrome->lsb2);
		le32_to_cpus(&sysbus2_syndrome->msb2);
		le32_to_cpus(&sysbus2_syndrome->lsb3);
		le32_to_cpus(&sysbus2_syndrome->msb3);
		break;
	case NSP_MEM:
		for (i = 0; i < 8; i++)
			le32_to_cpus(&nsp_syndrome->error_address[i]);
		break;
	case TSENS:
		le32_to_cpus(&tsens_syndrome->threshold_type);
		le32_to_cpus(&tsens_syndrome->temp);
		break;
	}
}

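/*
 * Validate an incoming RAS message, log a human readable description of the
 * error at a severity matching its class, update the per-device error
 * counters, and trigger an SoC reset for fatal uncorrectable errors.
 */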
static void decode_ras_msg(struct qaic_device *qdev, struct ras_data *msg)
{
	struct sysbus1_syndrome *sysbus1_syndrome = (struct sysbus1_syndrome *)&msg->syndrome[0];
	struct sysbus2_syndrome *sysbus2_syndrome = (struct sysbus2_syndrome *)&msg->syndrome[0];
	struct soc_mem_syndrome *soc_syndrome = (struct soc_mem_syndrome *)&msg->syndrome[0];
	struct nsp_mem_syndrome *nsp_syndrome = (struct nsp_mem_syndrome *)&msg->syndrome[0];
	struct tsens_syndrome *tsens_syndrome = (struct tsens_syndrome *)&msg->syndrome[0];
	struct pcie_syndrome *pcie_syndrome = (struct pcie_syndrome *)&msg->syndrome[0];
	struct ddr_syndrome *ddr_syndrome = (struct ddr_syndrome *)&msg->syndrome[0];
	char *class;
	char *level;

	if (msg->magic != MAGIC) {
		pci_warn(qdev->pdev, "Dropping RAS message with invalid magic %x\n", msg->magic);
		return;
	}

	if (!msg->ver || msg->ver > VERSION) {
		pci_warn(qdev->pdev, "Dropping RAS message with invalid version %d\n", msg->ver);
		return;
	}

	if (msg->type != MSG_PUSH) {
		pci_warn(qdev->pdev, "Dropping non-PUSH RAS message\n");
		return;
	}

	if (msg->len != sizeof(*msg) - HDR_SZ) {
		pci_warn(qdev->pdev, "Dropping RAS message with invalid len %d\n", msg->len);
		return;
	}

	if (msg->err_type >= ERR_TYPE_MAX) {
		pci_warn(qdev->pdev, "Dropping RAS message with err type %d\n", msg->err_type);
		return;
	}

	if (msg->err_type == UE)
		level = KERN_ERR;
	else
		level = KERN_WARNING;

	switch (msg->source) {
	case SOC_MEM:
		dev_printk(level, &qdev->pdev->dev, "RAS event.\nClass:%s\nDescription:%s %s %s\nError Threshold for this report %d\nSyndrome:\n    0x%llx\n    0x%llx\n    0x%llx\n    0x%llx\n    0x%llx\n    0x%llx\n    0x%llx\n    0x%llx\n",
			   err_class_str[msg->err_type],
			   err_type_str[msg->err_type],
			   "error from",
			   err_src_str[msg->source],
			   msg->err_threshold,
			   soc_syndrome->error_address[0],
			   soc_syndrome->error_address[1],
			   soc_syndrome->error_address[2],
			   soc_syndrome->error_address[3],
			   soc_syndrome->error_address[4],
			   soc_syndrome->error_address[5],
			   soc_syndrome->error_address[6],
			   soc_syndrome->error_address[7]);
		break;
	case PCIE:
		dev_printk(level, &qdev->pdev->dev, "RAS event.\nClass:%s\nDescription:%s %s %s\nError Threshold for this report %d\n",
			   err_class_str[msg->err_type],
			   err_type_str[msg->err_type],
			   "error from",
			   err_src_str[msg->source],
			   msg->err_threshold);

		switch (msg->err_type) {
		case CE:
			/*
			 * Modeled after AER prints. This continues the dev_printk() from a few
			 * lines up. We reduce duplication of code, but also avoid re-printing the
			 * PCI device info so that the end result looks uniform to the log user.
			 */
			printk(KERN_WARNING pr_fmt("Syndrome:\n    Bad TLP count %d\n    Bad DLLP count %d\n    Replay Rollover count %d\n    Replay Timeout count %d\n    Recv Error count %d\n    Internal CE count %d\n"),
			       pcie_syndrome->bad_tlp,
			       pcie_syndrome->bad_dllp,
			       pcie_syndrome->replay_rollover,
			       pcie_syndrome->replay_timeout,
			       pcie_syndrome->rx_err,
			       pcie_syndrome->internal_ce_count);
			if (msg->ver > 0x1)
				pr_warn("    Power break %s\n",
					pcie_syndrome->flag & POWER_BREAK ? "ON" : "OFF");
			break;
		case UE:
			printk(KERN_ERR pr_fmt("Syndrome:\n    Index %d\n    Address 0x%x\n"),
			       pcie_syndrome->index, pcie_syndrome->addr);
			break;
		case UE_NF:
			printk(KERN_WARNING pr_fmt("Syndrome:\n    FC timeout count %d\n    Poisoned TLP count %d\n    ECRC error count %d\n    Unsupported request count %d\n    Completer abort count %d\n    Completion timeout count %d\n"),
			       pcie_syndrome->fc_timeout,
			       pcie_syndrome->poison_tlp,
			       pcie_syndrome->ecrc_err,
			       pcie_syndrome->unsupported_req,
			       pcie_syndrome->completer_abort,
			       pcie_syndrome->completion_timeout);
			break;
		default:
			break;
		}
		break;
	case DDR:
		dev_printk(level, &qdev->pdev->dev, "RAS event.\nClass:%s\nDescription:%s %s %s\nError Threshold for this report %d\nSyndrome:\n    Instance %d\n    Count %d\n    Data 31_0 0x%x 0x%x\n    Data 63_32 0x%x 0x%x\n    Data 95_64 0x%x 0x%x\n    Data 127_96 0x%x 0x%x\n    Parity bits 0x%x\n    Address msb 0x%x\n    Address lsb 0x%x\n",
			   err_class_str[msg->err_type],
			   err_type_str[msg->err_type],
			   "error from",
			   err_src_str[msg->source],
			   msg->err_threshold,
			   ddr_syndrome->instance,
			   ddr_syndrome->count,
			   ddr_syndrome->data_31_0[1],
			   ddr_syndrome->data_31_0[0],
			   ddr_syndrome->data_63_32[1],
			   ddr_syndrome->data_63_32[0],
			   ddr_syndrome->data_95_64[1],
			   ddr_syndrome->data_95_64[0],
			   ddr_syndrome->data_127_96[1],
			   ddr_syndrome->data_127_96[0],
			   ddr_syndrome->parity_bits,
			   ddr_syndrome->addr_msb,
			   ddr_syndrome->addr_lsb);
		break;
	case SYS_BUS1:
		dev_printk(level, &qdev->pdev->dev, "RAS event.\nClass:%s\nDescription:%s %s %s\nError Threshold for this report %d\nSyndrome:\n    instance %d\n    %s\n    err_type %d\n    address0 0x%x\n    address1 0x%x\n    address2 0x%x\n    address3 0x%x\n    address4 0x%x\n    address5 0x%x\n    address6 0x%x\n    address7 0x%x\n",
			   err_class_str[msg->err_type],
			   err_type_str[msg->err_type],
			   "error from",
			   err_src_str[msg->source],
			   msg->err_threshold,
			   sysbus1_syndrome->instance,
			   sysbus1_syndrome->slave ? "Slave" : "Master",
			   sysbus1_syndrome->err_type,
			   sysbus1_syndrome->addr[0],
			   sysbus1_syndrome->addr[1],
			   sysbus1_syndrome->addr[2],
			   sysbus1_syndrome->addr[3],
			   sysbus1_syndrome->addr[4],
			   sysbus1_syndrome->addr[5],
			   sysbus1_syndrome->addr[6],
			   sysbus1_syndrome->addr[7]);
		break;
	case SYS_BUS2:
		dev_printk(level, &qdev->pdev->dev, "RAS event.\nClass:%s\nDescription:%s %s %s\nError Threshold for this report %d\nSyndrome:\n    instance %d\n    valid %d\n    word error %d\n    non-secure %d\n    opc %d\n    error code %d\n    transaction type %d\n    address space %d\n    operation type %d\n    len %d\n    redirect %d\n    path %d\n    ext_id %d\n    lsb2 %d\n    msb2 %d\n    lsb3 %d\n    msb3 %d\n",
			   err_class_str[msg->err_type],
			   err_type_str[msg->err_type],
			   "error from",
			   err_src_str[msg->source],
			   msg->err_threshold,
			   sysbus2_syndrome->instance,
			   sysbus2_syndrome->valid,
			   sysbus2_syndrome->word_error,
			   sysbus2_syndrome->non_secure,
			   sysbus2_syndrome->opc,
			   sysbus2_syndrome->error_code,
			   sysbus2_syndrome->trans_type,
			   sysbus2_syndrome->addr_space,
			   sysbus2_syndrome->op_type,
			   sysbus2_syndrome->len,
			   sysbus2_syndrome->redirect,
			   sysbus2_syndrome->path,
			   sysbus2_syndrome->ext_id,
			   sysbus2_syndrome->lsb2,
			   sysbus2_syndrome->msb2,
			   sysbus2_syndrome->lsb3,
			   sysbus2_syndrome->msb3);
		break;
	case NSP_MEM:
		dev_printk(level, &qdev->pdev->dev, "RAS event.\nClass:%s\nDescription:%s %s %s\nError Threshold for this report %d\nSyndrome:\n    NSP ID %d\n    0x%x\n    0x%x\n    0x%x\n    0x%x\n    0x%x\n    0x%x\n    0x%x\n    0x%x\n",
			   err_class_str[msg->err_type],
			   err_type_str[msg->err_type],
			   "error from",
			   err_src_str[msg->source],
			   msg->err_threshold,
			   nsp_syndrome->nsp_id,
			   nsp_syndrome->error_address[0],
			   nsp_syndrome->error_address[1],
			   nsp_syndrome->error_address[2],
			   nsp_syndrome->error_address[3],
			   nsp_syndrome->error_address[4],
			   nsp_syndrome->error_address[5],
			   nsp_syndrome->error_address[6],
			   nsp_syndrome->error_address[7]);
		break;
	case TSENS:
		if (tsens_syndrome->threshold_type >= NUM_TEMP_LVL) {
			pci_warn(qdev->pdev, "Dropping RAS message with invalid temp threshold %d\n",
				 tsens_syndrome->threshold_type);
			break;
		}

		if (msg->err_type)
			class = "Fatal";
		else if (tsens_syndrome->threshold_type)
			class = "Critical";
		else
			class = "Warning";

		dev_printk(level, &qdev->pdev->dev, "RAS event.\nClass:%s\nDescription:%s %s %s\nError Threshold for this report %d\nSyndrome:\n    %s threshold\n    %d deg C\n",
			   class,
			   err_type_str[msg->err_type],
			   "error from",
			   err_src_str[msg->source],
			   msg->err_threshold,
			   threshold_type_str[tsens_syndrome->threshold_type],
			   tsens_syndrome->temp);
		break;
	}

	/* Uncorrectable errors are fatal */
	if (msg->err_type == UE)
		mhi_soc_reset(qdev->mhi_cntrl);

	/* Saturate each counter at UINT_MAX instead of wrapping */
	switch (msg->err_type) {
	case CE:
		if (qdev->ce_count != UINT_MAX)
			qdev->ce_count++;
		break;
	case UE:
		if (qdev->ue_count != UINT_MAX)
			qdev->ue_count++;
		break;
	case UE_NF:
		if (qdev->ue_nf_count != UINT_MAX)
			qdev->ue_nf_count++;
		break;
	default:
		/* not possible */
		break;
	}
}

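/* sysfs attributes on the PCI device that expose the cumulative RAS error counters */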
static ssize_t ce_count_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev));

	return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ce_count);
}

static ssize_t ue_count_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev));

	return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ue_count);
}

static ssize_t ue_nonfatal_count_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev));

	return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ue_nf_count);
}

static DEVICE_ATTR_RO(ce_count);
static DEVICE_ATTR_RO(ue_count);
static DEVICE_ATTR_RO(ue_nonfatal_count);

static struct attribute *ras_attrs[] = {
	&dev_attr_ce_count.attr,
	&dev_attr_ue_count.attr,
	&dev_attr_ue_nonfatal_count.attr,
	NULL,
};

static struct attribute_group ras_group = {
	.attrs = ras_attrs,
};

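/*
 * The RAS MHI channel has come up: prepare the channel, queue a buffer to
 * receive the first message, and publish the error counters in sysfs.
 */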
static int qaic_ras_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id)
{
	struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev));
	struct ras_data *resp;
	int ret;

	ret = mhi_prepare_for_transfer(mhi_dev);
	if (ret)
		return ret;

	resp = kzalloc(sizeof(*resp), GFP_KERNEL);
	if (!resp) {
		mhi_unprepare_from_transfer(mhi_dev);
		return -ENOMEM;
	}

	ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, resp, sizeof(*resp), MHI_EOT);
	if (ret) {
		kfree(resp);
		mhi_unprepare_from_transfer(mhi_dev);
		return ret;
	}

	ret = device_add_group(&qdev->pdev->dev, &ras_group);
	if (ret) {
		mhi_unprepare_from_transfer(mhi_dev);
		pci_dbg(qdev->pdev, "ras add sysfs failed %d\n", ret);
		return ret;
	}

	dev_set_drvdata(&mhi_dev->dev, qdev);
	qdev->ras_ch = mhi_dev;

	return ret;
}

static void qaic_ras_mhi_remove(struct mhi_device *mhi_dev)
{
	struct qaic_device *qdev;

	qdev = dev_get_drvdata(&mhi_dev->dev);
	qdev->ras_ch = NULL;
	device_remove_group(&qdev->pdev->dev, &ras_group);
	mhi_unprepare_from_transfer(mhi_dev);
}

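/* Nothing is sent to the device on this channel, so the uplink callback is a no-op */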
static void qaic_ras_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) {}

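/*
 * A RAS message has arrived from the device. Convert it to CPU byte order,
 * decode and log it, then requeue the same buffer for the next message.
 */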
static void qaic_ras_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result)
{
	struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev);
	struct ras_data *msg = mhi_result->buf_addr;
	int ret;

	if (mhi_result->transaction_status) {
		kfree(msg);
		return;
	}

	ras_msg_to_cpu(msg);
	decode_ras_msg(qdev, msg);

	ret = mhi_queue_buf(qdev->ras_ch, DMA_FROM_DEVICE, msg, sizeof(*msg), MHI_EOT);
	if (ret) {
		dev_err(&mhi_dev->dev, "Cannot requeue RAS recv buf %d\n", ret);
		kfree(msg);
	}
}

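/* RAS messages are delivered on the "QAIC_STATUS" MHI channel */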
static const struct mhi_device_id qaic_ras_mhi_match_table[] = {
	{ .chan = "QAIC_STATUS", },
	{},
};

static struct mhi_driver qaic_ras_mhi_driver = {
	.id_table = qaic_ras_mhi_match_table,
	.remove = qaic_ras_mhi_remove,
	.probe = qaic_ras_mhi_probe,
	.ul_xfer_cb = qaic_ras_mhi_ul_xfer_cb,
	.dl_xfer_cb = qaic_ras_mhi_dl_xfer_cb,
	.driver = {
		.name = "qaic_ras",
	},
};

int qaic_ras_register(void)
{
	return mhi_driver_register(&qaic_ras_mhi_driver);
}

void qaic_ras_unregister(void)
{
	mhi_driver_unregister(&qaic_ras_mhi_driver);
}