// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
 */

#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>
#include <linux/dma-mapping.h>
#include <linux/kthread.h>
#include <linux/kernel.h>

#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"

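/*
 * Async error events: one event is registered with the firmware per AIE
 * column.  Each event owns a slice of a shared DMA buffer that the device
 * fills with error details; the mailbox response callback defers decoding
 * to an ordered work queue.
 */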
struct async_event {
	struct amdxdna_dev_hdl *ndev;
	struct async_event_msg_resp resp;
	struct workqueue_struct *wq;
	struct work_struct work;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
};

struct async_events {
	struct workqueue_struct *wq;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
	u32 event_cnt;
	struct async_event event[] __counted_by(event_cnt);
};

/*
 * The enum, structs and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages reported by the device.
 */

enum aie_module_type {
	AIE_MEM_MOD = 0,
	AIE_CORE_MOD,
	AIE_PL_MOD,
};

enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE, added for better categorization */
	AIE_ERROR_UNKNOWN,
};

/* Don't pack, unless the XAIE side changes */
struct aie_error {
	__u8 row;
	__u8 col;
	__u32 mod_type;
	__u8 event_id;
};

struct aie_err_info {
	u32 err_cnt;
	u32 ret_code;
	u32 rsvd;
	struct aie_error payload[] __counted_by(err_cnt);
};

struct aie_event_category {
	u8 event_id;
	enum aie_error_category category;
};

#define EVENT_CATEGORY(id, cat) { id, cat }
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U, AIE_ERROR_ECC),
	EVENT_CATEGORY(90U, AIE_ERROR_ECC),
	EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U, AIE_ERROR_DMA),
	EVENT_CATEGORY(98U, AIE_ERROR_DMA),
	EVENT_CATEGORY(99U, AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};

static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};

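/*
 * Map a hardware event ID to an error category using the lookup table for
 * the given module type.  For the memory module, row 1 is treated as a mem
 * tile and uses its own table.
 */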
static enum aie_error_category
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
{
	const struct aie_event_category *lut;
	int num_entry;
	int i;

	switch (mod_type) {
	case AIE_PL_MOD:
		lut = aie_ml_shim_tile_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
		break;
	case AIE_CORE_MOD:
		lut = aie_ml_core_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
		break;
	case AIE_MEM_MOD:
		if (row == 1) {
			lut = aie_ml_mem_tile_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
		} else {
			lut = aie_ml_mem_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
		}
		break;
	default:
		return AIE_ERROR_UNKNOWN;
	}

	for (i = 0; i < num_entry; i++) {
		if (event_id != lut[i].event_id)
			continue;

		return lut[i].category;
	}

	return AIE_ERROR_UNKNOWN;
}

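/*
 * Log each reported error with its decoded category and return a bitmap of
 * the columns that reported errors.
 */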
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that AIE has fewer than 32 columns */
	int i;

	/* Build the bitmap of columns that reported errors */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		err_col |= (1 << err->col);
	}

	return err_col;
}

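/*
 * Mailbox callback for the async error message.  Only copy the response
 * fields out of the message payload here and defer the real handling to
 * the work queue.
 */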
static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
	struct async_event *e = handle;

	if (data) {
		e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
		wmb(); /* Update status last, so that no lock is needed here */
		e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
	}
	queue_work(e->wq, &e->work);
	return 0;
}

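/* Flush the event buffer and (re)register it with the firmware. */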
static int aie2_error_event_send(struct async_event *e)
{
	drm_clflush_virt_range(e->buf, e->size); /* device can access */
	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
					    aie2_error_async_cb);
}

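/*
 * Bottom half of the async error event: decode the error buffer filled in
 * by the device, log the affected columns and re-register the event with
 * the firmware.
 */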
static void aie2_error_worker(struct work_struct *err_work)
{
	struct aie_err_info *info;
	struct amdxdna_dev *xdna;
	struct async_event *e;
	u32 max_err;
	u32 err_col;

	e = container_of(err_work, struct async_event, work);

	xdna = e->ndev->xdna;

	if (e->resp.status == MAX_AIE2_STATUS_CODE)
		return;

	e->resp.status = MAX_AIE2_STATUS_CODE;

	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
			     e->buf, 0x100, false);

	info = (struct aie_err_info *)e->buf;
	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
	if (unlikely(info->err_cnt > max_err)) {
		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
		return;
	}
	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
	if (!err_col) {
		XDNA_WARN(xdna, "Did not get error column");
		return;
	}

	mutex_lock(&xdna->dev_lock);
	/* Re-send this event to firmware */
	if (aie2_error_event_send(e))
		XDNA_WARN(xdna, "Unable to register async event");
	mutex_unlock(&xdna->dev_lock);
}

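/*
 * Register every allocated async error event with the firmware.
 * The caller must hold xdna->dev_lock.
 */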
int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_event *e;
	int i, ret;

	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
	for (i = 0; i < ndev->async_events->event_cnt; i++) {
		e = &ndev->async_events->event[i];
		ret = aie2_error_event_send(e);
		if (ret)
			return ret;
	}

	return 0;
}

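/*
 * Tear down the async error events.  dev_lock is dropped around
 * destroy_workqueue() because the error worker takes the same lock when it
 * re-registers an event.
 */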
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_events *events;

	events = ndev->async_events;

	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
	kfree(events);
}

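/*
 * Allocate one async error event per column, backed by a single non-coherent
 * DMA buffer, and an ordered work queue for deferred error handling.
 */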
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	u32 total_col = ndev->total_col;
	u32 total_size = ASYNC_BUF_SIZE * total_col;
	struct async_events *events;
	int i, ret;

	events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
	if (!events)
		return -ENOMEM;

	events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
					    DMA_FROM_DEVICE, GFP_KERNEL);
	if (!events->buf) {
		ret = -ENOMEM;
		goto free_events;
	}
	events->size = total_size;
	events->event_cnt = total_col;

	events->wq = alloc_ordered_workqueue("async_wq", 0);
	if (!events->wq) {
		ret = -ENOMEM;
		goto free_buf;
	}

	for (i = 0; i < events->event_cnt; i++) {
		struct async_event *e = &events->event[i];
		u32 offset = i * ASYNC_BUF_SIZE;

		e->ndev = ndev;
		e->wq = events->wq;
		e->buf = &events->buf[offset];
		e->addr = events->addr + offset;
		e->size = ASYNC_BUF_SIZE;
		e->resp.status = MAX_AIE2_STATUS_CODE;
		INIT_WORK(&e->work, aie2_error_worker);
	}

	ndev->async_events = events;

	XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
		 events->event_cnt, events->size);
	return 0;

free_buf:
	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
free_events:
	kfree(events);
	return ret;
}