/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_aca.h"
#include "amdgpu_ras.h"

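/*
 * Map each supported HW IP to the (hwid, mcatype) pair reported in a bank's
 * IPID register; aca_bank_hwip_is_matched() below routes banks to handles by
 * comparing these two fields.
 */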
#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}

typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);

static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
	ACA_BANK_HWID(SMU, 0x01, 0x01),
	ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00),
	ACA_BANK_HWID(UMC, 0x96, 0x00),
};

static void aca_banks_init(struct aca_banks *banks)
{
	if (!banks)
		return;

	memset(banks, 0, sizeof(*banks));
	INIT_LIST_HEAD(&banks->list);
}

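/* Copy the bank into a freshly allocated node and append it to the list. */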
static int aca_banks_add_bank(struct aca_banks *banks, struct aca_bank *bank)
{
	struct aca_bank_node *node;

	if (!bank)
		return -EINVAL;

	node = kvzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	memcpy(&node->bank, bank, sizeof(*bank));

	INIT_LIST_HEAD(&node->node);
	list_add_tail(&node->node, &banks->list);

	banks->nr_banks++;

	return 0;
}

static void aca_banks_release(struct aca_banks *banks)
{
	struct aca_bank_node *node, *tmp;

	if (list_empty(&banks->list))
		return;

	list_for_each_entry_safe(node, tmp, &banks->list, node) {
		list_del(&node->node);
		kvfree(node);
	}
}

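/* Ask the SMU how many valid ACA banks of the given type are pending. */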
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)
{
	struct amdgpu_aca *aca = &adev->aca;
	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;

	if (!count)
		return -EINVAL;

	if (!smu_funcs || !smu_funcs->get_valid_aca_count)
		return -EOPNOTSUPP;

	return smu_funcs->get_valid_aca_count(adev, type, count);
}

static struct aca_regs_dump {
	const char *name;
	int reg_idx;
} aca_regs[] = {
	{"CONTROL", ACA_REG_IDX_CTL},
	{"STATUS", ACA_REG_IDX_STATUS},
	{"ADDR", ACA_REG_IDX_ADDR},
	{"MISC", ACA_REG_IDX_MISC0},
	{"CONFIG", ACA_REG_IDX_CONFIG},
	{"IPID", ACA_REG_IDX_IPID},
	{"SYND", ACA_REG_IDX_SYND},
	{"DESTAT", ACA_REG_IDX_DESTAT},
	{"DEADDR", ACA_REG_IDX_DEADDR},
	{"CONTROL_MASK", ACA_REG_IDX_CTL_MASK},
};

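/*
 * Log every register of one bank to the RAS event log; correctable errors
 * are skipped when CE logging is disabled via debugfs, unless the bank
 * records a deferred error.
 */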
static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank,
			      struct ras_query_context *qctx)
{
	u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
	int i;

	if (adev->debug_disable_ce_logs &&
	    bank->smu_err_type == ACA_SMU_TYPE_CE &&
	    !ACA_BANK_ERR_IS_DEFFERED(bank))
		return;

	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
	/* plus 1 for output format, e.g.: ACA[08/08]: xxxx */
	for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
		RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
			      idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);

	if (ACA_REG__STATUS__SCRUB(bank->regs[ACA_REG_IDX_STATUS]))
		RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n");
}

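/*
 * Pull 'count' valid banks of the given type from the SMU, dump each one to
 * the RAS event log and collect them into 'banks'.
 */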
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
				       int start, int count,
				       struct aca_banks *banks, struct ras_query_context *qctx)
{
	struct amdgpu_aca *aca = &adev->aca;
	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
	struct aca_bank bank;
	int i, max_count, ret;

	if (!count)
		return 0;

	if (!smu_funcs || !smu_funcs->get_valid_aca_bank)
		return -EOPNOTSUPP;

	switch (type) {
	case ACA_SMU_TYPE_UE:
		max_count = smu_funcs->max_ue_bank_count;
		break;
	case ACA_SMU_TYPE_CE:
		max_count = smu_funcs->max_ce_bank_count;
		break;
	default:
		return -EINVAL;
	}

	if (start + count > max_count)
		return -EINVAL;

	count = min_t(int, count, max_count);
	for (i = 0; i < count; i++) {
		memset(&bank, 0, sizeof(bank));
		ret = smu_funcs->get_valid_aca_bank(adev, type, start + i, &bank);
		if (ret)
			return ret;

		bank.smu_err_type = type;

		aca_smu_bank_dump(adev, i, count, &bank, qctx);

		ret = aca_banks_add_bank(banks, &bank);
		if (ret)
			return ret;
	}

	return 0;
}

static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
{
	struct aca_hwip *hwip;
	int hwid, mcatype;
	u64 ipid;

	if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
		return false;

	hwip = &aca_hwid_mcatypes[type];
	if (!hwip->hwid)
		return false;

	ipid = bank->regs[ACA_REG_IDX_IPID];
	hwid = ACA_REG__IPID__HARDWAREID(ipid);
	mcatype = ACA_REG__IPID__MCATYPE(ipid);

	return hwip->hwid == hwid && hwip->mcatype == mcatype;
}

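/*
 * A bank is consumed by a handle if it matches the handle's HW IP; deferred
 * errors are always routed to the UMC handle. The handle may additionally
 * veto a bank through its own aca_bank_is_valid callback.
 */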
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
	const struct aca_bank_ops *bank_ops = handle->bank_ops;

	/* Parse all deferred errors with UMC aca handle */
	if (ACA_BANK_ERR_IS_DEFFERED(bank))
		return handle->hwip == ACA_HWIP_TYPE_UMC;

	if (!aca_bank_hwip_is_matched(bank, handle->hwip))
		return false;

	if (!bank_ops->aca_bank_is_valid)
		return true;

	return bank_ops->aca_bank_is_valid(handle, bank, type, handle->data);
}

static struct aca_bank_error *new_bank_error(struct aca_error *aerr, struct aca_bank_info *info)
{
	struct aca_bank_error *bank_error;

	bank_error = kvzalloc(sizeof(*bank_error), GFP_KERNEL);
	if (!bank_error)
		return NULL;

	INIT_LIST_HEAD(&bank_error->node);
	memcpy(&bank_error->info, info, sizeof(*info));

	mutex_lock(&aerr->lock);
	list_add_tail(&bank_error->node, &aerr->list);
	mutex_unlock(&aerr->lock);

	return bank_error;
}

static struct aca_bank_error *find_bank_error(struct aca_error *aerr, struct aca_bank_info *info)
{
	struct aca_bank_error *bank_error = NULL;
	struct aca_bank_info *tmp_info;
	bool found = false;

	mutex_lock(&aerr->lock);
	list_for_each_entry(bank_error, &aerr->list, node) {
		tmp_info = &bank_error->info;
		if (tmp_info->socket_id == info->socket_id &&
		    tmp_info->die_id == info->die_id) {
			found = true;
			goto out_unlock;
		}
	}

out_unlock:
	mutex_unlock(&aerr->lock);

	return found ? bank_error : NULL;
}

static void aca_bank_error_remove(struct aca_error *aerr, struct aca_bank_error *bank_error)
{
	if (!aerr || !bank_error)
		return;

	list_del(&bank_error->node);
	aerr->nr_errors--;

	kvfree(bank_error);
}

static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_bank_info *info)
{
	struct aca_bank_error *bank_error;

	if (!aerr || !info)
		return NULL;

	bank_error = find_bank_error(aerr, info);
	if (bank_error)
		return bank_error;

	return new_bank_error(aerr, info);
}

int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
				   enum aca_error_type type, u64 count)
{
	struct aca_error_cache *error_cache;
	struct aca_bank_error *bank_error;
	struct aca_error *aerr;

	if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT)
		return -EINVAL;

	if (!count)
		return 0;

	error_cache = &handle->error_cache;
	aerr = &error_cache->errors[type];
	bank_error = get_bank_error(aerr, info);
	if (!bank_error)
		return -ENOMEM;

	bank_error->count += count;

	return 0;
}

static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
	const struct aca_bank_ops *bank_ops = handle->bank_ops;

	if (!bank)
		return -EINVAL;

	if (!bank_ops->aca_bank_parser)
		return -EOPNOTSUPP;

	return bank_ops->aca_bank_parser(handle, bank, type,
					 handle->data);
}

static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank,
				      enum aca_smu_type type, void *data)
{
	int ret;

	ret = aca_bank_parser(handle, bank, type);
	if (ret)
		return ret;

	return 0;
}

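/*
 * Offer one bank to every registered handle that claims it; a handler error
 * aborts the walk.
 */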
static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
			     enum aca_smu_type type, bank_handler_t handler, void *data)
{
	struct aca_handle *handle;
	int ret;

	if (list_empty(&mgr->list))
		return 0;

	list_for_each_entry(handle, &mgr->list, node) {
		if (!aca_bank_is_valid(handle, bank, type))
			continue;

		ret = handler(handle, bank, type, data);
		if (ret)
			return ret;
	}

	return 0;
}

static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
			      enum aca_smu_type type, bank_handler_t handler, void *data)
{
	struct aca_bank_node *node;
	struct aca_bank *bank;
	int ret;

	if (!mgr || !banks)
		return -EINVAL;

	/* pre check to avoid unnecessary operations */
	if (list_empty(&mgr->list) || list_empty(&banks->list))
		return 0;

	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;

		ret = aca_dispatch_bank(mgr, bank, type, handler, data);
		if (ret)
			return ret;
	}

	return 0;
}

static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type)
{
	struct amdgpu_aca *aca = &adev->aca;
	bool ret = true;

	/*
	 * Because the UE Valid MCA count will only be cleared after reset,
	 * in order to avoid repeated counting of the error count,
	 * the aca bank is only updated once during the gpu recovery stage.
	 */
	if (type == ACA_SMU_TYPE_UE) {
		if (amdgpu_ras_intr_triggered())
			ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
		else
			atomic_set(&aca->ue_update_flag, 0);
	}

	return ret;
}

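/*
 * Encode the collected banks into CPER records: one record per UE, while
 * deferred and correctable errors are batched into combined records.
 */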
static void aca_banks_generate_cper(struct amdgpu_device *adev,
				    enum aca_smu_type type,
				    struct aca_banks *banks,
				    int count)
{
	struct aca_bank_node *node;
	struct aca_bank *bank;
	int r;

	if (!adev->cper.enabled)
		return;

	if (!banks || !count) {
		dev_warn(adev->dev, "fail to generate cper records\n");
		return;
	}

	/* UEs must be encoded into separate CPER entries */
	if (type == ACA_SMU_TYPE_UE) {
		struct aca_banks de_banks;

		aca_banks_init(&de_banks);
		list_for_each_entry(node, &banks->list, node) {
			bank = &node->bank;
			if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
				r = aca_banks_add_bank(&de_banks, bank);
				if (r)
					dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r);
			} else {
				if (amdgpu_cper_generate_ue_record(adev, bank))
					dev_warn(adev->dev, "fail to generate ue cper records\n");
			}
		}

		if (!list_empty(&de_banks.list)) {
			if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks))
				dev_warn(adev->dev, "fail to generate de cper records\n");
		}

		aca_banks_release(&de_banks);
	} else {
		/*
		 * SMU_TYPE_CE banks are combined into one CPER entry;
		 * they could be CEs, DEs, or both.
		 */
		if (amdgpu_cper_generate_ce_records(adev, banks, count))
			dev_warn(adev->dev, "fail to generate ce cper records\n");
	}
}

static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
			    bank_handler_t handler, struct ras_query_context *qctx, void *data)
{
	struct amdgpu_aca *aca = &adev->aca;
	struct aca_banks banks;
	u32 count = 0;
	int ret;

	if (list_empty(&aca->mgr.list))
		return 0;

	if (!aca_bank_should_update(adev, type))
		return 0;

	ret = aca_smu_get_valid_aca_count(adev, type, &count);
	if (ret)
		return ret;

	if (!count)
		return 0;

	aca_banks_init(&banks);

	ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
	if (ret)
		goto err_release_banks;

	if (list_empty(&banks.list)) {
		ret = 0;
		goto err_release_banks;
	}

	ret = aca_dispatch_banks(&aca->mgr, &banks, type,
				 handler, data);
	if (ret)
		goto err_release_banks;

	aca_banks_generate_cper(adev, type, &banks, count);

err_release_banks:
	aca_banks_release(&banks);

	return ret;
}

static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_error_type type, struct ras_err_data *err_data)
{
	struct aca_bank_info *info;
	struct amdgpu_smuio_mcm_config_info mcm_info;
	u64 count;

	if (type >= ACA_ERROR_TYPE_COUNT)
		return -EINVAL;

	count = bank_error->count;
	if (!count)
		return 0;

	info = &bank_error->info;
	mcm_info.die_id = info->die_id;
	mcm_info.socket_id = info->socket_id;

	switch (type) {
	case ACA_ERROR_TYPE_UE:
		amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, count);
		break;
	case ACA_ERROR_TYPE_CE:
		amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, count);
		break;
	case ACA_ERROR_TYPE_DEFERRED:
		amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, count);
		break;
	default:
		break;
	}

	return 0;
}

static int aca_log_aca_error(struct aca_handle *handle, enum aca_error_type type, struct ras_err_data *err_data)
{
	struct aca_error_cache *error_cache = &handle->error_cache;
	struct aca_error *aerr = &error_cache->errors[type];
	struct aca_bank_error *bank_error, *tmp;

	mutex_lock(&aerr->lock);

	if (list_empty(&aerr->list))
		goto out_unlock;

	list_for_each_entry_safe(bank_error, tmp, &aerr->list, node) {
		aca_log_aca_error_data(bank_error, type, err_data);
		aca_bank_error_remove(aerr, bank_error);
	}

out_unlock:
	mutex_unlock(&aerr->lock);

	return 0;
}

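/*
 * Deferred errors are delivered by the SMU through the CE bank space, so
 * both CE and DEFERRED queries are served from ACA_SMU_TYPE_CE banks.
 */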
static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type,
				struct ras_err_data *err_data, struct ras_query_context *qctx)
{
	enum aca_smu_type smu_type;
	int ret;

	switch (type) {
	case ACA_ERROR_TYPE_UE:
		smu_type = ACA_SMU_TYPE_UE;
		break;
	case ACA_ERROR_TYPE_CE:
	case ACA_ERROR_TYPE_DEFERRED:
		smu_type = ACA_SMU_TYPE_CE;
		break;
	default:
		return -EINVAL;
	}

	/* update aca bank to aca source error_cache first */
	ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
	if (ret)
		return ret;

	/* DEs may be contained in CEs or UEs */
	if (type != ACA_ERROR_TYPE_DEFERRED)
		aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);

	return aca_log_aca_error(handle, type, err_data);
}

static bool aca_handle_is_valid(struct aca_handle *handle)
{
	if (!handle->mask || !list_empty(&handle->node))
		return false;

	return true;
}

int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
			      enum aca_error_type type, struct ras_err_data *err_data,
			      struct ras_query_context *qctx)
{
	if (!handle || !err_data)
		return -EINVAL;

	if (aca_handle_is_valid(handle))
		return -EOPNOTSUPP;

	if ((type < 0) || (!(BIT(type) & handle->mask)))
		return 0;

	return __aca_get_error_data(adev, handle, type, err_data, qctx);
}

static void aca_error_init(struct aca_error *aerr, enum aca_error_type type)
{
	mutex_init(&aerr->lock);
	INIT_LIST_HEAD(&aerr->list);
	aerr->type = type;
	aerr->nr_errors = 0;
}

static void aca_init_error_cache(struct aca_handle *handle)
{
	struct aca_error_cache *error_cache = &handle->error_cache;
	int type;

	for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++)
		aca_error_init(&error_cache->errors[type], type);
}

static void aca_error_fini(struct aca_error *aerr)
{
	struct aca_bank_error *bank_error, *tmp;

	mutex_lock(&aerr->lock);
	if (list_empty(&aerr->list))
		goto out_unlock;

	list_for_each_entry_safe(bank_error, tmp, &aerr->list, node)
		aca_bank_error_remove(aerr, bank_error);

out_unlock:
	mutex_unlock(&aerr->lock);
	mutex_destroy(&aerr->lock);
}

static void aca_fini_error_cache(struct aca_handle *handle)
{
	struct aca_error_cache *error_cache = &handle->error_cache;
	int type;

	for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++)
		aca_error_fini(&error_cache->errors[type]);
}

static int add_aca_handle(struct amdgpu_device *adev, struct aca_handle_manager *mgr, struct aca_handle *handle,
			  const char *name, const struct aca_info *ras_info, void *data)
{
	memset(handle, 0, sizeof(*handle));

	handle->adev = adev;
	handle->mgr = mgr;
	handle->name = name;
	handle->hwip = ras_info->hwip;
	handle->mask = ras_info->mask;
	handle->bank_ops = ras_info->bank_ops;
	handle->data = data;
	aca_init_error_cache(handle);

	INIT_LIST_HEAD(&handle->node);
	list_add_tail(&handle->node, &mgr->list);
	mgr->nr_handles++;

	return 0;
}

static ssize_t aca_sysfs_read(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct aca_handle *handle = container_of(attr, struct aca_handle, aca_attr);

	/*
	 * NOTE: the ACA cache is automatically cleared once it is read, so the
	 * driver should unify the query entry point and forward requests to
	 * the RAS query interface directly.
	 */
	return amdgpu_ras_aca_sysfs_read(dev, attr, handle, buf, handle->data);
}

static int add_aca_sysfs(struct amdgpu_device *adev, struct aca_handle *handle)
{
	struct device_attribute *aca_attr = &handle->aca_attr;

	snprintf(handle->attr_name, sizeof(handle->attr_name) - 1, "aca_%s", handle->name);
	aca_attr->show = aca_sysfs_read;
	aca_attr->attr.name = handle->attr_name;
	aca_attr->attr.mode = S_IRUGO;
	sysfs_attr_init(&aca_attr->attr);

	return sysfs_add_file_to_group(&adev->dev->kobj,
				       &aca_attr->attr,
				       "ras");
}

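/*
 * Usage sketch (illustrative, not taken from this file): a RAS block
 * registers itself with its own bank ops and error-type mask, e.g.:
 *
 *	static const struct aca_info umc_aca_info = {
 *		.hwip = ACA_HWIP_TYPE_UMC,
 *		.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
 *		.bank_ops = &umc_bank_ops,
 *	};
 *
 *	amdgpu_aca_add_handle(adev, &handle, "umc", &umc_aca_info, umc_data);
 *
 * The mask macro and ops names above are assumptions based on how this file
 * consumes struct aca_info; see amdgpu_aca.h for the authoritative layout.
 */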
int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
			  const char *name, const struct aca_info *ras_info, void *data)
{
	struct amdgpu_aca *aca = &adev->aca;
	int ret;

	if (!amdgpu_aca_is_enabled(adev))
		return 0;

	ret = add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data);
	if (ret)
		return ret;

	return add_aca_sysfs(adev, handle);
}

static void remove_aca_handle(struct aca_handle *handle)
{
	struct aca_handle_manager *mgr = handle->mgr;

	aca_fini_error_cache(handle);
	list_del(&handle->node);
	mgr->nr_handles--;
}

static void remove_aca_sysfs(struct aca_handle *handle)
{
	struct amdgpu_device *adev = handle->adev;
	struct device_attribute *aca_attr = &handle->aca_attr;

	if (adev->dev->kobj.sd)
		sysfs_remove_file_from_group(&adev->dev->kobj,
					     &aca_attr->attr,
					     "ras");
}

void amdgpu_aca_remove_handle(struct aca_handle *handle)
{
	if (!handle || list_empty(&handle->node))
		return;

	remove_aca_sysfs(handle);
	remove_aca_handle(handle);
}

static int aca_manager_init(struct aca_handle_manager *mgr)
{
	INIT_LIST_HEAD(&mgr->list);
	mgr->nr_handles = 0;

	return 0;
}

static void aca_manager_fini(struct aca_handle_manager *mgr)
{
	struct aca_handle *handle, *tmp;

	if (list_empty(&mgr->list))
		return;

	list_for_each_entry_safe(handle, tmp, &mgr->list, node)
		amdgpu_aca_remove_handle(handle);
}

bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)
{
	return (adev->aca.is_enabled ||
		adev->debug_enable_ras_aca);
}

int amdgpu_aca_init(struct amdgpu_device *adev)
{
	struct amdgpu_aca *aca = &adev->aca;
	int ret;

	atomic_set(&aca->ue_update_flag, 0);

	ret = aca_manager_init(&aca->mgr);
	if (ret)
		return ret;

	return 0;
}

void amdgpu_aca_fini(struct amdgpu_device *adev)
{
	struct amdgpu_aca *aca = &adev->aca;

	aca_manager_fini(&aca->mgr);

	atomic_set(&aca->ue_update_flag, 0);
}

int amdgpu_aca_reset(struct amdgpu_device *adev)
{
	struct amdgpu_aca *aca = &adev->aca;

	atomic_set(&aca->ue_update_flag, 0);

	return 0;
}

void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs)
{
	struct amdgpu_aca *aca = &adev->aca;

	WARN_ON(aca->smu_funcs);
	aca->smu_funcs = smu_funcs;
}

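/* Decode socket ID, die (AID), hwid and mcatype from a bank's IPID register. */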
int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info)
{
	u64 ipid;
	u32 instidhi, instidlo;

	if (!bank || !info)
		return -EINVAL;

	ipid = bank->regs[ACA_REG_IDX_IPID];
	info->hwid = ACA_REG__IPID__HARDWAREID(ipid);
	info->mcatype = ACA_REG__IPID__MCATYPE(ipid);
	/*
	 * Unified DieID Format: SAASS. A:AID, S:Socket.
	 * Unified DieID[4:4] = InstanceId[0:0]
	 * Unified DieID[0:3] = InstanceIdHi[0:3]
	 */
	instidhi = ACA_REG__IPID__INSTANCEIDHI(ipid);
	instidlo = ACA_REG__IPID__INSTANCEIDLO(ipid);
	info->die_id = ((instidhi >> 2) & 0x03);
	info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03);

	return 0;
}

static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank)
{
	struct amdgpu_aca *aca = &adev->aca;
	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;

	if (!smu_funcs || !smu_funcs->parse_error_code)
		return -EOPNOTSUPP;

	return smu_funcs->parse_error_code(adev, bank);
}

int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size)
{
	int i, error_code;

	if (!bank || !err_codes)
		return -EINVAL;

	error_code = aca_bank_get_error_code(adev, bank);
	if (error_code < 0)
		return error_code;

	for (i = 0; i < size; i++) {
		if (err_codes[i] == error_code)
			return 0;
	}

	return -EINVAL;
}

int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en)
{
	struct amdgpu_aca *aca = &adev->aca;
	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;

	if (!smu_funcs || !smu_funcs->set_debug_mode)
		return -EOPNOTSUPP;

	return smu_funcs->set_debug_mode(adev, en);
}

#if defined(CONFIG_DEBUG_FS)
static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)data;
	int ret;

	ret = amdgpu_ras_set_aca_debug_mode(adev, val ? true : false);
	if (ret)
		return ret;

	dev_info(adev->dev, "amdgpu set smu aca debug mode %s success\n", val ? "on" : "off");

	return 0;
}

static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)
{
	struct aca_bank_info info;
	int i, ret;

	ret = aca_bank_info_decode(bank, &info);
	if (ret)
		return;

	seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE");
	seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
		   idx, info.socket_id, info.die_id, info.hwid, info.mcatype);

	for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
		seq_printf(m, "aca entry[%d].regs[%d]: 0x%016llx\n", idx, aca_regs[i].reg_idx, bank->regs[aca_regs[i].reg_idx]);
}

struct aca_dump_context {
	struct seq_file *m;
	int idx;
};

static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank,
				 enum aca_smu_type type, void *data)
{
	struct aca_dump_context *ctx = (struct aca_dump_context *)data;

	aca_dump_entry(ctx->m, bank, type, ctx->idx++);

	return handler_aca_log_bank_error(handle, bank, type, NULL);
}

static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
	struct aca_dump_context context = {
		.m = m,
		.idx = 0,
	};

	return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
}

static int aca_dump_ce_show(struct seq_file *m, void *unused)
{
	return aca_dump_show(m, ACA_SMU_TYPE_CE);
}

static int aca_dump_ce_open(struct inode *inode, struct file *file)
{
	return single_open(file, aca_dump_ce_show, inode->i_private);
}

static const struct file_operations aca_ce_dump_debug_fops = {
	.owner = THIS_MODULE,
	.open = aca_dump_ce_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int aca_dump_ue_show(struct seq_file *m, void *unused)
{
	return aca_dump_show(m, ACA_SMU_TYPE_UE);
}

static int aca_dump_ue_open(struct inode *inode, struct file *file)
{
	return single_open(file, aca_dump_ue_show, inode->i_private);
}

static const struct file_operations aca_ue_dump_debug_fops = {
	.owner = THIS_MODULE,
	.open = aca_dump_ue_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

DEFINE_DEBUGFS_ATTRIBUTE(aca_debug_mode_fops, NULL, amdgpu_aca_smu_debug_mode_set, "%llu\n");
#endif

void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root)
{
#if defined(CONFIG_DEBUG_FS)
	if (!root)
		return;

	debugfs_create_file("aca_debug_mode", 0200, root, adev, &aca_debug_mode_fops);
	debugfs_create_file("aca_ue_dump", 0400, root, adev, &aca_ue_dump_debug_fops);
	debugfs_create_file("aca_ce_dump", 0400, root, adev, &aca_ce_dump_debug_fops);
#endif
}