1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright 2025 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24 #include "ras.h"
25 #include "ras_aca.h"
26 #include "ras_aca_v1_0.h"
27 #include "ras_mp1_v13_0.h"
28
/*
 * Bit flags for ras_aca.ue_updated_mark:
 * FATAL marks that a fatal error occurred; UE_READ marks that the UE
 * banks have already been fetched once since the fatal mark was set
 * (see aca_dup_update_ue_in_fatal()).
 */
#define ACA_MARK_FATAL_FLAG 0x100
#define ACA_MARK_UE_READ_FLAG 0x1

/* Shorthand: resolve a RAS block id to its printable name. */
#define blk_name(block_id) ras_core_get_ras_block_name(block_id)

/*
 * Table mapping printable ACA register names to their index in
 * struct aca_bank_reg::regs; iterated by aca_bank_log() when dumping
 * a bank to the kernel log.  Order defines the dump order.
 */
static struct aca_regs_dump {
	const char *name;	/* label printed in the log line */
	int reg_idx;		/* index into struct aca_bank_reg::regs */
} aca_regs[] = {
	{"CONTROL", ACA_REG_IDX__CTL},
	{"STATUS", ACA_REG_IDX__STATUS},
	{"ADDR", ACA_REG_IDX__ADDR},
	{"MISC", ACA_REG_IDX__MISC0},
	{"CONFIG", ACA_REG_IDX__CONFG},
	{"IPID", ACA_REG_IDX__IPID},
	{"SYND", ACA_REG_IDX__SYND},
	{"DESTAT", ACA_REG_IDX__DESTAT},
	{"DEADDR", ACA_REG_IDX__DEADDR},
	{"CONTROL_MASK", ACA_REG_IDX__CTL_MASK},
};
49
50
/*
 * Print a per-AID summary of newly detected and accumulated ECC errors
 * for @blk.  @new_ecc holds the counts from the bank just parsed;
 * @aid_ecc holds the running totals for this socket/die.
 */
static void aca_report_ecc_info(struct ras_core_context *ras_core,
				u64 seq_no, u32 blk, u32 skt, u32 aid,
				struct aca_aid_ecc *aid_ecc,
				struct aca_bank_ecc *new_ecc)
{
	struct aca_ecc_count sum = {0};
	const char *de_desc;

	sum.new_ue_count = new_ecc->ue_count;
	sum.new_de_count = new_ecc->de_count;
	sum.new_ce_count = new_ecc->ce_count;

	if (blk != RAS_BLOCK_ID__GFX) {
		sum.total_ue_count = aid_ecc->ecc_err.total_ue_count;
		sum.total_de_count = aid_ecc->ecc_err.total_de_count;
		sum.total_ce_count = aid_ecc->ecc_err.total_ce_count;
	} else {
		/* GFX totals are kept per XCD; fold them into one sum. */
		int xcd_id;

		for (xcd_id = 0; xcd_id < aid_ecc->xcd.xcd_num; xcd_id++) {
			struct aca_ecc_count *xcd_ecc =
				&aid_ecc->xcd.xcd[xcd_id].ecc_err;

			sum.total_ue_count += xcd_ecc->total_ue_count;
			sum.total_de_count += xcd_ecc->total_de_count;
			sum.total_ce_count += xcd_ecc->total_ce_count;
		}
	}

	/* Deferred-error wording depends on the block: only UMC reports
	 * true deferred errors; other blocks report poison consumption.
	 */
	de_desc = (blk == RAS_BLOCK_ID__UMC) ?
		  "deferred hardware errors" : "poison consumption";

	if (sum.new_ue_count) {
		RAS_DEV_INFO(ras_core->dev,
			"{%llu} socket: %d, die: %d, %u new uncorrectable hardware errors detected in %s block\n",
			seq_no, skt, aid, sum.new_ue_count, blk_name(blk));
		RAS_DEV_INFO(ras_core->dev,
			"{%llu} socket: %d, die: %d, %u uncorrectable hardware errors detected in total in %s block\n",
			seq_no, skt, aid, sum.total_ue_count, blk_name(blk));
	}

	if (sum.new_de_count) {
		RAS_DEV_INFO(ras_core->dev,
			"{%llu} socket: %d, die: %d, %u new %s detected in %s block\n",
			seq_no, skt, aid, sum.new_de_count, de_desc,
			blk_name(blk));
		RAS_DEV_INFO(ras_core->dev,
			"{%llu} socket: %d, die: %d, %u %s detected in total in %s block\n",
			seq_no, skt, aid, sum.total_de_count, de_desc,
			blk_name(blk));
	}

	if (sum.new_ce_count) {
		RAS_DEV_INFO(ras_core->dev,
			"{%llu} socket: %d, die: %d, %u new correctable hardware errors detected in %s block\n",
			seq_no, skt, aid, sum.new_ce_count, blk_name(blk));
		RAS_DEV_INFO(ras_core->dev,
			"{%llu} socket: %d, die: %d, %u correctable hardware errors detected in total in %s block\n",
			seq_no, skt, aid, sum.total_ce_count, blk_name(blk));
	}
}
110
aca_bank_log(struct ras_core_context * ras_core,int idx,int total,struct aca_bank_reg * bank,struct aca_bank_ecc * bank_ecc)111 static void aca_bank_log(struct ras_core_context *ras_core,
112 int idx, int total, struct aca_bank_reg *bank,
113 struct aca_bank_ecc *bank_ecc)
114 {
115 int i;
116
117 RAS_DEV_INFO(ras_core->dev,
118 "{%llu}" RAS_HW_ERR "Accelerator Check Architecture events logged\n",
119 bank->seq_no);
120 /* plus 1 for output format, e.g: ACA[08/08]: xxxx */
121 for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
122 RAS_DEV_INFO(ras_core->dev,
123 "{%llu}" RAS_HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
124 bank->seq_no, idx + 1, total,
125 aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
126 }
127
aca_log_bank_data(struct ras_core_context * ras_core,struct aca_bank_reg * bank,struct aca_bank_ecc * bank_ecc,struct ras_log_batch_tag * batch)128 static void aca_log_bank_data(struct ras_core_context *ras_core,
129 struct aca_bank_reg *bank, struct aca_bank_ecc *bank_ecc,
130 struct ras_log_batch_tag *batch)
131 {
132 if (bank_ecc->ue_count)
133 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_UE, bank->regs, batch);
134 else if (bank_ecc->de_count)
135 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_DE, bank->regs, batch);
136 else
137 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_CE, bank->regs, batch);
138 }
139
/*
 * Query MP1 firmware for the number of valid ACA banks of @type.
 * On success *count holds the bank count; returns the ras_mp1 status.
 */
static int aca_get_bank_count(struct ras_core_context *ras_core,
			      enum ras_err_type type, u32 *count)
{
	return ras_mp1_get_bank_count(ras_core, type, count);
}
145
aca_match_bank(struct aca_block * aca_blk,struct aca_bank_reg * bank)146 static bool aca_match_bank(struct aca_block *aca_blk, struct aca_bank_reg *bank)
147 {
148 const struct aca_bank_hw_ops *bank_ops;
149
150 if (!aca_blk->blk_info)
151 return false;
152
153 bank_ops = &aca_blk->blk_info->bank_ops;
154 if (!bank_ops->bank_match)
155 return false;
156
157 return bank_ops->bank_match(aca_blk, bank);
158 }
159
aca_parse_bank(struct ras_core_context * ras_core,struct aca_block * aca_blk,struct aca_bank_reg * bank,struct aca_bank_ecc * ecc)160 static int aca_parse_bank(struct ras_core_context *ras_core,
161 struct aca_block *aca_blk,
162 struct aca_bank_reg *bank,
163 struct aca_bank_ecc *ecc)
164 {
165 const struct aca_bank_hw_ops *bank_ops = &aca_blk->blk_info->bank_ops;
166
167 if (!bank_ops || !bank_ops->bank_parse)
168 return -RAS_CORE_NOT_SUPPORTED;
169
170 return bank_ops->bank_parse(ras_core, aca_blk, bank, ecc);
171 }
172
aca_check_block_ecc_info(struct ras_core_context * ras_core,struct aca_block * aca_blk,struct aca_ecc_info * info)173 static int aca_check_block_ecc_info(struct ras_core_context *ras_core,
174 struct aca_block *aca_blk, struct aca_ecc_info *info)
175 {
176 if (info->socket_id >= aca_blk->ecc.socket_num_per_hive) {
177 RAS_DEV_ERR(ras_core->dev,
178 "Socket id (%d) is out of config! max:%u\n",
179 info->socket_id, aca_blk->ecc.socket_num_per_hive);
180 return -ENODATA;
181 }
182
183 if (info->die_id >= aca_blk->ecc.socket[info->socket_id].aid_num) {
184 RAS_DEV_ERR(ras_core->dev,
185 "Die id (%d) is out of config! max:%u\n",
186 info->die_id, aca_blk->ecc.socket[info->socket_id].aid_num);
187 return -ENODATA;
188 }
189
190 if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX) &&
191 (info->xcd_id >=
192 aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num)) {
193 RAS_DEV_ERR(ras_core->dev,
194 "Xcd id (%d) is out of config! max:%u\n",
195 info->xcd_id,
196 aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num);
197 return -ENODATA;
198 }
199
200 return 0;
201 }
202
/*
 * Account one parsed bad bank against the per-AID (or per-XCD for GFX)
 * counters of @aca_blk, forward UMC deferred errors to the UMC bad-page
 * logger, and print a per-AID summary.
 *
 * Counter updates are done under aca_lock; the UMC hand-off and the
 * summary print run unlocked.  Returns 0 on success or a negative error
 * if the ids in @bank_ecc are out of the configured topology.
 */
static int aca_log_bad_bank(struct ras_core_context *ras_core,
		struct aca_block *aca_blk, struct aca_bank_reg *bank,
		struct aca_bank_ecc *bank_ecc)
{
	struct aca_ecc_info *info;
	struct aca_ecc_count *ecc_err;
	struct aca_aid_ecc *aid_ecc;
	int ret;

	info = &bank_ecc->bank_info;

	/* Reject out-of-range socket/die/xcd ids before touching counters. */
	ret = aca_check_block_ecc_info(ras_core, aca_blk, info);
	if (ret)
		return ret;

	mutex_lock(&ras_core->ras_aca.aca_lock);
	aid_ecc = &aca_blk->ecc.socket[info->socket_id].aid[info->die_id];
	/* GFX counters live per XCD; every other block counts per AID. */
	if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX)
		ecc_err = &aid_ecc->xcd.xcd[info->xcd_id].ecc_err;
	else
		ecc_err = &aid_ecc->ecc_err;

	/* "new" counts reset on query; "total" counts accumulate. */
	ecc_err->new_ce_count += bank_ecc->ce_count;
	ecc_err->total_ce_count += bank_ecc->ce_count;
	ecc_err->new_ue_count += bank_ecc->ue_count;
	ecc_err->total_ue_count += bank_ecc->ue_count;
	ecc_err->new_de_count += bank_ecc->de_count;
	ecc_err->total_de_count += bank_ecc->de_count;
	mutex_unlock(&ras_core->ras_aca.aca_lock);

	/*
	 * UMC deferred errors feed the bad-page flow.  While the GPU is
	 * in reset the record is queued for later processing instead of
	 * being logged immediately.
	 */
	if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC) &&
	    bank_ecc->de_count) {
		struct ras_bank_ecc ras_ecc = {0};

		ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core);
		ras_ecc.addr = bank_ecc->bank_info.addr;
		ras_ecc.ipid = bank_ecc->bank_info.ipid;
		ras_ecc.status = bank_ecc->bank_info.status;
		ras_ecc.seq_no = bank->seq_no;

		if (ras_core_gpu_in_reset(ras_core))
			ras_umc_log_bad_bank_pending(ras_core, &ras_ecc);
		else
			ras_umc_log_bad_bank(ras_core, &ras_ecc);
	}

	aca_report_ecc_info(ras_core,
		bank->seq_no, aca_blk->blk_info->ras_block_id, info->socket_id, info->die_id,
		&aca_blk->ecc.socket[info->socket_id].aid[info->die_id], bank_ecc);

	return 0;
}
255
aca_get_bank_aca_block(struct ras_core_context * ras_core,struct aca_bank_reg * bank)256 static struct aca_block *aca_get_bank_aca_block(struct ras_core_context *ras_core,
257 struct aca_bank_reg *bank)
258 {
259 int i = 0;
260
261 for (i = 0; i < RAS_BLOCK_ID__LAST; i++)
262 if (aca_match_bank(&ras_core->ras_aca.aca_blk[i], bank))
263 return &ras_core->ras_aca.aca_blk[i];
264
265 return NULL;
266 }
267
/*
 * Read all registers of bank @idx from MP1 into @data (a struct
 * aca_bank_reg).  Stops and returns the error of the first failing
 * register read; 0 on success.
 */
static int aca_dump_bank(struct ras_core_context *ras_core, u32 ecc_type,
			 int idx, void *data)
{
	struct aca_bank_reg *bank = data;
	int reg, reg_cnt, ret;

	/* Cap at 16 registers per bank — presumably the firmware
	 * interface limit; TODO confirm against the MP1 spec.
	 */
	reg_cnt = min_t(int, 16, ARRAY_SIZE(bank->regs));
	for (reg = 0; reg < reg_cnt; reg++) {
		ret = ras_mp1_dump_bank(ras_core, ecc_type, idx, reg,
					&bank->regs[reg]);
		if (ret)
			return ret;
	}

	return 0;
}
283
aca_get_bank_seqno(struct ras_core_context * ras_core,enum ras_err_type err_type,struct aca_block * aca_blk,struct aca_bank_ecc * bank_ecc)284 static uint64_t aca_get_bank_seqno(struct ras_core_context *ras_core,
285 enum ras_err_type err_type, struct aca_block *aca_blk,
286 struct aca_bank_ecc *bank_ecc)
287 {
288 uint64_t seq_no = 0;
289
290 if (bank_ecc->de_count) {
291 if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC)
292 seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_DE, true);
293 else
294 seq_no = ras_core_get_seqno(ras_core,
295 RAS_SEQNO_TYPE_POISON_CONSUMPTION, true);
296 } else if (bank_ecc->ue_count) {
297 seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_UE, true);
298 } else {
299 seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_CE, true);
300 }
301
302 return seq_no;
303 }
304
/*
 * Decide whether a UE bank update can be skipped because the UE banks
 * were already fetched once within the current fatal-error window.
 *
 * When a fatal error has been marked (ACA_MARK_FATAL_FLAG set by
 * ras_aca_mark_fatal_flag()), the first UE query records
 * ACA_MARK_UE_READ_FLAG and still returns false so the read proceeds;
 * any subsequent UE query in the same window returns true (duplicate).
 * Non-UE queries are never skipped.
 */
static bool aca_dup_update_ue_in_fatal(struct ras_core_context *ras_core,
		u32 ecc_type)
{
	struct ras_aca *aca = &ras_core->ras_aca;

	if (ecc_type != RAS_ERR_TYPE__UE)
		return false;

	if (aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) {
		if (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG)
			return true;

		/* First UE read after the fatal mark: remember it. */
		aca->ue_updated_mark |= ACA_MARK_UE_READ_FLAG;
	}

	return false;
}
322
ras_aca_mark_fatal_flag(struct ras_core_context * ras_core)323 void ras_aca_mark_fatal_flag(struct ras_core_context *ras_core)
324 {
325 struct ras_aca *aca = &ras_core->ras_aca;
326
327 if (!aca)
328 return;
329
330 aca->ue_updated_mark |= ACA_MARK_FATAL_FLAG;
331 }
332
ras_aca_clear_fatal_flag(struct ras_core_context * ras_core)333 void ras_aca_clear_fatal_flag(struct ras_core_context *ras_core)
334 {
335 struct ras_aca *aca = &ras_core->ras_aca;
336
337 if (!aca)
338 return;
339
340 if ((aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) &&
341 (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG))
342 aca->ue_updated_mark = 0;
343 }
344
/*
 * Fetch every valid bank of @ecc_type from firmware, log it, and fold
 * its error counts into the owning block's counters.
 *
 * Serialized by bank_op_lock.  Duplicate UE queries inside a fatal
 * window are skipped entirely.  Each bank is always dumped to the log
 * ring and kernel log, even if no block claims it or parsing fails;
 * counter accounting only happens for successfully parsed, claimed
 * banks.  Processing stops at the first hard error.
 */
static int aca_banks_update(struct ras_core_context *ras_core,
		u32 ecc_type, void *data)
{
	struct aca_bank_reg bank;
	struct aca_block *aca_blk;
	struct aca_bank_ecc bank_ecc;
	struct ras_log_batch_tag *batch_tag = NULL;
	u32 count = 0;
	int ret = 0;
	int i;

	mutex_lock(&ras_core->ras_aca.bank_op_lock);

	/* Second UE pass inside a fatal window: nothing new to read. */
	if (aca_dup_update_ue_in_fatal(ras_core, ecc_type))
		goto out;

	ret = aca_get_bank_count(ras_core, ecc_type, &count);
	if (ret)
		goto out;

	if (!count)
		goto out;

	/* One batch tag groups all log-ring entries of this pass. */
	batch_tag = ras_log_ring_create_batch_tag(ras_core);
	for (i = 0; i < count; i++) {
		memset(&bank, 0, sizeof(bank));
		ret = aca_dump_bank(ras_core, ecc_type, i, &bank);
		if (ret)
			break;

		bank.ecc_type = ecc_type;

		memset(&bank_ecc, 0, sizeof(bank_ecc));
		/* A bank with no owning block is logged but not counted. */
		aca_blk = aca_get_bank_aca_block(ras_core, &bank);
		if (aca_blk)
			ret = aca_parse_bank(ras_core, aca_blk, &bank, &bank_ecc);

		bank.seq_no = aca_get_bank_seqno(ras_core, ecc_type, aca_blk, &bank_ecc);

		/* Log unconditionally so even unparsed banks are visible. */
		aca_log_bank_data(ras_core, &bank, &bank_ecc, batch_tag);
		aca_bank_log(ras_core, i, count, &bank, &bank_ecc);

		if (!ret && aca_blk)
			ret = aca_log_bad_bank(ras_core, aca_blk, &bank, &bank_ecc);

		if (ret)
			break;
	}
	ras_log_ring_destroy_batch_tag(ras_core, batch_tag);

out:
	mutex_unlock(&ras_core->ras_aca.bank_op_lock);
	return ret;
}
399
/*
 * Public entry point: pull the latest ACA banks of @type from firmware
 * into the per-block error caches.
 */
int ras_aca_update_ecc(struct ras_core_context *ras_core, u32 type, void *data)
{
	int ret;

	/* Update aca bank to aca source error_cache first */
	ret = aca_banks_update(ras_core, type, data);

	return ret;
}
405
/*
 * Resolve a RAS block id to its ACA block state.  No bounds check:
 * callers are expected to pass blk < RAS_BLOCK_ID__LAST.
 */
static struct aca_block *ras_aca_get_block_handle(struct ras_core_context *ras_core, uint32_t blk)
{
	struct ras_aca *aca = &ras_core->ras_aca;

	return &aca->aca_blk[blk];
}
410
/*
 * Zero all cached error counters ("new" and "total") of one block,
 * across every socket/die (and XCD for GFX).  Runs under aca_lock.
 */
static int ras_aca_clear_block_ecc_count(struct ras_core_context *ras_core, u32 blk)
{
	struct aca_block *aca_blk;
	struct aca_aid_ecc *aid_ecc;
	int skt, aid, xcd;

	mutex_lock(&ras_core->ras_aca.aca_lock);
	aca_blk = ras_aca_get_block_handle(ras_core, blk);
	for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
		for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
			aid_ecc = &aca_blk->ecc.socket[skt].aid[aid];
			if (blk != RAS_BLOCK_ID__GFX) {
				memset(&aid_ecc->ecc_err, 0, sizeof(aid_ecc->ecc_err));
				continue;
			}
			/* GFX keeps per-XCD counters; wipe each one. */
			for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++)
				memset(&aid_ecc->xcd.xcd[xcd], 0,
				       sizeof(aid_ecc->xcd.xcd[xcd]));
		}
	}
	mutex_unlock(&ras_core->ras_aca.aca_lock);

	return 0;
}
435
ras_aca_clear_all_blocks_ecc_count(struct ras_core_context * ras_core)436 int ras_aca_clear_all_blocks_ecc_count(struct ras_core_context *ras_core)
437 {
438 enum ras_block_id blk;
439 int ret;
440
441 for (blk = RAS_BLOCK_ID__UMC; blk < RAS_BLOCK_ID__LAST; blk++) {
442 ret = ras_aca_clear_block_ecc_count(ras_core, blk);
443 if (ret)
444 break;
445 }
446
447 return ret;
448 }
449
ras_aca_clear_block_new_ecc_count(struct ras_core_context * ras_core,u32 blk)450 int ras_aca_clear_block_new_ecc_count(struct ras_core_context *ras_core, u32 blk)
451 {
452 struct aca_block *aca_blk;
453 int skt, aid, xcd;
454 struct aca_ecc_count *ecc_err;
455 struct aca_aid_ecc *aid_ecc;
456
457 mutex_lock(&ras_core->ras_aca.aca_lock);
458 aca_blk = ras_aca_get_block_handle(ras_core, blk);
459 for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
460 for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
461 aid_ecc = &aca_blk->ecc.socket[skt].aid[aid];
462 if (blk == RAS_BLOCK_ID__GFX) {
463 for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++) {
464 ecc_err = &aid_ecc->xcd.xcd[xcd].ecc_err;
465 ecc_err->new_ce_count = 0;
466 ecc_err->new_ue_count = 0;
467 ecc_err->new_de_count = 0;
468 }
469 } else {
470 ecc_err = &aid_ecc->ecc_err;
471 ecc_err->new_ce_count = 0;
472 ecc_err->new_ue_count = 0;
473 ecc_err->new_de_count = 0;
474 }
475 }
476 }
477 mutex_unlock(&ras_core->ras_aca.aca_lock);
478
479 return 0;
480 }
481
/*
 * Copy the cached error counters of one AID (or one XCD for GFX, where
 * @xcd selects it) into @ecc_count.  Caller holds aca_lock and passes
 * in-range indices.  Always returns 0.
 */
static int ras_aca_get_block_each_aid_ecc_count(struct ras_core_context *ras_core,
		u32 blk, u32 skt, u32 aid, u32 xcd,
		struct aca_ecc_count *ecc_count)
{
	struct aca_block *aca_blk = ras_aca_get_block_handle(ras_core, blk);
	struct aca_ecc_count *src;

	if (blk == RAS_BLOCK_ID__GFX)
		src = &aca_blk->ecc.socket[skt].aid[aid].xcd.xcd[xcd].ecc_err;
	else
		src = &aca_blk->ecc.socket[skt].aid[aid].ecc_err;

	ecc_count->new_ce_count = src->new_ce_count;
	ecc_count->total_ce_count = src->total_ce_count;
	ecc_count->new_ue_count = src->new_ue_count;
	ecc_count->total_ue_count = src->total_ue_count;
	ecc_count->new_de_count = src->new_de_count;
	ecc_count->total_de_count = src->total_de_count;

	return 0;
}
504
_add_ecc_count(struct aca_ecc_count * des,struct aca_ecc_count * src)505 static inline void _add_ecc_count(struct aca_ecc_count *des, struct aca_ecc_count *src)
506 {
507 des->new_ce_count += src->new_ce_count;
508 des->total_ce_count += src->total_ce_count;
509 des->new_ue_count += src->new_ue_count;
510 des->total_ue_count += src->total_ue_count;
511 des->new_de_count += src->new_de_count;
512 des->total_de_count += src->total_de_count;
513 }
514
/*
 * Map an ACA IP version to its function table.  Returns NULL (after
 * logging) for unsupported versions.
 */
static const struct ras_aca_ip_func *aca_get_ip_func(
	struct ras_core_context *ras_core, uint32_t ip_version)
{
	if (ip_version == IP_VERSION(1, 0, 0))
		return &ras_aca_func_v1_0;

	RAS_DEV_ERR(ras_core->dev,
		    "ACA ip version(0x%x) is not supported!\n", ip_version);

	return NULL;
}
529
/*
 * Sum the cached error counters of @blk over every socket and die
 * (and every XCD for GFX) and copy the result into @data, a
 * struct ras_ecc_count.  The whole accumulation and copy-out run under
 * aca_lock so the snapshot is consistent.  Returns -EINVAL for an
 * out-of-range block id or NULL @data, 0 otherwise.
 */
int ras_aca_get_block_ecc_count(struct ras_core_context *ras_core,
		u32 blk, void *data)
{
	struct ras_ecc_count *err_data = (struct ras_ecc_count *)data;
	struct aca_block *aca_blk;
	int skt, aid, xcd;
	struct aca_ecc_count ecc_xcd;	/* per-XCD sample (GFX only) */
	struct aca_ecc_count ecc_aid;	/* per-AID accumulator */
	struct aca_ecc_count ecc;	/* whole-block accumulator */

	if (blk >= RAS_BLOCK_ID__LAST)
		return -EINVAL;

	if (!err_data)
		return -EINVAL;

	aca_blk = ras_aca_get_block_handle(ras_core, blk);
	memset(&ecc, 0, sizeof(ecc));

	mutex_lock(&ras_core->ras_aca.aca_lock);
	if (blk == RAS_BLOCK_ID__GFX) {
		/* GFX: counters live per XCD; fold XCDs into the AID,
		 * then AIDs into the block total.
		 */
		for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
			for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
				memset(&ecc_aid, 0, sizeof(ecc_aid));
				for (xcd = 0;
				     xcd < aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num;
				     xcd++) {
					memset(&ecc_xcd, 0, sizeof(ecc_xcd));
					if (ras_aca_get_block_each_aid_ecc_count(ras_core,
					    blk, skt, aid, xcd, &ecc_xcd))
						continue;
					_add_ecc_count(&ecc_aid, &ecc_xcd);
				}
				_add_ecc_count(&ecc, &ecc_aid);
			}
		}
	} else {
		/* Other blocks: one counter set per AID (xcd index unused). */
		for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
			for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
				memset(&ecc_aid, 0, sizeof(ecc_aid));
				if (ras_aca_get_block_each_aid_ecc_count(ras_core,
				    blk, skt, aid, 0, &ecc_aid))
					continue;
				_add_ecc_count(&ecc, &ecc_aid);
			}
		}
	}

	err_data->new_ce_count = ecc.new_ce_count;
	err_data->total_ce_count = ecc.total_ce_count;
	err_data->new_ue_count = ecc.new_ue_count;
	err_data->total_ue_count = ecc.total_ue_count;
	err_data->new_de_count = ecc.new_de_count;
	err_data->total_de_count = ecc.total_de_count;
	mutex_unlock(&ras_core->ras_aca.aca_lock);

	return 0;
}
588
ras_aca_sw_init(struct ras_core_context * ras_core)589 int ras_aca_sw_init(struct ras_core_context *ras_core)
590 {
591 struct ras_aca *ras_aca = &ras_core->ras_aca;
592 struct ras_aca_config *aca_cfg = &ras_core->config->aca_cfg;
593 struct aca_block *aca_blk;
594 uint32_t socket_num_per_hive;
595 uint32_t aid_num_per_socket;
596 uint32_t xcd_num_per_aid;
597 int blk, skt, aid;
598
599 socket_num_per_hive = aca_cfg->socket_num_per_hive;
600 aid_num_per_socket = aca_cfg->aid_num_per_socket;
601 xcd_num_per_aid = aca_cfg->xcd_num_per_aid;
602
603 if (!xcd_num_per_aid || !aid_num_per_socket ||
604 (socket_num_per_hive > MAX_SOCKET_NUM_PER_HIVE) ||
605 (aid_num_per_socket > MAX_AID_NUM_PER_SOCKET) ||
606 (xcd_num_per_aid > MAX_XCD_NUM_PER_AID)) {
607 RAS_DEV_ERR(ras_core->dev, "Invalid ACA system configuration: %d, %d, %d\n",
608 socket_num_per_hive, aid_num_per_socket, xcd_num_per_aid);
609 return -EINVAL;
610 }
611
612 memset(ras_aca, 0, sizeof(*ras_aca));
613
614 for (blk = 0; blk < RAS_BLOCK_ID__LAST; blk++) {
615 aca_blk = &ras_aca->aca_blk[blk];
616 aca_blk->ecc.socket_num_per_hive = socket_num_per_hive;
617 for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
618 aca_blk->ecc.socket[skt].aid_num = aid_num_per_socket;
619 if (blk == RAS_BLOCK_ID__GFX) {
620 for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++)
621 aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num =
622 xcd_num_per_aid;
623 }
624 }
625 }
626
627 mutex_init(&ras_aca->aca_lock);
628 mutex_init(&ras_aca->bank_op_lock);
629
630 return 0;
631 }
632
ras_aca_sw_fini(struct ras_core_context * ras_core)633 int ras_aca_sw_fini(struct ras_core_context *ras_core)
634 {
635 struct ras_aca *ras_aca = &ras_core->ras_aca;
636
637 mutex_destroy(&ras_aca->aca_lock);
638 mutex_destroy(&ras_aca->bank_op_lock);
639
640 return 0;
641 }
642
ras_aca_hw_init(struct ras_core_context * ras_core)643 int ras_aca_hw_init(struct ras_core_context *ras_core)
644 {
645 struct ras_aca *ras_aca = &ras_core->ras_aca;
646 struct aca_block *aca_blk;
647 const struct ras_aca_ip_func *ip_func;
648 int i;
649
650 ras_aca->aca_ip_version = ras_core->config->aca_ip_version;
651 ip_func = aca_get_ip_func(ras_core, ras_aca->aca_ip_version);
652 if (!ip_func)
653 return -EINVAL;
654
655 for (i = 0; i < ip_func->block_num; i++) {
656 aca_blk = &ras_aca->aca_blk[ip_func->block_info[i]->ras_block_id];
657 aca_blk->blk_info = ip_func->block_info[i];
658 }
659
660 ras_aca->ue_updated_mark = 0;
661
662 return 0;
663 }
664
ras_aca_hw_fini(struct ras_core_context * ras_core)665 int ras_aca_hw_fini(struct ras_core_context *ras_core)
666 {
667 struct ras_aca *ras_aca = &ras_core->ras_aca;
668
669 ras_aca->ue_updated_mark = 0;
670
671 return 0;
672 }
673