1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright 2025 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24 #include "ras.h"
25 #include "ras_core_status.h"
26 #include "ras_log_ring.h"
27 #include "ras_cper.h"
28
/* Notification-type GUIDs; fill_section_hdr() stores one of these in hdr->notify_type. */
static const struct ras_cper_guid MCE = CPER_NOTIFY__MCE;
static const struct ras_cper_guid CMC = CPER_NOTIFY__CMC;
static const struct ras_cper_guid BOOT = BOOT__TYPE;

/* Section-type GUIDs handed to fill_section_descriptor() as sec_type. */
static const struct ras_cper_guid CRASHDUMP = GPU__CRASHDUMP;
static const struct ras_cper_guid RUNTIME = GPU__NONSTANDARD_ERROR;
35
cper_get_timestamp(struct ras_core_context * ras_core,struct ras_cper_timestamp * timestamp,uint64_t utc_second_timestamp)36 static void cper_get_timestamp(struct ras_core_context *ras_core,
37 struct ras_cper_timestamp *timestamp, uint64_t utc_second_timestamp)
38 {
39 struct ras_time tm = {0};
40
41 ras_core_convert_timestamp_to_time(ras_core, utc_second_timestamp, &tm);
42 timestamp->seconds = tm.tm_sec;
43 timestamp->minutes = tm.tm_min;
44 timestamp->hours = tm.tm_hour;
45 timestamp->flag = 0;
46 timestamp->day = tm.tm_mday;
47 timestamp->month = tm.tm_mon;
48 timestamp->year = tm.tm_year % 100;
49 timestamp->century = tm.tm_year / 100;
50 }
51
fill_section_hdr(struct ras_core_context * ras_core,struct cper_section_hdr * hdr,enum ras_cper_type type,enum ras_cper_severity sev,struct ras_log_info * trace)52 static void fill_section_hdr(struct ras_core_context *ras_core,
53 struct cper_section_hdr *hdr, enum ras_cper_type type,
54 enum ras_cper_severity sev, struct ras_log_info *trace)
55 {
56 struct device_system_info dev_info = {0};
57 char record_id[32];
58
59 hdr->signature[0] = 'C';
60 hdr->signature[1] = 'P';
61 hdr->signature[2] = 'E';
62 hdr->signature[3] = 'R';
63 hdr->revision = CPER_HDR__REV_1;
64 hdr->signature_end = 0xFFFFFFFF;
65 hdr->error_severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev);
66
67 hdr->valid_bits.platform_id = 1;
68 hdr->valid_bits.timestamp = 1;
69
70 ras_core_get_device_system_info(ras_core, &dev_info);
71
72 cper_get_timestamp(ras_core, &hdr->timestamp, trace->timestamp);
73
74 snprintf(record_id, sizeof(record_id), "%d:%llX", dev_info.socket_id,
75 RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno));
76 memcpy(hdr->record_id, record_id, 8);
77
78 snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
79 dev_info.vendor_id, dev_info.device_id);
80 /* pmfw version should be part of creator_id according to CPER spec */
81 snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID__AMDGPU);
82
83 switch (type) {
84 case RAS_CPER_TYPE_BOOT:
85 hdr->notify_type = BOOT;
86 break;
87 case RAS_CPER_TYPE_FATAL:
88 case RAS_CPER_TYPE_RMA:
89 hdr->notify_type = MCE;
90 break;
91 case RAS_CPER_TYPE_RUNTIME:
92 if (sev == RAS_CPER_SEV_NON_FATAL_CE)
93 hdr->notify_type = CMC;
94 else
95 hdr->notify_type = MCE;
96 break;
97 default:
98 RAS_DEV_ERR(ras_core->dev, "Unknown CPER Type\n");
99 break;
100 }
101 }
102
fill_section_descriptor(struct ras_core_context * ras_core,struct cper_section_descriptor * descriptor,enum ras_cper_severity sev,struct ras_cper_guid sec_type,uint32_t section_offset,uint32_t section_length)103 static int fill_section_descriptor(struct ras_core_context *ras_core,
104 struct cper_section_descriptor *descriptor,
105 enum ras_cper_severity sev,
106 struct ras_cper_guid sec_type,
107 uint32_t section_offset,
108 uint32_t section_length)
109 {
110 struct device_system_info dev_info = {0};
111
112 descriptor->revision_minor = CPER_SEC__MINOR_REV_1;
113 descriptor->revision_major = CPER_SEC__MAJOR_REV_22;
114 descriptor->sec_offset = section_offset;
115 descriptor->sec_length = section_length;
116 descriptor->valid_bits.fru_text = 1;
117 descriptor->flag_bits.primary = 1;
118 descriptor->severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev);
119 descriptor->sec_type = sec_type;
120
121 ras_core_get_device_system_info(ras_core, &dev_info);
122
123 snprintf(descriptor->fru_text, 20, "OAM%d", dev_info.socket_id);
124
125 if (sev == RAS_CPER_SEV_RMA)
126 descriptor->flag_bits.exceed_err_threshold = 1;
127
128 if (sev == RAS_CPER_SEV_NON_FATAL_UE)
129 descriptor->flag_bits.latent_err = 1;
130
131 return 0;
132 }
133
fill_section_fatal(struct ras_core_context * ras_core,struct cper_section_fatal * fatal,struct ras_log_info * trace)134 static int fill_section_fatal(struct ras_core_context *ras_core,
135 struct cper_section_fatal *fatal, struct ras_log_info *trace)
136 {
137 fatal->data.reg_ctx_type = CPER_CTX_TYPE__CRASH;
138 fatal->data.reg_arr_size = sizeof(fatal->data.reg);
139
140 fatal->data.reg.status = trace->aca_reg.regs[RAS_CPER_ACA_REG_STATUS];
141 fatal->data.reg.addr = trace->aca_reg.regs[RAS_CPER_ACA_REG_ADDR];
142 fatal->data.reg.ipid = trace->aca_reg.regs[RAS_CPER_ACA_REG_IPID];
143 fatal->data.reg.synd = trace->aca_reg.regs[RAS_CPER_ACA_REG_SYND];
144
145 return 0;
146 }
147
fill_section_runtime(struct ras_core_context * ras_core,struct cper_section_runtime * runtime,struct ras_log_info * trace,enum ras_cper_severity sev)148 static int fill_section_runtime(struct ras_core_context *ras_core,
149 struct cper_section_runtime *runtime, struct ras_log_info *trace,
150 enum ras_cper_severity sev)
151 {
152 runtime->hdr.valid_bits.err_info_cnt = 1;
153 runtime->hdr.valid_bits.err_context_cnt = 1;
154
155 runtime->descriptor.error_type = RUNTIME;
156 runtime->descriptor.ms_chk_bits.err_type_valid = 1;
157 if (sev == RAS_CPER_SEV_RMA) {
158 runtime->descriptor.valid_bits.ms_chk = 1;
159 runtime->descriptor.ms_chk_bits.err_type = 1;
160 runtime->descriptor.ms_chk_bits.pcc = 1;
161 }
162
163 runtime->reg.reg_ctx_type = CPER_CTX_TYPE__CRASH;
164 runtime->reg.reg_arr_size = sizeof(runtime->reg.reg_dump);
165
166 runtime->reg.reg_dump[RAS_CPER_ACA_REG_CTL] = trace->aca_reg.regs[ACA_REG_IDX__CTL];
167 runtime->reg.reg_dump[RAS_CPER_ACA_REG_STATUS] = trace->aca_reg.regs[ACA_REG_IDX__STATUS];
168 runtime->reg.reg_dump[RAS_CPER_ACA_REG_ADDR] = trace->aca_reg.regs[ACA_REG_IDX__ADDR];
169 runtime->reg.reg_dump[RAS_CPER_ACA_REG_MISC0] = trace->aca_reg.regs[ACA_REG_IDX__MISC0];
170 runtime->reg.reg_dump[RAS_CPER_ACA_REG_CONFIG] = trace->aca_reg.regs[ACA_REG_IDX__CONFG];
171 runtime->reg.reg_dump[RAS_CPER_ACA_REG_IPID] = trace->aca_reg.regs[ACA_REG_IDX__IPID];
172 runtime->reg.reg_dump[RAS_CPER_ACA_REG_SYND] = trace->aca_reg.regs[ACA_REG_IDX__SYND];
173
174 return 0;
175 }
176
cper_generate_runtime_record(struct ras_core_context * ras_core,struct cper_section_hdr * hdr,struct ras_log_info ** trace_arr,uint32_t arr_num,enum ras_cper_severity sev)177 static int cper_generate_runtime_record(struct ras_core_context *ras_core,
178 struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t arr_num,
179 enum ras_cper_severity sev)
180 {
181 struct cper_section_descriptor *descriptor;
182 struct cper_section_runtime *runtime;
183 int i;
184
185 fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, trace_arr[0]);
186 hdr->record_length = RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + RAS_NONSTD_SEC_LEN) * arr_num);
187 hdr->sec_cnt = arr_num;
188 for (i = 0; i < arr_num; i++) {
189 descriptor = (struct cper_section_descriptor *)((uint8_t *)hdr +
190 RAS_SEC_DESC_OFFSET(i));
191 runtime = (struct cper_section_runtime *)((uint8_t *)hdr +
192 RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i));
193
194 fill_section_descriptor(ras_core, descriptor, sev, RUNTIME,
195 RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i),
196 sizeof(struct cper_section_runtime));
197 fill_section_runtime(ras_core, runtime, trace_arr[i], sev);
198 }
199
200 return 0;
201 }
202
cper_generate_fatal_record(struct ras_core_context * ras_core,uint8_t * buffer,struct ras_log_info ** trace_arr,uint32_t arr_num)203 static int cper_generate_fatal_record(struct ras_core_context *ras_core,
204 uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num)
205 {
206 struct ras_cper_fatal_record record = {0};
207 int i = 0;
208
209 for (i = 0; i < arr_num; i++) {
210 fill_section_hdr(ras_core, &record.hdr, RAS_CPER_TYPE_FATAL,
211 RAS_CPER_SEV_FATAL_UE, trace_arr[i]);
212 record.hdr.record_length = RAS_HDR_LEN + RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN;
213 record.hdr.sec_cnt = 1;
214
215 fill_section_descriptor(ras_core, &record.descriptor, RAS_CPER_SEV_FATAL_UE,
216 CRASHDUMP, offsetof(struct ras_cper_fatal_record, fatal),
217 sizeof(struct cper_section_fatal));
218
219 fill_section_fatal(ras_core, &record.fatal, trace_arr[i]);
220
221 memcpy(buffer + (i * record.hdr.record_length),
222 &record, record.hdr.record_length);
223 }
224
225 return 0;
226 }
227
cper_get_record_size(enum ras_cper_type type,uint16_t section_count)228 static int cper_get_record_size(enum ras_cper_type type, uint16_t section_count)
229 {
230 int size = 0;
231
232 size += RAS_HDR_LEN;
233 size += (RAS_SEC_DESC_LEN * section_count);
234
235 switch (type) {
236 case RAS_CPER_TYPE_RUNTIME:
237 case RAS_CPER_TYPE_RMA:
238 size += (RAS_NONSTD_SEC_LEN * section_count);
239 break;
240 case RAS_CPER_TYPE_FATAL:
241 size += (RAS_FATAL_SEC_LEN * section_count);
242 size += (RAS_HDR_LEN * (section_count - 1));
243 break;
244 case RAS_CPER_TYPE_BOOT:
245 size += (RAS_BOOT_SEC_LEN * section_count);
246 break;
247 default:
248 /* should never reach here */
249 break;
250 }
251
252 return size;
253 }
254
cper_ras_log_event_to_cper_type(enum ras_log_event event)255 static enum ras_cper_type cper_ras_log_event_to_cper_type(enum ras_log_event event)
256 {
257 switch (event) {
258 case RAS_LOG_EVENT_UE:
259 return RAS_CPER_TYPE_FATAL;
260 case RAS_LOG_EVENT_DE:
261 case RAS_LOG_EVENT_CE:
262 case RAS_LOG_EVENT_POISON_CREATION:
263 case RAS_LOG_EVENT_POISON_CONSUMPTION:
264 return RAS_CPER_TYPE_RUNTIME;
265 case RAS_LOG_EVENT_RMA:
266 return RAS_CPER_TYPE_RMA;
267 default:
268 /* should never reach here */
269 return RAS_CPER_TYPE_RUNTIME;
270 }
271 }
272
ras_cper_generate_cper(struct ras_core_context * ras_core,struct ras_log_info ** trace_list,uint32_t count,uint8_t * buf,uint32_t buf_len,uint32_t * real_data_len)273 int ras_cper_generate_cper(struct ras_core_context *ras_core,
274 struct ras_log_info **trace_list, uint32_t count,
275 uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len)
276 {
277 uint8_t *buffer = buf;
278 uint64_t buf_size = buf_len;
279 int record_size, saved_size = 0;
280 struct cper_section_hdr *hdr;
281
282 /* All the batch traces share the same event */
283 record_size = cper_get_record_size(
284 cper_ras_log_event_to_cper_type(trace_list[0]->event), count);
285
286 if ((record_size + saved_size) > buf_size)
287 return -ENOMEM;
288
289 hdr = (struct cper_section_hdr *)(buffer + saved_size);
290
291 switch (trace_list[0]->event) {
292 case RAS_LOG_EVENT_RMA:
293 cper_generate_runtime_record(ras_core, hdr, trace_list, count, RAS_CPER_SEV_RMA);
294 break;
295 case RAS_LOG_EVENT_DE:
296 cper_generate_runtime_record(ras_core,
297 hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_UE);
298 break;
299 case RAS_LOG_EVENT_CE:
300 cper_generate_runtime_record(ras_core,
301 hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_CE);
302 break;
303 case RAS_LOG_EVENT_UE:
304 cper_generate_fatal_record(ras_core, buffer + saved_size, trace_list, count);
305 break;
306 default:
307 RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n", trace_list[0]->event);
308 break;
309 }
310
311 saved_size += record_size;
312
313 *real_data_len = saved_size;
314 return 0;
315 }
316