// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "ras.h"
#include "ras_process.h"

#define RAS_EVENT_FIFO_SIZE (128 * sizeof(struct ras_event_req))

#define RAS_POLLING_ECC_TIMEOUT 300
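
/* Push one event request into the poison event fifo; -ENOSPC when full. */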
static int ras_process_put_event(struct ras_core_context *ras_core,
				 struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = kfifo_in_spinlocked(&ras_proc->event_fifo,
				  req, sizeof(*req), &ras_proc->fifo_spinlock);
	if (!ret) {
		RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n");
		return -ENOSPC;
	}

	return 0;
}
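
/* Queue an otherwise-empty event whose only payload is a reset cause. */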
static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core,
					   uint32_t reset_cause)
{
	struct ras_event_req req = {0};

	req.reset = reset_cause;

	return ras_process_put_event(ras_core, &req);
}
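
/* Pop one event request; returns the bytes copied, 0 when the fifo is empty. */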
static int ras_process_get_event(struct ras_core_context *ras_core,
				 struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	return kfifo_out_spinlocked(&ras_proc->event_fifo,
				    req, sizeof(*req), &ras_proc->fifo_spinlock);
}
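
/* Drain and discard all queued events, e.g. once a mode-1 reset makes them stale. */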
static void ras_process_clear_event_fifo(struct ras_core_context *ras_core)
{
	struct ras_event_req req;
	int ret;

	do {
		ret = ras_process_get_event(ras_core, &req);
	} while (ret);
}

#define AMDGPU_RAS_WAITING_DATA_READY 200
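/*
 * Poll ECC status until the expected number of UMC deferred errors is
 * observed, or until no new data shows up within the wait budget of
 * AMDGPU_RAS_WAITING_DATA_READY 1 ms polls. If deferred errors were seen
 * and the device is flagged for RMA, queue a GPU reset event.
 */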
static int ras_process_umc_event(struct ras_core_context *ras_core,
				 uint32_t event_count)
{
	struct ras_ecc_count ecc_data;
	int ret = 0;
	uint32_t timeout = 0;
	uint32_t detected_de_count = 0;

	do {
		memset(&ecc_data, 0, sizeof(ecc_data));
		ret = ras_core_update_ecc_info(ras_core);
		if (ret)
			return ret;

		ret = ras_core_query_block_ecc_data(ras_core, RAS_BLOCK_ID__UMC, &ecc_data);
		if (ret)
			return ret;

		if (ecc_data.new_de_count) {
			detected_de_count += ecc_data.new_de_count;
			timeout = 0;
		} else {
			if (!timeout && event_count)
				timeout = AMDGPU_RAS_WAITING_DATA_READY;

			if (timeout) {
				if (!--timeout)
					break;

				msleep(1);
			}
		}
	} while (detected_de_count < event_count);

	if (detected_de_count && ras_core_gpu_is_rma(ras_core))
		ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA);

	return 0;
}
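
/*
 * Dequeue every pending non-UMC poison-consumption event, notify listeners,
 * and issue a single combined GPU reset if any event requested one.
 */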
static int ras_process_non_umc_event(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	struct ras_event_req req;
	uint32_t event_count = kfifo_len(&ras_proc->event_fifo) / sizeof(req);
	uint32_t reset_flags = 0;
	int ret = 0, i;

	for (i = 0; i < event_count; i++) {
		memset(&req, 0, sizeof(req));
		if (!ras_process_get_event(ras_core, &req))
			continue;

		ras_core_event_notify(ras_core,
				      RAS_EVENT_ID__POISON_CONSUMPTION, &req);

		reset_flags |= req.reset;

		if (req.reset == GPU_RESET_CAUSE_RMA)
			continue;

		if (req.reset)
			RAS_DEV_INFO(ras_core->dev,
				     "{%llu} GPU reset for %s RAS poison consumption is issued!\n",
				     req.seqno, ras_core_get_ras_block_name(req.block));
		else
			RAS_DEV_INFO(ras_core->dev,
				     "{%llu} %s RAS poison consumption is issued!\n",
				     req.seqno, ras_core_get_ras_block_name(req.block));
	}

	if (reset_flags) {
		ret = ras_core_event_notify(ras_core,
					    RAS_EVENT_ID__RESET_GPU, &reset_flags);
		if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA))
			return -RAS_CORE_GPU_IN_MODE1_RESET;
	}

	return ret;
}
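
/*
 * Main RAS event dispatch: process all pending UMC interrupts first, then
 * any queued non-UMC events. If a mode-1 reset is underway, pending events
 * and interrupt counts are discarded as stale.
 */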
int ras_process_handle_ras_event(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	uint32_t umc_event_count;
	int ret;

	ret = ras_core_event_notify(ras_core,
				    RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
	if (ret)
		return ret;

	ras_aca_clear_fatal_flag(ras_core);
	ras_umc_log_pending_bad_bank(ras_core);

	do {
		umc_event_count = atomic_read(&ras_proc->umc_interrupt_count);
		ret = ras_process_umc_event(ras_core, umc_event_count);
		if (ret == -RAS_CORE_GPU_IN_MODE1_RESET)
			break;

		if (umc_event_count)
			atomic_sub(umc_event_count, &ras_proc->umc_interrupt_count);
	} while (atomic_read(&ras_proc->umc_interrupt_count));

	if ((ret != -RAS_CORE_GPU_IN_MODE1_RESET) &&
	    (kfifo_len(&ras_proc->event_fifo)))
		ret = ras_process_non_umc_event(ras_core);

	if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) {
		/* Clear poison fifo */
		ras_process_clear_event_fifo(ras_core);
		atomic_set(&ras_proc->umc_interrupt_count, 0);
	}

	ras_core_event_notify(ras_core,
			      RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
	return ret;
}
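
/* Wake condition for the worker: stop requested or an interrupt pending. */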
static int thread_wait_condition(void *param)
{
	struct ras_process *ras_proc = (struct ras_process *)param;

	return (kthread_should_stop() ||
		atomic_read(&ras_proc->ras_interrupt_req));
}
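
/*
 * Worker thread body: wake on an interrupt request or every
 * RAS_POLLING_ECC_TIMEOUT ms, then dispatch to the system-specific async
 * handler when one is registered, falling back to synchronous handling.
 */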
static int ras_process_thread(void *context)
{
	struct ras_core_context *ras_core = (struct ras_core_context *)context;
	struct ras_process *ras_proc = &ras_core->ras_proc;

	while (!kthread_should_stop()) {
		ras_wait_event_interruptible_timeout(&ras_proc->ras_process_wq,
				thread_wait_condition, ras_proc,
				msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT));

		if (kthread_should_stop())
			break;

		if (!ras_core->is_initialized)
			continue;

		atomic_set(&ras_proc->ras_interrupt_req, 0);

		if (ras_core_gpu_in_reset(ras_core))
			continue;

		if (ras_core->sys_fn && ras_core->sys_fn->async_handle_ras_event)
			ras_core->sys_fn->async_handle_ras_event(ras_core, NULL);
		else
			ras_process_handle_ras_event(ras_core);
	}

	return 0;
}
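
/* Set up the event fifo, waitqueue and worker thread for RAS processing. */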
int ras_process_init(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL);
	if (ret)
		return ret;

	spin_lock_init(&ras_proc->fifo_spinlock);

	init_waitqueue_head(&ras_proc->ras_process_wq);

	ras_proc->ras_process_thread = kthread_run(ras_process_thread,
			(void *)ras_core, "ras_process_thread");
	if (IS_ERR(ras_proc->ras_process_thread)) {
		RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n");
		ret = PTR_ERR(ras_proc->ras_process_thread);
		ras_proc->ras_process_thread = NULL;
		goto err;
	}

	return 0;

err:
	ras_process_fini(ras_core);
	return ret;
}
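
/* Stop the worker thread and release the event fifo. */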
int ras_process_fini(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	if (ras_proc->ras_process_thread) {
		kthread_stop(ras_proc->ras_process_thread);
		ras_proc->ras_process_thread = NULL;
	}

	kfifo_free(&ras_proc->event_fifo);

	return 0;
}
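
/*
 * UMC interrupts are only counted here; the worker thread later polls the
 * ECC data itself, so @req is not queued.
 */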
static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core,
					     struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	atomic_inc(&ras_proc->umc_interrupt_count);
	atomic_inc(&ras_proc->ras_interrupt_req);

	wake_up(&ras_proc->ras_process_wq);
	return 0;
}
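
/* Non-UMC interrupts carry a full request that is queued for the worker. */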
static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core,
						 struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = ras_process_put_event(ras_core, req);
	if (!ret) {
		atomic_inc(&ras_proc->ras_interrupt_req);
		wake_up(&ras_proc->ras_process_wq);
	}

	return ret;
}
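
/*
 * Entry point for interrupt handlers: route a RAS interrupt request to the
 * UMC or non-UMC path and wake the worker thread.
 */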
int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
				  struct ras_event_req *req, bool is_umc)
{
	int ret;

	if (!ras_core)
		return -EINVAL;

	if (!ras_core->is_initialized)
		return -EPERM;

	if (is_umc)
		ret = ras_process_add_umc_interrupt_req(ras_core, req);
	else
		ret = ras_process_add_non_umc_interrupt_req(ras_core, req);

	return ret;
}