1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright 2025 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24 #include "ras_sys.h"
25 #include "amdgpu_ras_mgr.h"
26 #include "amdgpu_ras.h"
27 #include "amdgpu_reset.h"
28
amdgpu_ras_sys_detect_fatal_event(struct ras_core_context * ras_core,void * data)29 static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data)
30 {
31 struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
32 int ret;
33 uint64_t seq_no;
34
35 ret = amdgpu_ras_global_ras_isr(adev);
36 if (ret)
37 return ret;
38
39 seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE);
40 RAS_DEV_INFO(adev,
41 "{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",
42 seq_no);
43
44 return amdgpu_ras_process_handle_unexpected_interrupt(adev, data);
45 }
46
amdgpu_ras_sys_poison_consumption_event(struct ras_core_context * ras_core,void * data)47 static int amdgpu_ras_sys_poison_consumption_event(struct ras_core_context *ras_core,
48 void *data)
49 {
50 struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
51 struct ras_event_req *req = (struct ras_event_req *)data;
52 pasid_notify pasid_fn;
53
54 if (!req)
55 return -EINVAL;
56
57 if (req->pasid_fn) {
58 pasid_fn = (pasid_notify)req->pasid_fn;
59 pasid_fn(adev, req->pasid, req->data);
60 }
61
62 return 0;
63 }
64
/*
 * Generate a RAS event sequence number for @seqno_type.
 *
 * The sequence counter lives in the XGMI hive's event manager when the
 * device belongs to a hive, otherwise in the per-device RAS manager, so
 * all devices of a hive share one numbering space.  A fatal event raised
 * while recovery is already in flight reuses the last fatal sequence
 * number instead of allocating a new one, so every message of a single
 * recovery carries the same id.
 *
 * Returns 0 and stores the number in @seqno, or -EINVAL on bad input or
 * an unmapped sequence-number type.
 */
static int amdgpu_ras_sys_gen_seqno(struct ras_core_context *ras_core,
				    enum ras_seqno_type seqno_type, uint64_t *seqno)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct ras_event_manager *event_mgr;
	struct ras_event_state *event_state;
	struct amdgpu_hive_info *hive;
	enum ras_event_type event_type;
	uint64_t seq_no;

	if (!ras_mgr || !seqno ||
	    (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
		return -EINVAL;

	switch (seqno_type) {
	case RAS_SEQNO_TYPE_UE:
		event_type = RAS_EVENT_TYPE_FATAL;
		break;
	case RAS_SEQNO_TYPE_CE:
	case RAS_SEQNO_TYPE_DE:
		event_type = RAS_EVENT_TYPE_POISON_CREATION;
		break;
	case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
		event_type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
		break;
	default:
		event_type = RAS_EVENT_TYPE_INVALID;
		break;
	}

	/*
	 * Never index event_state[] with RAS_EVENT_TYPE_INVALID: if a new
	 * seqno type below COUNT_MAX is ever added without a case above,
	 * the old code would read/write out of bounds.
	 */
	if (event_type == RAS_EVENT_TYPE_INVALID)
		return -EINVAL;

	hive = amdgpu_get_xgmi_hive(adev);
	event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
	event_state = &event_mgr->event_state[event_type];
	if ((event_type == RAS_EVENT_TYPE_FATAL) && amdgpu_ras_in_recovery(adev)) {
		/* Reuse the current fatal id while recovery is running. */
		seq_no = event_state->last_seqno;
	} else {
		seq_no = atomic64_inc_return(&event_mgr->seqno);
		event_state->last_seqno = seq_no;
		atomic64_inc(&event_state->count);
	}
	amdgpu_put_xgmi_hive(hive);

	*seqno = seq_no;
	return 0;
}
112
amdgpu_ras_sys_event_notifier(struct ras_core_context * ras_core,enum ras_notify_event event_id,void * data)113 static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
114 enum ras_notify_event event_id, void *data)
115 {
116 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
117 int ret = 0;
118
119 switch (event_id) {
120 case RAS_EVENT_ID__BAD_PAGE_DETECTED:
121 schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
122 break;
123 case RAS_EVENT_ID__POISON_CONSUMPTION:
124 amdgpu_ras_sys_poison_consumption_event(ras_core, data);
125 break;
126 case RAS_EVENT_ID__RESERVE_BAD_PAGE:
127 ret = amdgpu_ras_reserve_page(ras_core->dev, *(uint64_t *)data);
128 break;
129 case RAS_EVENT_ID__FATAL_ERROR_DETECTED:
130 ret = amdgpu_ras_sys_detect_fatal_event(ras_core, data);
131 break;
132 case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM:
133 ret = amdgpu_dpm_send_hbm_bad_pages_num(ras_core->dev, *(uint32_t *)data);
134 break;
135 case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP:
136 ret = amdgpu_dpm_send_hbm_bad_channel_flag(ras_core->dev, *(uint32_t *)data);
137 break;
138 case RAS_EVENT_ID__DEVICE_RMA:
139 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
140 ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
141 break;
142 case RAS_EVENT_ID__RESET_GPU:
143 ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
144 break;
145 case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN:
146 ret = amdgpu_ras_process_begin(ras_core->dev);
147 break;
148 case RAS_EVENT_ID__RAS_EVENT_PROC_END:
149 ret = amdgpu_ras_process_end(ras_core->dev);
150 break;
151 default:
152 RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id);
153 break;
154 }
155
156 return ret;
157 }
158
amdgpu_ras_sys_get_utc_second_timestamp(struct ras_core_context * ras_core)159 static u64 amdgpu_ras_sys_get_utc_second_timestamp(struct ras_core_context *ras_core)
160 {
161 return ktime_get_real_seconds();
162 }
163
/*
 * Report the current GPU state as a RAS_GPU_STATUS__* bitmask.
 *
 * Sets IN_RESET when a reset or RAS recovery is in progress and IS_VF
 * when running as an SR-IOV virtual function.  Always returns 0.
 */
static int amdgpu_ras_sys_check_gpu_status(struct ras_core_context *ras_core,
					   uint32_t *status)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;

	*status = 0;

	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
		*status |= RAS_GPU_STATUS__IN_RESET;

	if (amdgpu_sriov_vf(adev))
		*status |= RAS_GPU_STATUS__IS_VF;

	return 0;
}
180
amdgpu_ras_sys_get_device_system_info(struct ras_core_context * ras_core,struct device_system_info * dev_info)181 static int amdgpu_ras_sys_get_device_system_info(struct ras_core_context *ras_core,
182 struct device_system_info *dev_info)
183 {
184 struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
185
186 dev_info->device_id = adev->pdev->device;
187 dev_info->vendor_id = adev->pdev->vendor;
188 dev_info->socket_id = adev->smuio.funcs->get_socket_id(adev);
189
190 return 0;
191 }
192
/*
 * Take or release the reset-domain read lock.
 *
 * @down selects lock (true) vs unlock (false); @try selects the
 * non-blocking variant when locking.
 *
 * Returns the down_read_trylock() result (1 = acquired, 0 = contended)
 * for a try-lock, 0 otherwise.
 */
static int amdgpu_ras_sys_gpu_reset_lock(struct ras_core_context *ras_core,
					 bool down, bool try)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	struct rw_semaphore *sem = &adev->reset_domain->sem;

	if (!down) {
		up_read(sem);
		return 0;
	}

	if (try)
		return down_read_trylock(sem);

	down_read(sem);
	return 0;
}
208
amdgpu_ras_sys_detect_ras_interrupt(struct ras_core_context * ras_core)209 static bool amdgpu_ras_sys_detect_ras_interrupt(struct ras_core_context *ras_core)
210 {
211 return !!atomic_read(&amdgpu_ras_in_intr);
212 }
213
amdgpu_ras_sys_get_gpu_mem(struct ras_core_context * ras_core,enum gpu_mem_type mem_type,struct gpu_mem_block * gpu_mem)214 static int amdgpu_ras_sys_get_gpu_mem(struct ras_core_context *ras_core,
215 enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
216 {
217 struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
218 struct psp_context *psp = &adev->psp;
219 struct psp_ring *psp_ring;
220 struct ta_mem_context *mem_ctx;
221
222 if (mem_type == GPU_MEM_TYPE_RAS_PSP_RING) {
223 psp_ring = &psp->km_ring;
224 gpu_mem->mem_bo = adev->firmware.rbuf;
225 gpu_mem->mem_size = psp_ring->ring_size;
226 gpu_mem->mem_mc_addr = psp_ring->ring_mem_mc_addr;
227 gpu_mem->mem_cpu_addr = psp_ring->ring_mem;
228 } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_CMD) {
229 gpu_mem->mem_bo = psp->cmd_buf_bo;
230 gpu_mem->mem_size = PSP_CMD_BUFFER_SIZE;
231 gpu_mem->mem_mc_addr = psp->cmd_buf_mc_addr;
232 gpu_mem->mem_cpu_addr = psp->cmd_buf_mem;
233 } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_FENCE) {
234 gpu_mem->mem_bo = psp->fence_buf_bo;
235 gpu_mem->mem_size = PSP_FENCE_BUFFER_SIZE;
236 gpu_mem->mem_mc_addr = psp->fence_buf_mc_addr;
237 gpu_mem->mem_cpu_addr = psp->fence_buf;
238 } else if (mem_type == GPU_MEM_TYPE_RAS_TA_FW) {
239 gpu_mem->mem_bo = psp->fw_pri_bo;
240 gpu_mem->mem_size = PSP_1_MEG;
241 gpu_mem->mem_mc_addr = psp->fw_pri_mc_addr;
242 gpu_mem->mem_cpu_addr = psp->fw_pri_buf;
243 } else if (mem_type == GPU_MEM_TYPE_RAS_TA_CMD) {
244 mem_ctx = &psp->ras_context.context.mem_context;
245 gpu_mem->mem_bo = mem_ctx->shared_bo;
246 gpu_mem->mem_size = mem_ctx->shared_mem_size;
247 gpu_mem->mem_mc_addr = mem_ctx->shared_mc_addr;
248 gpu_mem->mem_cpu_addr = mem_ctx->shared_buf;
249 } else {
250 return -EINVAL;
251 }
252
253 if (!gpu_mem->mem_bo || !gpu_mem->mem_size ||
254 !gpu_mem->mem_mc_addr || !gpu_mem->mem_cpu_addr) {
255 RAS_DEV_ERR(ras_core->dev, "The ras psp gpu memory is invalid!\n");
256 return -ENOMEM;
257 }
258
259 return 0;
260 }
261
/*
 * Release a block described by amdgpu_ras_sys_get_gpu_mem().
 *
 * The get path only hands out references to memory owned by the PSP
 * code, so there is nothing to free here; the hook exists to satisfy
 * the ras_sys_func interface.  Always returns 0.
 */
static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core,
				      enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
{
	return 0;
}
268
/*
 * System-service hooks the RAS core uses to reach amdgpu: event
 * dispatch, timestamps, sequence numbers, GPU status/info queries,
 * reset locking, interrupt detection and PSP memory-block lookup.
 */
const struct ras_sys_func amdgpu_ras_sys_fn = {
	.ras_notifier = amdgpu_ras_sys_event_notifier,
	.get_utc_second_timestamp = amdgpu_ras_sys_get_utc_second_timestamp,
	.gen_seqno = amdgpu_ras_sys_gen_seqno,
	.check_gpu_status = amdgpu_ras_sys_check_gpu_status,
	.get_device_system_info = amdgpu_ras_sys_get_device_system_info,
	.gpu_reset_lock = amdgpu_ras_sys_gpu_reset_lock,
	.detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
	.get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
	.put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
};
280