// SPDX-License-Identifier: MIT
/*
 * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "amdgpu.h"
#include "amdgpu_reset.h"
#include "amdgpu_xgmi.h"
#include "ras_sys.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_ras_process.h"

/* Re-arm period (ms) for the bad-page retirement delayed work. */
#define RAS_MGR_RETIRE_PAGE_INTERVAL 100
/* Max time (ms) to wait for in-flight RAS event processing before a reset. */
#define RAS_EVENT_PROCESS_TIMEOUT 1200
33
ras_process_retire_page_dwork(struct work_struct * work)34 static void ras_process_retire_page_dwork(struct work_struct *work)
35 {
36 struct amdgpu_ras_mgr *ras_mgr =
37 container_of(work, struct amdgpu_ras_mgr, retire_page_dwork.work);
38 struct amdgpu_device *adev = ras_mgr->adev;
39 int ret;
40
41 if (amdgpu_ras_is_rma(adev))
42 return;
43
44 /* If gpu reset is ongoing, delay retiring the bad pages */
45 if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
46 schedule_delayed_work(&ras_mgr->retire_page_dwork,
47 msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL * 3));
48 return;
49 }
50
51 ret = ras_umc_handle_bad_pages(ras_mgr->ras_core, NULL);
52 if (!ret)
53 schedule_delayed_work(&ras_mgr->retire_page_dwork,
54 msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL));
55 }
56
amdgpu_ras_process_init(struct amdgpu_device * adev)57 int amdgpu_ras_process_init(struct amdgpu_device *adev)
58 {
59 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
60
61 ras_mgr->is_paused = false;
62 init_completion(&ras_mgr->ras_event_done);
63
64 INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork);
65
66 return 0;
67 }
68
amdgpu_ras_process_fini(struct amdgpu_device * adev)69 int amdgpu_ras_process_fini(struct amdgpu_device *adev)
70 {
71 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
72
73 ras_mgr->is_paused = false;
74 /* Save all cached bad pages to eeprom */
75 flush_delayed_work(&ras_mgr->retire_page_dwork);
76 cancel_delayed_work_sync(&ras_mgr->retire_page_dwork);
77 return 0;
78 }
79
amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device * adev,void * data)80 int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev, void *data)
81 {
82 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
83
84 if (!ras_mgr->ras_core)
85 return -EINVAL;
86
87 return ras_process_add_interrupt_req(ras_mgr->ras_core, NULL, true);
88 }
89
amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device * adev,void * data)90 int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data)
91 {
92 amdgpu_ras_set_fed(adev, true);
93 return amdgpu_ras_mgr_reset_gpu(adev, AMDGPU_RAS_GPU_RESET_MODE1_RESET);
94 }
95
amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device * adev,void * data)96 int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data)
97 {
98 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
99 struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
100 struct ras_event_req req;
101 uint64_t seqno;
102
103 if (!ih_info)
104 return -EINVAL;
105
106 memset(&req, 0, sizeof(req));
107 req.block = ih_info->block;
108 req.data = ih_info->data;
109 req.pasid = ih_info->pasid;
110 req.pasid_fn = ih_info->pasid_fn;
111 req.reset = ih_info->reset;
112
113 seqno = ras_core_get_seqno(ras_mgr->ras_core,
114 RAS_SEQNO_TYPE_POISON_CONSUMPTION, false);
115
116 /* When the ACA register cannot be read from FW, the poison
117 * consumption seqno in the fifo will not pop up, so it is
118 * necessary to check whether the seqno is the previous seqno.
119 */
120 if (seqno == ras_mgr->last_poison_consumption_seqno) {
121 /* Pop and discard the previous seqno */
122 ras_core_get_seqno(ras_mgr->ras_core,
123 RAS_SEQNO_TYPE_POISON_CONSUMPTION, true);
124 seqno = ras_core_get_seqno(ras_mgr->ras_core,
125 RAS_SEQNO_TYPE_POISON_CONSUMPTION, false);
126 }
127 ras_mgr->last_poison_consumption_seqno = seqno;
128 req.seqno = seqno;
129
130 return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false);
131 }
132
amdgpu_ras_process_begin(struct amdgpu_device * adev)133 int amdgpu_ras_process_begin(struct amdgpu_device *adev)
134 {
135 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
136
137 if (ras_mgr->is_paused)
138 return -EAGAIN;
139
140 reinit_completion(&ras_mgr->ras_event_done);
141 return 0;
142 }
143
amdgpu_ras_process_end(struct amdgpu_device * adev)144 int amdgpu_ras_process_end(struct amdgpu_device *adev)
145 {
146 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
147
148 complete(&ras_mgr->ras_event_done);
149 return 0;
150 }
151
amdgpu_ras_process_pre_reset(struct amdgpu_device * adev)152 int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev)
153 {
154 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
155 long rc;
156
157 if (!ras_mgr || !ras_mgr->ras_core)
158 return -EINVAL;
159
160 if (!ras_mgr->ras_core->is_initialized)
161 return -EPERM;
162
163 ras_mgr->is_paused = true;
164
165 /* Wait for RAS event processing to complete */
166 rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done,
167 msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT));
168 if (rc <= 0)
169 RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n",
170 rc ? "interrupted" : "timeout");
171
172 flush_delayed_work(&ras_mgr->retire_page_dwork);
173 return 0;
174 }
175
amdgpu_ras_process_post_reset(struct amdgpu_device * adev)176 int amdgpu_ras_process_post_reset(struct amdgpu_device *adev)
177 {
178 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
179
180 if (!ras_mgr || !ras_mgr->ras_core)
181 return -EINVAL;
182
183 if (!ras_mgr->ras_core->is_initialized)
184 return -EPERM;
185
186 ras_mgr->is_paused = false;
187
188 schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
189 return 0;
190 }
191