1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2025 Intel Corporation
4 */
5
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12
13 #include "xe_device.h"
14 #include "xe_gt.h"
15 #include "xe_heci_gsc.h"
16 #include "xe_mmio.h"
17 #include "xe_pcode_api.h"
18 #include "xe_vsec.h"
19
/*
 * Number of entries in the survivability info array: used as the allocation
 * count in xe_survivability_mode_enable() and as the iteration bound when the
 * entries are logged or emitted through sysfs.
 */
#define MAX_SCRATCH_MMIO 8
21
22 /**
23 * DOC: Xe Boot Survivability
24 *
 * Boot Survivability is a software based workflow for recovering a system in a failed boot
 * state. Here system recoverability is concerned with recovering the firmware responsible for
 * boot.
27 *
28 * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
29 * to be flashed through mei and collect telemetry. The driver's probe flow is modified
30 * such that it enters survivability mode when pcode initialization is incomplete and boot status
 * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
 * survivability mode and provides additional information required for debug.
33 *
 * KMD exposes the below admin-only readable sysfs in survivability mode:
35 *
36 * device/survivability_mode: The presence of this file indicates that the card is in survivability
37 * mode. Also, provides additional information on why the driver entered
38 * survivability mode.
39 *
40 * Capability Information - Provides boot status
41 * Postcode Information - Provides information about the failure
42 * Overflow Information - Provides history of previous failures
43 * Auxiliary Information - Certain failures may have information in
44 * addition to postcode information
45 */
46
/* Extract the AUXINFO_HISTORY_OFFSET field from a raw register value. */
static u32 aux_history_offset(u32 reg_value)
{
	u32 offset = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);

	return offset;
}
51
set_survivability_info(struct xe_mmio * mmio,struct xe_survivability_info * info,int id,char * name)52 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
53 int id, char *name)
54 {
55 strscpy(info[id].name, name, sizeof(info[id].name));
56 info[id].reg = PCODE_SCRATCH(id).raw;
57 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
58 }
59
populate_survivability_info(struct xe_device * xe)60 static void populate_survivability_info(struct xe_device *xe)
61 {
62 struct xe_survivability *survivability = &xe->survivability;
63 struct xe_survivability_info *info = survivability->info;
64 struct xe_mmio *mmio;
65 u32 id = 0, reg_value;
66 char name[NAME_MAX];
67 int index;
68
69 mmio = xe_root_tile_mmio(xe);
70 set_survivability_info(mmio, info, id, "Capability Info");
71 reg_value = info[id].value;
72
73 if (reg_value & HISTORY_TRACKING) {
74 id++;
75 set_survivability_info(mmio, info, id, "Postcode Info");
76
77 if (reg_value & OVERFLOW_SUPPORT) {
78 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
79 set_survivability_info(mmio, info, id, "Overflow Info");
80 }
81 }
82
83 if (reg_value & AUXINFO_SUPPORT) {
84 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
85
86 for (index = 0; id && reg_value; index++, reg_value = info[id].value,
87 id = aux_history_offset(reg_value)) {
88 snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
89 set_survivability_info(mmio, info, id, name);
90 }
91 }
92 }
93
log_survivability_info(struct pci_dev * pdev)94 static void log_survivability_info(struct pci_dev *pdev)
95 {
96 struct xe_device *xe = pdev_to_xe_device(pdev);
97 struct xe_survivability *survivability = &xe->survivability;
98 struct xe_survivability_info *info = survivability->info;
99 int id;
100
101 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
102 survivability->boot_status);
103 for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
104 if (info[id].reg)
105 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
106 info[id].reg, info[id].value);
107 }
108 }
109
survivability_mode_show(struct device * dev,struct device_attribute * attr,char * buff)110 static ssize_t survivability_mode_show(struct device *dev,
111 struct device_attribute *attr, char *buff)
112 {
113 struct pci_dev *pdev = to_pci_dev(dev);
114 struct xe_device *xe = pdev_to_xe_device(pdev);
115 struct xe_survivability *survivability = &xe->survivability;
116 struct xe_survivability_info *info = survivability->info;
117 int index = 0, count = 0;
118
119 for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
120 if (info[index].reg)
121 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
122 info[index].reg, info[index].value);
123 }
124
125 return count;
126 }
127
128 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
129
xe_survivability_mode_fini(void * arg)130 static void xe_survivability_mode_fini(void *arg)
131 {
132 struct xe_device *xe = arg;
133 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
134 struct device *dev = &pdev->dev;
135
136 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
137 }
138
enable_survivability_mode(struct pci_dev * pdev)139 static int enable_survivability_mode(struct pci_dev *pdev)
140 {
141 struct device *dev = &pdev->dev;
142 struct xe_device *xe = pdev_to_xe_device(pdev);
143 struct xe_survivability *survivability = &xe->survivability;
144 int ret = 0;
145
146 /* create survivability mode sysfs */
147 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
148 if (ret) {
149 dev_warn(dev, "Failed to create survivability sysfs files\n");
150 return ret;
151 }
152
153 ret = devm_add_action_or_reset(xe->drm.dev,
154 xe_survivability_mode_fini, xe);
155 if (ret)
156 return ret;
157
158 /* Make sure xe_heci_gsc_init() knows about survivability mode */
159 survivability->mode = true;
160
161 ret = xe_heci_gsc_init(xe);
162 if (ret) {
163 /*
164 * But if it fails, device can't enter survivability
165 * so move it back for correct error handling
166 */
167 survivability->mode = false;
168 return ret;
169 }
170
171 xe_vsec_init(xe);
172
173 dev_err(dev, "In Survivability Mode\n");
174
175 return 0;
176 }
177
178 /**
179 * xe_survivability_mode_is_enabled - check if survivability mode is enabled
180 * @xe: xe device instance
181 *
182 * Returns true if in survivability mode, false otherwise
183 */
xe_survivability_mode_is_enabled(struct xe_device * xe)184 bool xe_survivability_mode_is_enabled(struct xe_device *xe)
185 {
186 return xe->survivability.mode;
187 }
188
189 /*
190 * survivability_mode_requested - check if it's possible to enable
191 * survivability mode and that was requested by firmware
192 *
193 * This function reads the boot status from Pcode.
194 *
195 * Return: true if platform support is available and boot status indicates
196 * failure, false otherwise.
197 */
survivability_mode_requested(struct xe_device * xe)198 static bool survivability_mode_requested(struct xe_device *xe)
199 {
200 struct xe_survivability *survivability = &xe->survivability;
201 struct xe_mmio *mmio = xe_root_tile_mmio(xe);
202 u32 data;
203
204 if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
205 return false;
206
207 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
208 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
209
210 return survivability->boot_status == NON_CRITICAL_FAILURE ||
211 survivability->boot_status == CRITICAL_FAILURE;
212 }
213
214 /**
215 * xe_survivability_mode_enable - Initialize and enable the survivability mode
216 * @xe: xe device instance
217 *
218 * Initialize survivability information and enable survivability mode
219 *
220 * Return: 0 if survivability mode is enabled or not requested; negative error
221 * code otherwise.
222 */
xe_survivability_mode_enable(struct xe_device * xe)223 int xe_survivability_mode_enable(struct xe_device *xe)
224 {
225 struct xe_survivability *survivability = &xe->survivability;
226 struct xe_survivability_info *info;
227 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
228
229 if (!survivability_mode_requested(xe))
230 return 0;
231
232 survivability->size = MAX_SCRATCH_MMIO;
233
234 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
235 GFP_KERNEL);
236 if (!info)
237 return -ENOMEM;
238
239 survivability->info = info;
240
241 populate_survivability_info(xe);
242
243 /* Only log debug information and exit if it is a critical failure */
244 if (survivability->boot_status == CRITICAL_FAILURE) {
245 log_survivability_info(pdev);
246 return -ENXIO;
247 }
248
249 return enable_survivability_mode(pdev);
250 }
251