1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77 #include "amdgpu_virt.h"
78 #include "amdgpu_dev_coredump.h"
79
80 #include <linux/suspend.h>
81 #include <drm/task_barrier.h>
82 #include <linux/pm_runtime.h>
83
84 #include <drm/drm_drv.h>
85
86 #if IS_ENABLED(CONFIG_X86)
87 #include <asm/intel-family.h>
88 #endif
89
90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
97
98 #define AMDGPU_RESUME_MS 2000
99 #define AMDGPU_MAX_RETRY_LIMIT 2
100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
104
105 #define AMDGPU_VBIOS_SKIP (1U << 0)
106 #define AMDGPU_VBIOS_OPTIONAL (1U << 1)
107
108 static const struct drm_driver amdgpu_kms_driver;
109
110 const char *amdgpu_asic_name[] = {
111 "TAHITI",
112 "PITCAIRN",
113 "VERDE",
114 "OLAND",
115 "HAINAN",
116 "BONAIRE",
117 "KAVERI",
118 "KABINI",
119 "HAWAII",
120 "MULLINS",
121 "TOPAZ",
122 "TONGA",
123 "FIJI",
124 "CARRIZO",
125 "STONEY",
126 "POLARIS10",
127 "POLARIS11",
128 "POLARIS12",
129 "VEGAM",
130 "VEGA10",
131 "VEGA12",
132 "VEGA20",
133 "RAVEN",
134 "ARCTURUS",
135 "RENOIR",
136 "ALDEBARAN",
137 "NAVI10",
138 "CYAN_SKILLFISH",
139 "NAVI14",
140 "NAVI12",
141 "SIENNA_CICHLID",
142 "NAVY_FLOUNDER",
143 "VANGOGH",
144 "DIMGREY_CAVEFISH",
145 "BEIGE_GOBY",
146 "YELLOW_CARP",
147 "IP DISCOVERY",
148 "LAST",
149 };
150
151 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
152 /*
153 * Default init level where all blocks are expected to be initialized. This is
154 * the level of initialization expected by default and also after a full reset
155 * of the device.
156 */
157 struct amdgpu_init_level amdgpu_init_default = {
158 .level = AMDGPU_INIT_LEVEL_DEFAULT,
159 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
160 };
161
162 struct amdgpu_init_level amdgpu_init_recovery = {
163 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
164 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
165 };
166
167 /*
168 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
169 * is used for cases like reset on initialization where the entire hive needs to
170 * be reset before first use.
171 */
172 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
173 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
174 .hwini_ip_block_mask =
175 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
176 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
177 BIT(AMD_IP_BLOCK_TYPE_PSP)
178 };
179
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
181 enum amd_ip_block_type block)
182 {
183 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
184 }
185
void amdgpu_set_init_level(struct amdgpu_device *adev,
187 enum amdgpu_init_lvl_id lvl)
188 {
189 switch (lvl) {
190 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
191 adev->init_lvl = &amdgpu_init_minimal_xgmi;
192 break;
193 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
194 adev->init_lvl = &amdgpu_init_recovery;
195 break;
196 case AMDGPU_INIT_LEVEL_DEFAULT:
197 fallthrough;
198 default:
199 adev->init_lvl = &amdgpu_init_default;
200 break;
201 }
202 }
203
204 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
205 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
206 void *data);
207
208 /**
209 * DOC: pcie_replay_count
210 *
211 * The amdgpu driver provides a sysfs API for reporting the total number
212 * of PCIe replays (NAKs).
213 * The file pcie_replay_count is used for this and returns the total
214 * number of replays as a sum of the NAKs generated and NAKs received.
215 */
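
/*
 * Example (illustrative only; the card index depends on the system):
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 *   0
 */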
216
static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
218 struct device_attribute *attr, char *buf)
219 {
220 struct drm_device *ddev = dev_get_drvdata(dev);
221 struct amdgpu_device *adev = drm_to_adev(ddev);
222 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
223
224 return sysfs_emit(buf, "%llu\n", cnt);
225 }
226
227 static DEVICE_ATTR(pcie_replay_count, 0444,
228 amdgpu_device_get_pcie_replay_count, NULL);
229
static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
231 {
232 int ret = 0;
233
234 if (!amdgpu_sriov_vf(adev))
235 ret = sysfs_create_file(&adev->dev->kobj,
236 &dev_attr_pcie_replay_count.attr);
237
238 return ret;
239 }
240
static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
242 {
243 if (!amdgpu_sriov_vf(adev))
244 sysfs_remove_file(&adev->dev->kobj,
245 &dev_attr_pcie_replay_count.attr);
246 }
247
static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
249 const struct bin_attribute *attr, char *buf,
250 loff_t ppos, size_t count)
251 {
252 struct device *dev = kobj_to_dev(kobj);
253 struct drm_device *ddev = dev_get_drvdata(dev);
254 struct amdgpu_device *adev = drm_to_adev(ddev);
255 ssize_t bytes_read;
256
257 switch (ppos) {
258 case AMDGPU_SYS_REG_STATE_XGMI:
259 bytes_read = amdgpu_asic_get_reg_state(
260 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
261 break;
262 case AMDGPU_SYS_REG_STATE_WAFL:
263 bytes_read = amdgpu_asic_get_reg_state(
264 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
265 break;
266 case AMDGPU_SYS_REG_STATE_PCIE:
267 bytes_read = amdgpu_asic_get_reg_state(
268 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
269 break;
270 case AMDGPU_SYS_REG_STATE_USR:
271 bytes_read = amdgpu_asic_get_reg_state(
272 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
273 break;
274 case AMDGPU_SYS_REG_STATE_USR_1:
275 bytes_read = amdgpu_asic_get_reg_state(
276 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
277 break;
278 default:
279 return -EINVAL;
280 }
281
282 return bytes_read;
283 }
284
285 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
286 AMDGPU_SYS_REG_STATE_END);
287
int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
289 {
290 int ret;
291
292 if (!amdgpu_asic_get_reg_state_supported(adev))
293 return 0;
294
295 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
296
297 return ret;
298 }
299
void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
301 {
302 if (!amdgpu_asic_get_reg_state_supported(adev))
303 return;
304 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
305 }
306
int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
308 {
309 int r;
310
311 if (ip_block->version->funcs->suspend) {
312 r = ip_block->version->funcs->suspend(ip_block);
313 if (r) {
314 dev_err(ip_block->adev->dev,
315 "suspend of IP block <%s> failed %d\n",
316 ip_block->version->funcs->name, r);
317 return r;
318 }
319 }
320
321 ip_block->status.hw = false;
322 return 0;
323 }
324
int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
326 {
327 int r;
328
329 if (ip_block->version->funcs->resume) {
330 r = ip_block->version->funcs->resume(ip_block);
331 if (r) {
332 dev_err(ip_block->adev->dev,
333 "resume of IP block <%s> failed %d\n",
334 ip_block->version->funcs->name, r);
335 return r;
336 }
337 }
338
339 ip_block->status.hw = true;
340 return 0;
341 }
342
343 /**
344 * DOC: board_info
345 *
 * The amdgpu driver provides a sysfs API for reporting board related information.
347 * It provides the form factor information in the format
348 *
349 * type : form factor
350 *
351 * Possible form factor values
352 *
353 * - "cem" - PCIE CEM card
354 * - "oam" - Open Compute Accelerator Module
355 * - "unknown" - Not known
356 *
357 */
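
/*
 * Example (illustrative only; the card index and reported form factor depend
 * on the system):
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam
 */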
358
static ssize_t amdgpu_device_get_board_info(struct device *dev,
360 struct device_attribute *attr,
361 char *buf)
362 {
363 struct drm_device *ddev = dev_get_drvdata(dev);
364 struct amdgpu_device *adev = drm_to_adev(ddev);
365 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
366 const char *pkg;
367
368 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
369 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
370
371 switch (pkg_type) {
372 case AMDGPU_PKG_TYPE_CEM:
373 pkg = "cem";
374 break;
375 case AMDGPU_PKG_TYPE_OAM:
376 pkg = "oam";
377 break;
378 default:
379 pkg = "unknown";
380 break;
381 }
382
383 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
384 }
385
386 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
387
388 static struct attribute *amdgpu_board_attrs[] = {
389 &dev_attr_board_info.attr,
390 NULL,
391 };
392
static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
394 struct attribute *attr, int n)
395 {
396 struct device *dev = kobj_to_dev(kobj);
397 struct drm_device *ddev = dev_get_drvdata(dev);
398 struct amdgpu_device *adev = drm_to_adev(ddev);
399
400 if (adev->flags & AMD_IS_APU)
401 return 0;
402
403 return attr->mode;
404 }
405
406 static const struct attribute_group amdgpu_board_attrs_group = {
407 .attrs = amdgpu_board_attrs,
408 .is_visible = amdgpu_board_attrs_is_visible
409 };
410
411 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
412
413
414 /**
415 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
416 *
417 * @dev: drm_device pointer
418 *
419 * Returns true if the device is a dGPU with ATPX power control,
420 * otherwise return false.
421 */
bool amdgpu_device_supports_px(struct drm_device *dev)
423 {
424 struct amdgpu_device *adev = drm_to_adev(dev);
425
426 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
427 return true;
428 return false;
429 }
430
431 /**
432 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
433 *
434 * @dev: drm_device pointer
435 *
436 * Returns true if the device is a dGPU with ACPI power control,
437 * otherwise return false.
438 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
440 {
441 struct amdgpu_device *adev = drm_to_adev(dev);
442
443 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
444 return false;
445
446 if (adev->has_pr3 ||
447 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
448 return true;
449 return false;
450 }
451
452 /**
453 * amdgpu_device_supports_baco - Does the device support BACO
454 *
455 * @dev: drm_device pointer
456 *
457 * Return:
458 * 1 if the device supports BACO;
459 * 3 if the device supports MACO (only works if BACO is supported)
460 * otherwise return 0.
461 */
int amdgpu_device_supports_baco(struct drm_device *dev)
463 {
464 struct amdgpu_device *adev = drm_to_adev(dev);
465
466 return amdgpu_asic_supports_baco(adev);
467 }
468
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
470 {
471 struct drm_device *dev;
472 int bamaco_support;
473
474 dev = adev_to_drm(adev);
475
476 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
477 bamaco_support = amdgpu_device_supports_baco(dev);
478
479 switch (amdgpu_runtime_pm) {
480 case 2:
481 if (bamaco_support & MACO_SUPPORT) {
482 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
483 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
484 } else if (bamaco_support == BACO_SUPPORT) {
485 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
486 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
487 }
488 break;
489 case 1:
490 if (bamaco_support & BACO_SUPPORT) {
491 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
492 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
493 }
494 break;
495 case -1:
496 case -2:
497 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
498 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
499 dev_info(adev->dev, "Using ATPX for runtime pm\n");
500 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
501 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
502 dev_info(adev->dev, "Using BOCO for runtime pm\n");
503 } else {
504 if (!bamaco_support)
505 goto no_runtime_pm;
506
507 switch (adev->asic_type) {
508 case CHIP_VEGA20:
509 case CHIP_ARCTURUS:
/* BACO is not supported on vega20 and arcturus */
511 break;
512 case CHIP_VEGA10:
513 /* enable BACO as runpm mode if noretry=0 */
514 if (!adev->gmc.noretry)
515 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
516 break;
517 default:
518 /* enable BACO as runpm mode on CI+ */
519 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
520 break;
521 }
522
523 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
524 if (bamaco_support & MACO_SUPPORT) {
525 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
526 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
527 } else {
528 dev_info(adev->dev, "Using BACO for runtime pm\n");
529 }
530 }
531 }
532 break;
533 case 0:
534 dev_info(adev->dev, "runtime pm is manually disabled\n");
535 break;
536 default:
537 break;
538 }
539
540 no_runtime_pm:
541 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
542 dev_info(adev->dev, "Runtime PM not available\n");
543 }
544 /**
545 * amdgpu_device_supports_smart_shift - Is the device dGPU with
546 * smart shift support
547 *
548 * @dev: drm_device pointer
549 *
550 * Returns true if the device is a dGPU with Smart Shift support,
551 * otherwise returns false.
552 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
554 {
555 return (amdgpu_device_supports_boco(dev) &&
556 amdgpu_acpi_is_power_shift_control_supported());
557 }
558
559 /*
560 * VRAM access helper functions
561 */
562
563 /**
564 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
565 *
566 * @adev: amdgpu_device pointer
567 * @pos: offset of the buffer in vram
568 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
570 * @write: true - write to vram, otherwise - read from vram
571 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
573 void *buf, size_t size, bool write)
574 {
575 unsigned long flags;
576 uint32_t hi = ~0, tmp = 0;
577 uint32_t *data = buf;
578 uint64_t last;
579 int idx;
580
581 if (!drm_dev_enter(adev_to_drm(adev), &idx))
582 return;
583
584 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
585
586 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
587 for (last = pos + size; pos < last; pos += 4) {
588 tmp = pos >> 31;
589
590 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
591 if (tmp != hi) {
592 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
593 hi = tmp;
594 }
595 if (write)
596 WREG32_NO_KIQ(mmMM_DATA, *data++);
597 else
598 *data++ = RREG32_NO_KIQ(mmMM_DATA);
599 }
600
601 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
602 drm_dev_exit(idx);
603 }
604
605 /**
606 * amdgpu_device_aper_access - access vram by vram aperture
607 *
608 * @adev: amdgpu_device pointer
609 * @pos: offset of the buffer in vram
610 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
617 void *buf, size_t size, bool write)
618 {
619 #ifdef CONFIG_64BIT
620 void __iomem *addr;
621 size_t count = 0;
622 uint64_t last;
623
624 if (!adev->mman.aper_base_kaddr)
625 return 0;
626
627 last = min(pos + size, adev->gmc.visible_vram_size);
628 if (last > pos) {
629 addr = adev->mman.aper_base_kaddr + pos;
630 count = last - pos;
631
632 if (write) {
633 memcpy_toio(addr, buf, count);
/* Make sure the HDP write cache flush happens without any reordering
 * after the system memory contents are sent over PCIe to the device
 */
637 mb();
638 amdgpu_device_flush_hdp(adev, NULL);
639 } else {
640 amdgpu_device_invalidate_hdp(adev, NULL);
641 /* Make sure HDP read cache is invalidated before issuing a read
642 * to the PCIe device
643 */
644 mb();
645 memcpy_fromio(buf, addr, count);
646 }
647
648 }
649
650 return count;
651 #else
652 return 0;
653 #endif
654 }
655
656 /**
657 * amdgpu_device_vram_access - read/write a buffer in vram
658 *
659 * @adev: amdgpu_device pointer
660 * @pos: offset of the buffer in vram
661 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
663 * @write: true - write to vram, otherwise - read from vram
664 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
666 void *buf, size_t size, bool write)
667 {
668 size_t count;
669
/* try using the VRAM aperture to access VRAM first */
671 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
672 size -= count;
673 if (size) {
/* use MM_INDEX/MM_DATA to access the rest of VRAM */
675 pos += count;
676 buf += count;
677 amdgpu_device_mm_access(adev, pos, buf, size, write);
678 }
679 }
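
/*
 * Usage sketch (illustrative only, not taken from a real caller): read one
 * dword from the start of VRAM into a CPU variable.
 *
 *   u32 val;
 *
 *   amdgpu_device_vram_access(adev, 0, &val, sizeof(val), false);
 */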
680
681 /*
682 * register access helper functions.
683 */
684
685 /* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
687 {
688 if (adev->no_hw_access)
689 return true;
690
691 #ifdef CONFIG_LOCKDEP
692 /*
693 * This is a bit complicated to understand, so worth a comment. What we assert
694 * here is that the GPU reset is not running on another thread in parallel.
695 *
696 * For this we trylock the read side of the reset semaphore, if that succeeds
697 * we know that the reset is not running in parallel.
698 *
699 * If the trylock fails we assert that we are either already holding the read
700 * side of the lock or are the reset thread itself and hold the write side of
701 * the lock.
702 */
703 if (in_task()) {
704 if (down_read_trylock(&adev->reset_domain->sem))
705 up_read(&adev->reset_domain->sem);
706 else
707 lockdep_assert_held(&adev->reset_domain->sem);
708 }
709 #endif
710 return false;
711 }
712
713 /**
714 * amdgpu_device_rreg - read a memory mapped IO or indirect register
715 *
716 * @adev: amdgpu_device pointer
717 * @reg: dword aligned register offset
718 * @acc_flags: access flags which require special behavior
719 *
720 * Returns the 32 bit value from the offset specified.
721 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
723 uint32_t reg, uint32_t acc_flags)
724 {
725 uint32_t ret;
726
727 if (amdgpu_device_skip_hw_access(adev))
728 return 0;
729
730 if ((reg * 4) < adev->rmmio_size) {
731 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
732 amdgpu_sriov_runtime(adev) &&
733 down_read_trylock(&adev->reset_domain->sem)) {
734 ret = amdgpu_kiq_rreg(adev, reg, 0);
735 up_read(&adev->reset_domain->sem);
736 } else {
737 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
738 }
739 } else {
740 ret = adev->pcie_rreg(adev, reg * 4);
741 }
742
743 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
744
745 return ret;
746 }
747
748 /*
 * MMIO register byte read helper function
 * @offset: byte offset from MMIO start
751 */
752
753 /**
754 * amdgpu_mm_rreg8 - read a memory mapped IO register
755 *
756 * @adev: amdgpu_device pointer
757 * @offset: byte aligned register offset
758 *
759 * Returns the 8 bit value from the offset specified.
760 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
762 {
763 if (amdgpu_device_skip_hw_access(adev))
764 return 0;
765
766 if (offset < adev->rmmio_size)
767 return (readb(adev->rmmio + offset));
768 BUG();
769 }
770
771
772 /**
773 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
774 *
775 * @adev: amdgpu_device pointer
776 * @reg: dword aligned register offset
777 * @acc_flags: access flags which require special behavior
778 * @xcc_id: xcc accelerated compute core id
779 *
780 * Returns the 32 bit value from the offset specified.
781 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
783 uint32_t reg, uint32_t acc_flags,
784 uint32_t xcc_id)
785 {
786 uint32_t ret, rlcg_flag;
787
788 if (amdgpu_device_skip_hw_access(adev))
789 return 0;
790
791 if ((reg * 4) < adev->rmmio_size) {
792 if (amdgpu_sriov_vf(adev) &&
793 !amdgpu_sriov_runtime(adev) &&
794 adev->gfx.rlc.rlcg_reg_access_supported &&
795 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
796 GC_HWIP, false,
797 &rlcg_flag)) {
798 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
799 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
800 amdgpu_sriov_runtime(adev) &&
801 down_read_trylock(&adev->reset_domain->sem)) {
802 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
803 up_read(&adev->reset_domain->sem);
804 } else {
805 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
806 }
807 } else {
808 ret = adev->pcie_rreg(adev, reg * 4);
809 }
810
811 return ret;
812 }
813
814 /*
 * MMIO register byte write helper function
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
818 */
819
820 /**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
822 *
823 * @adev: amdgpu_device pointer
824 * @offset: byte aligned register offset
825 * @value: 8 bit value to write
826 *
827 * Writes the value specified to the offset specified.
828 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
830 {
831 if (amdgpu_device_skip_hw_access(adev))
832 return;
833
834 if (offset < adev->rmmio_size)
835 writeb(value, adev->rmmio + offset);
836 else
837 BUG();
838 }
839
840 /**
841 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
842 *
843 * @adev: amdgpu_device pointer
844 * @reg: dword aligned register offset
845 * @v: 32 bit value to write to the register
846 * @acc_flags: access flags which require special behavior
847 *
848 * Writes the value specified to the offset specified.
849 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
851 uint32_t reg, uint32_t v,
852 uint32_t acc_flags)
853 {
854 if (amdgpu_device_skip_hw_access(adev))
855 return;
856
857 if ((reg * 4) < adev->rmmio_size) {
858 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
859 amdgpu_sriov_runtime(adev) &&
860 down_read_trylock(&adev->reset_domain->sem)) {
861 amdgpu_kiq_wreg(adev, reg, v, 0);
862 up_read(&adev->reset_domain->sem);
863 } else {
864 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
865 }
866 } else {
867 adev->pcie_wreg(adev, reg * 4, v);
868 }
869
870 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
871 }
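
/*
 * Usage sketch (illustrative only): most code goes through the RREG32()/
 * WREG32() macros from amdgpu.h, which wrap amdgpu_device_rreg() and
 * amdgpu_device_wreg(), e.g. a read-modify-write where EXAMPLE_MASK is a
 * placeholder:
 *
 *   u32 tmp = RREG32(reg);
 *
 *   tmp |= EXAMPLE_MASK;
 *   WREG32(reg, tmp);
 */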
872
873 /**
874 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
875 *
876 * @adev: amdgpu_device pointer
877 * @reg: mmio/rlc register
878 * @v: value to write
879 * @xcc_id: xcc accelerated compute core id
880 *
881 * this function is invoked only for the debugfs register access
882 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
884 uint32_t reg, uint32_t v,
885 uint32_t xcc_id)
886 {
887 if (amdgpu_device_skip_hw_access(adev))
888 return;
889
890 if (amdgpu_sriov_fullaccess(adev) &&
891 adev->gfx.rlc.funcs &&
892 adev->gfx.rlc.funcs->is_rlcg_access_range) {
893 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
894 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
895 } else if ((reg * 4) >= adev->rmmio_size) {
896 adev->pcie_wreg(adev, reg * 4, v);
897 } else {
898 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
899 }
900 }
901
902 /**
903 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
904 *
905 * @adev: amdgpu_device pointer
906 * @reg: dword aligned register offset
907 * @v: 32 bit value to write to the register
908 * @acc_flags: access flags which require special behavior
909 * @xcc_id: xcc accelerated compute core id
910 *
911 * Writes the value specified to the offset specified.
912 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
914 uint32_t reg, uint32_t v,
915 uint32_t acc_flags, uint32_t xcc_id)
916 {
917 uint32_t rlcg_flag;
918
919 if (amdgpu_device_skip_hw_access(adev))
920 return;
921
922 if ((reg * 4) < adev->rmmio_size) {
923 if (amdgpu_sriov_vf(adev) &&
924 !amdgpu_sriov_runtime(adev) &&
925 adev->gfx.rlc.rlcg_reg_access_supported &&
926 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
927 GC_HWIP, true,
928 &rlcg_flag)) {
929 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
930 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
931 amdgpu_sriov_runtime(adev) &&
932 down_read_trylock(&adev->reset_domain->sem)) {
933 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
934 up_read(&adev->reset_domain->sem);
935 } else {
936 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
937 }
938 } else {
939 adev->pcie_wreg(adev, reg * 4, v);
940 }
941 }
942
943 /**
944 * amdgpu_device_indirect_rreg - read an indirect register
945 *
946 * @adev: amdgpu_device pointer
947 * @reg_addr: indirect register address to read from
948 *
949 * Returns the value of indirect register @reg_addr
950 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
952 u32 reg_addr)
953 {
954 unsigned long flags, pcie_index, pcie_data;
955 void __iomem *pcie_index_offset;
956 void __iomem *pcie_data_offset;
957 u32 r;
958
959 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
960 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
961
962 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
963 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
964 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
965
966 writel(reg_addr, pcie_index_offset);
967 readl(pcie_index_offset);
968 r = readl(pcie_data_offset);
969 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
970
971 return r;
972 }
973
u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
975 u64 reg_addr)
976 {
977 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
978 u32 r;
979 void __iomem *pcie_index_offset;
980 void __iomem *pcie_index_hi_offset;
981 void __iomem *pcie_data_offset;
982
983 if (unlikely(!adev->nbio.funcs)) {
984 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
985 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
986 } else {
987 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
988 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
989 }
990
991 if (reg_addr >> 32) {
992 if (unlikely(!adev->nbio.funcs))
993 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
994 else
995 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
996 } else {
997 pcie_index_hi = 0;
998 }
999
1000 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1001 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1002 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1003 if (pcie_index_hi != 0)
1004 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1005 pcie_index_hi * 4;
1006
1007 writel(reg_addr, pcie_index_offset);
1008 readl(pcie_index_offset);
1009 if (pcie_index_hi != 0) {
1010 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1011 readl(pcie_index_hi_offset);
1012 }
1013 r = readl(pcie_data_offset);
1014
1015 /* clear the high bits */
1016 if (pcie_index_hi != 0) {
1017 writel(0, pcie_index_hi_offset);
1018 readl(pcie_index_hi_offset);
1019 }
1020
1021 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1022
1023 return r;
1024 }
1025
1026 /**
1027 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
1028 *
1029 * @adev: amdgpu_device pointer
1030 * @reg_addr: indirect register address to read from
1031 *
1032 * Returns the value of indirect register @reg_addr
1033 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1035 u32 reg_addr)
1036 {
1037 unsigned long flags, pcie_index, pcie_data;
1038 void __iomem *pcie_index_offset;
1039 void __iomem *pcie_data_offset;
1040 u64 r;
1041
1042 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1043 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1044
1045 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1046 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1047 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1048
1049 /* read low 32 bits */
1050 writel(reg_addr, pcie_index_offset);
1051 readl(pcie_index_offset);
1052 r = readl(pcie_data_offset);
1053 /* read high 32 bits */
1054 writel(reg_addr + 4, pcie_index_offset);
1055 readl(pcie_index_offset);
1056 r |= ((u64)readl(pcie_data_offset) << 32);
1057 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1058
1059 return r;
1060 }
1061
u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1063 u64 reg_addr)
1064 {
1065 unsigned long flags, pcie_index, pcie_data;
1066 unsigned long pcie_index_hi = 0;
1067 void __iomem *pcie_index_offset;
1068 void __iomem *pcie_index_hi_offset;
1069 void __iomem *pcie_data_offset;
1070 u64 r;
1071
1072 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1073 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1074 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1075 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1076
1077 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1078 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1079 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1080 if (pcie_index_hi != 0)
1081 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1082 pcie_index_hi * 4;
1083
1084 /* read low 32 bits */
1085 writel(reg_addr, pcie_index_offset);
1086 readl(pcie_index_offset);
1087 if (pcie_index_hi != 0) {
1088 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1089 readl(pcie_index_hi_offset);
1090 }
1091 r = readl(pcie_data_offset);
1092 /* read high 32 bits */
1093 writel(reg_addr + 4, pcie_index_offset);
1094 readl(pcie_index_offset);
1095 if (pcie_index_hi != 0) {
1096 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1097 readl(pcie_index_hi_offset);
1098 }
1099 r |= ((u64)readl(pcie_data_offset) << 32);
1100
1101 /* clear the high bits */
1102 if (pcie_index_hi != 0) {
1103 writel(0, pcie_index_hi_offset);
1104 readl(pcie_index_hi_offset);
1105 }
1106
1107 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1108
1109 return r;
1110 }
1111
1112 /**
1113 * amdgpu_device_indirect_wreg - write an indirect register address
1114 *
1115 * @adev: amdgpu_device pointer
1116 * @reg_addr: indirect register offset
1117 * @reg_data: indirect register data
1118 *
1119 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1121 u32 reg_addr, u32 reg_data)
1122 {
1123 unsigned long flags, pcie_index, pcie_data;
1124 void __iomem *pcie_index_offset;
1125 void __iomem *pcie_data_offset;
1126
1127 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1128 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1129
1130 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1131 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1132 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1133
1134 writel(reg_addr, pcie_index_offset);
1135 readl(pcie_index_offset);
1136 writel(reg_data, pcie_data_offset);
1137 readl(pcie_data_offset);
1138 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1139 }
1140
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1142 u64 reg_addr, u32 reg_data)
1143 {
1144 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1145 void __iomem *pcie_index_offset;
1146 void __iomem *pcie_index_hi_offset;
1147 void __iomem *pcie_data_offset;
1148
1149 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1150 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1151 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1152 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1153 else
1154 pcie_index_hi = 0;
1155
1156 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1157 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1158 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1159 if (pcie_index_hi != 0)
1160 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1161 pcie_index_hi * 4;
1162
1163 writel(reg_addr, pcie_index_offset);
1164 readl(pcie_index_offset);
1165 if (pcie_index_hi != 0) {
1166 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1167 readl(pcie_index_hi_offset);
1168 }
1169 writel(reg_data, pcie_data_offset);
1170 readl(pcie_data_offset);
1171
1172 /* clear the high bits */
1173 if (pcie_index_hi != 0) {
1174 writel(0, pcie_index_hi_offset);
1175 readl(pcie_index_hi_offset);
1176 }
1177
1178 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1179 }
1180
1181 /**
1182 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
1183 *
1184 * @adev: amdgpu_device pointer
1185 * @reg_addr: indirect register offset
1186 * @reg_data: indirect register data
1187 *
1188 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1190 u32 reg_addr, u64 reg_data)
1191 {
1192 unsigned long flags, pcie_index, pcie_data;
1193 void __iomem *pcie_index_offset;
1194 void __iomem *pcie_data_offset;
1195
1196 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1197 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1198
1199 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1200 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1201 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1202
1203 /* write low 32 bits */
1204 writel(reg_addr, pcie_index_offset);
1205 readl(pcie_index_offset);
1206 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1207 readl(pcie_data_offset);
1208 /* write high 32 bits */
1209 writel(reg_addr + 4, pcie_index_offset);
1210 readl(pcie_index_offset);
1211 writel((u32)(reg_data >> 32), pcie_data_offset);
1212 readl(pcie_data_offset);
1213 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1214 }
1215
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1217 u64 reg_addr, u64 reg_data)
1218 {
1219 unsigned long flags, pcie_index, pcie_data;
1220 unsigned long pcie_index_hi = 0;
1221 void __iomem *pcie_index_offset;
1222 void __iomem *pcie_index_hi_offset;
1223 void __iomem *pcie_data_offset;
1224
1225 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1226 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1227 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1228 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1229
1230 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1231 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1232 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1233 if (pcie_index_hi != 0)
1234 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1235 pcie_index_hi * 4;
1236
1237 /* write low 32 bits */
1238 writel(reg_addr, pcie_index_offset);
1239 readl(pcie_index_offset);
1240 if (pcie_index_hi != 0) {
1241 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1242 readl(pcie_index_hi_offset);
1243 }
1244 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1245 readl(pcie_data_offset);
1246 /* write high 32 bits */
1247 writel(reg_addr + 4, pcie_index_offset);
1248 readl(pcie_index_offset);
1249 if (pcie_index_hi != 0) {
1250 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1251 readl(pcie_index_hi_offset);
1252 }
1253 writel((u32)(reg_data >> 32), pcie_data_offset);
1254 readl(pcie_data_offset);
1255
1256 /* clear the high bits */
1257 if (pcie_index_hi != 0) {
1258 writel(0, pcie_index_hi_offset);
1259 readl(pcie_index_hi_offset);
1260 }
1261
1262 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1263 }
1264
1265 /**
1266 * amdgpu_device_get_rev_id - query device rev_id
1267 *
1268 * @adev: amdgpu_device pointer
1269 *
1270 * Return device rev_id
1271 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1273 {
1274 return adev->nbio.funcs->get_rev_id(adev);
1275 }
1276
1277 /**
1278 * amdgpu_invalid_rreg - dummy reg read function
1279 *
1280 * @adev: amdgpu_device pointer
1281 * @reg: offset of register
1282 *
1283 * Dummy register read function. Used for register blocks
1284 * that certain asics don't have (all asics).
1285 * Returns the value in the register.
1286 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1288 {
1289 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1290 BUG();
1291 return 0;
1292 }
1293
static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1295 {
1296 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1297 BUG();
1298 return 0;
1299 }
1300
1301 /**
1302 * amdgpu_invalid_wreg - dummy reg write function
1303 *
1304 * @adev: amdgpu_device pointer
1305 * @reg: offset of register
1306 * @v: value to write to the register
1307 *
 * Dummy register write function. Used for register blocks
1309 * that certain asics don't have (all asics).
1310 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1312 {
1313 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1314 reg, v);
1315 BUG();
1316 }
1317
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1319 {
1320 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1321 reg, v);
1322 BUG();
1323 }
1324
1325 /**
1326 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1327 *
1328 * @adev: amdgpu_device pointer
1329 * @reg: offset of register
1330 *
1331 * Dummy register read function. Used for register blocks
1332 * that certain asics don't have (all asics).
1333 * Returns the value in the register.
1334 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1336 {
1337 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1338 BUG();
1339 return 0;
1340 }
1341
static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1343 {
1344 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1345 BUG();
1346 return 0;
1347 }
1348
1349 /**
1350 * amdgpu_invalid_wreg64 - dummy reg write function
1351 *
1352 * @adev: amdgpu_device pointer
1353 * @reg: offset of register
1354 * @v: value to write to the register
1355 *
 * Dummy register write function. Used for register blocks
1357 * that certain asics don't have (all asics).
1358 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1360 {
1361 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1362 reg, v);
1363 BUG();
1364 }
1365
static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1367 {
1368 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1369 reg, v);
1370 BUG();
1371 }
1372
1373 /**
1374 * amdgpu_block_invalid_rreg - dummy reg read function
1375 *
1376 * @adev: amdgpu_device pointer
1377 * @block: offset of instance
1378 * @reg: offset of register
1379 *
1380 * Dummy register read function. Used for register blocks
1381 * that certain asics don't have (all asics).
1382 * Returns the value in the register.
1383 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1385 uint32_t block, uint32_t reg)
1386 {
1387 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1388 reg, block);
1389 BUG();
1390 return 0;
1391 }
1392
1393 /**
1394 * amdgpu_block_invalid_wreg - dummy reg write function
1395 *
1396 * @adev: amdgpu_device pointer
1397 * @block: offset of instance
1398 * @reg: offset of register
1399 * @v: value to write to the register
1400 *
 * Dummy register write function. Used for register blocks
1402 * that certain asics don't have (all asics).
1403 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1405 uint32_t block,
1406 uint32_t reg, uint32_t v)
1407 {
1408 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1409 reg, block, v);
1410 BUG();
1411 }
1412
static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
1414 {
1415 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1416 return AMDGPU_VBIOS_SKIP;
1417
1418 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
1419 return AMDGPU_VBIOS_OPTIONAL;
1420
1421 return 0;
1422 }
1423
1424 /**
1425 * amdgpu_device_asic_init - Wrapper for atom asic_init
1426 *
1427 * @adev: amdgpu_device pointer
1428 *
1429 * Does any asic specific work and then calls atom asic init.
1430 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1432 {
1433 uint32_t flags;
1434 bool optional;
1435 int ret;
1436
1437 amdgpu_asic_pre_asic_init(adev);
1438 flags = amdgpu_device_get_vbios_flags(adev);
1439 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));
1440
1441 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1442 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1443 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1444 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1445 amdgpu_psp_wait_for_bootloader(adev);
1446 if (optional && !adev->bios)
1447 return 0;
1448
1449 ret = amdgpu_atomfirmware_asic_init(adev, true);
1450 return ret;
1451 } else {
1452 if (optional && !adev->bios)
1453 return 0;
1454
1455 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1456 }
1457
1458 return 0;
1459 }
1460
1461 /**
1462 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1463 *
1464 * @adev: amdgpu_device pointer
1465 *
1466 * Allocates a scratch page of VRAM for use by various things in the
1467 * driver.
1468 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1470 {
1471 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1472 AMDGPU_GEM_DOMAIN_VRAM |
1473 AMDGPU_GEM_DOMAIN_GTT,
1474 &adev->mem_scratch.robj,
1475 &adev->mem_scratch.gpu_addr,
1476 (void **)&adev->mem_scratch.ptr);
1477 }
1478
1479 /**
1480 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1481 *
1482 * @adev: amdgpu_device pointer
1483 *
1484 * Frees the VRAM scratch page.
1485 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1487 {
1488 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1489 }
1490
1491 /**
1492 * amdgpu_device_program_register_sequence - program an array of registers.
1493 *
1494 * @adev: amdgpu_device pointer
1495 * @registers: pointer to the register array
1496 * @array_size: size of the register array
1497 *
 * Programs an array of registers with AND and OR masks.
1499 * This is a helper for setting golden registers.
1500 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1502 const u32 *registers,
1503 const u32 array_size)
1504 {
1505 u32 tmp, reg, and_mask, or_mask;
1506 int i;
1507
1508 if (array_size % 3)
1509 return;
1510
1511 for (i = 0; i < array_size; i += 3) {
1512 reg = registers[i + 0];
1513 and_mask = registers[i + 1];
1514 or_mask = registers[i + 2];
1515
1516 if (and_mask == 0xffffffff) {
1517 tmp = or_mask;
1518 } else {
1519 tmp = RREG32(reg);
1520 tmp &= ~and_mask;
1521 if (adev->family >= AMDGPU_FAMILY_AI)
1522 tmp |= (or_mask & and_mask);
1523 else
1524 tmp |= or_mask;
1525 }
1526 WREG32(reg, tmp);
1527 }
1528 }
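
/*
 * Usage sketch (illustrative only; mmEXAMPLE_REG and the masks are
 * placeholders, not real golden settings).  The array is a sequence of
 * {register, AND mask, OR mask} triplets:
 *
 *   static const u32 example_golden_settings[] = {
 *           mmEXAMPLE_REG, 0x0000000f, 0x00000001,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */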
1529
1530 /**
1531 * amdgpu_device_pci_config_reset - reset the GPU
1532 *
1533 * @adev: amdgpu_device pointer
1534 *
1535 * Resets the GPU using the pci config reset sequence.
1536 * Only applicable to asics prior to vega10.
1537 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1539 {
1540 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1541 }
1542
1543 /**
1544 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1545 *
1546 * @adev: amdgpu_device pointer
1547 *
1548 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1549 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1551 {
1552 return pci_reset_function(adev->pdev);
1553 }
1554
1555 /*
1556 * amdgpu_device_wb_*()
1557 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
1559 */
1560
1561 /**
1562 * amdgpu_device_wb_fini - Disable Writeback and free memory
1563 *
1564 * @adev: amdgpu_device pointer
1565 *
1566 * Disables Writeback and frees the Writeback memory (all asics).
1567 * Used at driver shutdown.
1568 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1570 {
1571 if (adev->wb.wb_obj) {
1572 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1573 &adev->wb.gpu_addr,
1574 (void **)&adev->wb.wb);
1575 adev->wb.wb_obj = NULL;
1576 }
1577 }
1578
1579 /**
1580 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1581 *
1582 * @adev: amdgpu_device pointer
1583 *
1584 * Initializes writeback and allocates writeback memory (all asics).
1585 * Used at driver startup.
1586 * Returns 0 on success or an -error on failure.
1587 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1589 {
1590 int r;
1591
1592 if (adev->wb.wb_obj == NULL) {
1593 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1594 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1595 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1596 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1597 (void **)&adev->wb.wb);
1598 if (r) {
1599 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1600 return r;
1601 }
1602
1603 adev->wb.num_wb = AMDGPU_MAX_WB;
1604 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1605
1606 /* clear wb memory */
1607 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1608 }
1609
1610 return 0;
1611 }
1612
1613 /**
1614 * amdgpu_device_wb_get - Allocate a wb entry
1615 *
1616 * @adev: amdgpu_device pointer
1617 * @wb: wb index
1618 *
1619 * Allocate a wb slot for use by the driver (all asics).
1620 * Returns 0 on success or -EINVAL on failure.
1621 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1623 {
1624 unsigned long flags, offset;
1625
1626 spin_lock_irqsave(&adev->wb.lock, flags);
1627 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1628 if (offset < adev->wb.num_wb) {
1629 __set_bit(offset, adev->wb.used);
1630 spin_unlock_irqrestore(&adev->wb.lock, flags);
1631 *wb = offset << 3; /* convert to dw offset */
1632 return 0;
1633 } else {
1634 spin_unlock_irqrestore(&adev->wb.lock, flags);
1635 return -EINVAL;
1636 }
1637 }
1638
1639 /**
1640 * amdgpu_device_wb_free - Free a wb entry
1641 *
1642 * @adev: amdgpu_device pointer
1643 * @wb: wb index
1644 *
1645 * Free a wb slot allocated for use by the driver (all asics)
1646 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1648 {
1649 unsigned long flags;
1650
1651 wb >>= 3;
1652 spin_lock_irqsave(&adev->wb.lock, flags);
1653 if (wb < adev->wb.num_wb)
1654 __clear_bit(wb, adev->wb.used);
1655 spin_unlock_irqrestore(&adev->wb.lock, flags);
1656 }
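
/*
 * Usage sketch (illustrative only, not taken from a real caller): allocate a
 * writeback slot, read the value the GPU wrote there, then release it.  The
 * returned index addresses adev->wb.wb[] on the CPU side and
 * adev->wb.gpu_addr + index * 4 on the GPU side.
 *
 *   u32 index;
 *
 *   if (!amdgpu_device_wb_get(adev, &index)) {
 *           u32 val = le32_to_cpu(adev->wb.wb[index]);
 *
 *           amdgpu_device_wb_free(adev, index);
 *   }
 */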
1657
1658 /**
1659 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1660 *
1661 * @adev: amdgpu_device pointer
1662 *
1663 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
1665 * driver loading by returning -ENODEV.
1666 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1668 {
1669 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1670 struct pci_bus *root;
1671 struct resource *res;
1672 unsigned int i;
1673 u16 cmd;
1674 int r;
1675
1676 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1677 return 0;
1678
1679 /* Bypass for VF */
1680 if (amdgpu_sriov_vf(adev))
1681 return 0;
1682
1683 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1684 if ((amdgpu_runtime_pm != 0) &&
1685 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1686 adev->pdev->device == 0x731f &&
1687 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1688 return 0;
1689
1690 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1691 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1692 DRM_WARN("System can't access extended configuration space, please check!!\n");
1693
1694 /* skip if the bios has already enabled large BAR */
1695 if (adev->gmc.real_vram_size &&
1696 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1697 return 0;
1698
1699 /* Check if the root BUS has 64bit memory resources */
1700 root = adev->pdev->bus;
1701 while (root->parent)
1702 root = root->parent;
1703
1704 pci_bus_for_each_resource(root, res, i) {
1705 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1706 res->start > 0x100000000ull)
1707 break;
1708 }
1709
1710 /* Trying to resize is pointless without a root hub window above 4GB */
1711 if (!res)
1712 return 0;
1713
1714 /* Limit the BAR size to what is available */
1715 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1716 rbar_size);
1717
1718 /* Disable memory decoding while we change the BAR addresses and size */
1719 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1720 pci_write_config_word(adev->pdev, PCI_COMMAND,
1721 cmd & ~PCI_COMMAND_MEMORY);
1722
1723 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1724 amdgpu_doorbell_fini(adev);
1725 if (adev->asic_type >= CHIP_BONAIRE)
1726 pci_release_resource(adev->pdev, 2);
1727
1728 pci_release_resource(adev->pdev, 0);
1729
1730 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1731 if (r == -ENOSPC)
1732 DRM_INFO("Not enough PCI address space for a large BAR.");
1733 else if (r && r != -ENOTSUPP)
1734 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1735
1736 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1737
1738 /* When the doorbell or fb BAR isn't available we have no chance of
1739 * using the device.
1740 */
1741 r = amdgpu_doorbell_init(adev);
1742 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1743 return -ENODEV;
1744
1745 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1746
1747 return 0;
1748 }
1749
1750 /*
1751 * GPU helpers function.
1752 */
1753 /**
1754 * amdgpu_device_need_post - check if the hw needs to be posted or not
1755 *
1756 * @adev: amdgpu_device pointer
1757 *
1758 * Check if the asic has been initialized (all asics) at driver startup,
1759 * or whether a post is needed because a hw reset was performed.
1760 * Returns true if posting is needed, false if not.
1761 */
1762 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1763 {
1764 uint32_t reg, flags;
1765
1766 if (amdgpu_sriov_vf(adev))
1767 return false;
1768
1769 flags = amdgpu_device_get_vbios_flags(adev);
1770 if (flags & AMDGPU_VBIOS_SKIP)
1771 return false;
1772 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
1773 return false;
1774
1775 if (amdgpu_passthrough(adev)) {
1776 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1777 * reboot some old SMC firmware still requires the driver to do a vPost,
1778 * otherwise the GPU hangs. SMC firmware 22.15 and newer does not have this
1779 * flaw, so force a vPost for SMC versions below 22.15.
1780 */
1781 if (adev->asic_type == CHIP_FIJI) {
1782 int err;
1783 uint32_t fw_ver;
1784
1785 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1786 /* force vPost if error occurred */
1787 if (err)
1788 return true;
1789
1790 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1791 release_firmware(adev->pm.fw);
1792 if (fw_ver < 0x00160e00)
1793 return true;
1794 }
1795 }
1796
1797 /* Don't post if we need to reset whole hive on init */
1798 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1799 return false;
1800
1801 if (adev->has_hw_reset) {
1802 adev->has_hw_reset = false;
1803 return true;
1804 }
1805
1806 /* bios scratch used on CIK+ */
1807 if (adev->asic_type >= CHIP_BONAIRE)
1808 return amdgpu_atombios_scratch_need_asic_init(adev);
1809
1810 /* check MEM_SIZE for older asics */
1811 reg = amdgpu_asic_get_config_memsize(adev);
1812
1813 if ((reg != 0) && (reg != 0xffffffff))
1814 return false;
1815
1816 return true;
1817 }
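/*
 * Illustrative sketch, not part of the driver: during device init the result
 * of amdgpu_device_need_post() typically gates posting the card through the
 * vbios. The posting helper used here is an assumption for the example.
 *
 *	if (amdgpu_device_need_post(adev)) {
 *		r = amdgpu_device_asic_init(adev);
 *		if (r)
 *			dev_err(adev->dev, "gpu post error!\n");
 *	}
 */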
1818
1819 /*
1820 * Check whether seamless boot is supported.
1821 *
1822 * So far we only support seamless boot on DCE 3.0 or later.
1823 * If users report that it works on older ASICs as well, we may
1824 * loosen this.
1825 */
1826 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1827 {
1828 switch (amdgpu_seamless) {
1829 case -1:
1830 break;
1831 case 1:
1832 return true;
1833 case 0:
1834 return false;
1835 default:
1836 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1837 amdgpu_seamless);
1838 return false;
1839 }
1840
1841 if (!(adev->flags & AMD_IS_APU))
1842 return false;
1843
1844 if (adev->mman.keep_stolen_vga_memory)
1845 return false;
1846
1847 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1848 }
1849
1850 /*
1851 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1852 * don't support dynamic speed switching. Until we have confirmation from Intel
1853 * that a specific host supports it, it's safer that we keep it disabled for all.
1854 *
1855 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1856 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1857 */
1858 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1859 {
1860 #if IS_ENABLED(CONFIG_X86)
1861 struct cpuinfo_x86 *c = &cpu_data(0);
1862
1863 /* eGPU change speeds based on USB4 fabric conditions */
1864 if (dev_is_removable(adev->dev))
1865 return true;
1866
1867 if (c->x86_vendor == X86_VENDOR_INTEL)
1868 return false;
1869 #endif
1870 return true;
1871 }
1872
1873 /**
1874 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1875 *
1876 * @adev: amdgpu_device pointer
1877 *
1878 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1879 * be set for this device.
1880 *
1881 * Returns true if it should be used or false if not.
1882 */
1883 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1884 {
1885 switch (amdgpu_aspm) {
1886 case -1:
1887 break;
1888 case 0:
1889 return false;
1890 case 1:
1891 return true;
1892 default:
1893 return false;
1894 }
1895 if (adev->flags & AMD_IS_APU)
1896 return false;
1897 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1898 return false;
1899 return pcie_aspm_enabled(adev->pdev);
1900 }
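/*
 * Illustrative sketch, not part of the driver: SoC-level code usually
 * consults this helper before touching the ASPM registers, roughly like
 * the hypothetical wrapper below.
 *
 *	static void example_program_aspm(struct amdgpu_device *adev)
 *	{
 *		if (!amdgpu_device_should_use_aspm(adev))
 *			return;
 *
 *		if (adev->nbio.funcs->program_aspm)
 *			adev->nbio.funcs->program_aspm(adev);
 *	}
 */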
1901
1902 /* if we get transitioned to only one device, take VGA back */
1903 /**
1904 * amdgpu_device_vga_set_decode - enable/disable vga decode
1905 *
1906 * @pdev: PCI device pointer
1907 * @state: enable/disable vga decode
1908 *
1909 * Enable/disable vga decode (all asics).
1910 * Returns VGA resource flags.
1911 */
1912 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1913 bool state)
1914 {
1915 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1916
1917 amdgpu_asic_set_vga_state(adev, state);
1918 if (state)
1919 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1920 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1921 else
1922 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1923 }
1924
1925 /**
1926 * amdgpu_device_check_block_size - validate the vm block size
1927 *
1928 * @adev: amdgpu_device pointer
1929 *
1930 * Validates the vm block size specified via module parameter.
1931 * The vm block size defines number of bits in page table versus page directory,
1932 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1933 * page table and the remaining bits are in the page directory.
1934 */
1935 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1936 {
1937 /* defines number of bits in page table versus page directory,
1938 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1939 * page table and the remaining bits are in the page directory
1940 */
1941 if (amdgpu_vm_block_size == -1)
1942 return;
1943
1944 if (amdgpu_vm_block_size < 9) {
1945 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1946 amdgpu_vm_block_size);
1947 amdgpu_vm_block_size = -1;
1948 }
1949 }
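/*
 * Worked example (illustration only): with 4KB pages there is a 12-bit
 * in-page offset. A block size of 9 means one page table covers
 * 2^9 pages = 2^21 bytes = 2MB of address space, and the remaining
 * address bits select the page directory entry.
 */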
1950
1951 /**
1952 * amdgpu_device_check_vm_size - validate the vm size
1953 *
1954 * @adev: amdgpu_device pointer
1955 *
1956 * Validates the vm size in GB specified via module parameter.
1957 * The VM size is the size of the GPU virtual memory space in GB.
1958 */
1959 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1960 {
1961 /* no need to check the default value */
1962 if (amdgpu_vm_size == -1)
1963 return;
1964
1965 if (amdgpu_vm_size < 1) {
1966 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1967 amdgpu_vm_size);
1968 amdgpu_vm_size = -1;
1969 }
1970 }
1971
1972 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1973 {
1974 struct sysinfo si;
1975 bool is_os_64 = (sizeof(void *) == 8);
1976 uint64_t total_memory;
1977 uint64_t dram_size_seven_GB = 0x1B8000000;
1978 uint64_t dram_size_three_GB = 0xB8000000;
1979
1980 if (amdgpu_smu_memory_pool_size == 0)
1981 return;
1982
1983 if (!is_os_64) {
1984 DRM_WARN("Not 64-bit OS, feature not supported\n");
1985 goto def_value;
1986 }
1987 si_meminfo(&si);
1988 total_memory = (uint64_t)si.totalram * si.mem_unit;
1989
1990 if ((amdgpu_smu_memory_pool_size == 1) ||
1991 (amdgpu_smu_memory_pool_size == 2)) {
1992 if (total_memory < dram_size_three_GB)
1993 goto def_value1;
1994 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1995 (amdgpu_smu_memory_pool_size == 8)) {
1996 if (total_memory < dram_size_seven_GB)
1997 goto def_value1;
1998 } else {
1999 DRM_WARN("Smu memory pool size not supported\n");
2000 goto def_value;
2001 }
2002 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
2003
2004 return;
2005
2006 def_value1:
2007 DRM_WARN("No enough system memory\n");
2008 def_value:
2009 adev->pm.smu_prv_buffer_size = 0;
2010 }
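/*
 * Worked example (illustration only): the pool size parameter is given in
 * units of 256MB (value << 28 bytes), so amdgpu_smu_memory_pool_size values
 * of 1/2/4/8 request 256MB/512MB/1GB/2GB respectively, subject to the
 * system-memory checks above.
 */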
2011
2012 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
2013 {
2014 if (!(adev->flags & AMD_IS_APU) ||
2015 adev->asic_type < CHIP_RAVEN)
2016 return 0;
2017
2018 switch (adev->asic_type) {
2019 case CHIP_RAVEN:
2020 if (adev->pdev->device == 0x15dd)
2021 adev->apu_flags |= AMD_APU_IS_RAVEN;
2022 if (adev->pdev->device == 0x15d8)
2023 adev->apu_flags |= AMD_APU_IS_PICASSO;
2024 break;
2025 case CHIP_RENOIR:
2026 if ((adev->pdev->device == 0x1636) ||
2027 (adev->pdev->device == 0x164c))
2028 adev->apu_flags |= AMD_APU_IS_RENOIR;
2029 else
2030 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
2031 break;
2032 case CHIP_VANGOGH:
2033 adev->apu_flags |= AMD_APU_IS_VANGOGH;
2034 break;
2035 case CHIP_YELLOW_CARP:
2036 break;
2037 case CHIP_CYAN_SKILLFISH:
2038 if ((adev->pdev->device == 0x13FE) ||
2039 (adev->pdev->device == 0x143F))
2040 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2041 break;
2042 default:
2043 break;
2044 }
2045
2046 return 0;
2047 }
2048
2049 /**
2050 * amdgpu_device_check_arguments - validate module params
2051 *
2052 * @adev: amdgpu_device pointer
2053 *
2054 * Validates certain module parameters and updates
2055 * the associated values used by the driver (all asics).
2056 */
2057 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2058 {
2059 int i;
2060
2061 if (amdgpu_sched_jobs < 4) {
2062 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2063 amdgpu_sched_jobs);
2064 amdgpu_sched_jobs = 4;
2065 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2066 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2067 amdgpu_sched_jobs);
2068 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2069 }
2070
2071 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2072 /* gart size must be greater or equal to 32M */
2073 dev_warn(adev->dev, "gart size (%d) too small\n",
2074 amdgpu_gart_size);
2075 amdgpu_gart_size = -1;
2076 }
2077
2078 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2079 /* gtt size must be greater or equal to 32M */
2080 dev_warn(adev->dev, "gtt size (%d) too small\n",
2081 amdgpu_gtt_size);
2082 amdgpu_gtt_size = -1;
2083 }
2084
2085 /* valid range is between 4 and 9 inclusive */
2086 if (amdgpu_vm_fragment_size != -1 &&
2087 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2088 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2089 amdgpu_vm_fragment_size = -1;
2090 }
2091
2092 if (amdgpu_sched_hw_submission < 2) {
2093 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2094 amdgpu_sched_hw_submission);
2095 amdgpu_sched_hw_submission = 2;
2096 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2097 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2098 amdgpu_sched_hw_submission);
2099 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2100 }
2101
2102 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2103 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2104 amdgpu_reset_method = -1;
2105 }
2106
2107 amdgpu_device_check_smu_prv_buffer_size(adev);
2108
2109 amdgpu_device_check_vm_size(adev);
2110
2111 amdgpu_device_check_block_size(adev);
2112
2113 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2114
2115 for (i = 0; i < MAX_XCP; i++)
2116 adev->enforce_isolation[i] = !!enforce_isolation;
2117
2118 return 0;
2119 }
2120
2121 /**
2122 * amdgpu_switcheroo_set_state - set switcheroo state
2123 *
2124 * @pdev: pci dev pointer
2125 * @state: vga_switcheroo state
2126 *
2127 * Callback for the switcheroo driver. Suspends or resumes
2128 * the asics before or after it is powered up using ACPI methods.
2129 */
2130 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2131 enum vga_switcheroo_state state)
2132 {
2133 struct drm_device *dev = pci_get_drvdata(pdev);
2134 int r;
2135
2136 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2137 return;
2138
2139 if (state == VGA_SWITCHEROO_ON) {
2140 pr_info("switched on\n");
2141 /* don't suspend or resume card normally */
2142 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2143
2144 pci_set_power_state(pdev, PCI_D0);
2145 amdgpu_device_load_pci_state(pdev);
2146 r = pci_enable_device(pdev);
2147 if (r)
2148 DRM_WARN("pci_enable_device failed (%d)\n", r);
2149 amdgpu_device_resume(dev, true);
2150
2151 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2152 } else {
2153 pr_info("switched off\n");
2154 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2155 amdgpu_device_prepare(dev);
2156 amdgpu_device_suspend(dev, true);
2157 amdgpu_device_cache_pci_state(pdev);
2158 /* Shut down the device */
2159 pci_disable_device(pdev);
2160 pci_set_power_state(pdev, PCI_D3cold);
2161 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2162 }
2163 }
2164
2165 /**
2166 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2167 *
2168 * @pdev: pci dev pointer
2169 *
2170 * Callback for the switcheroo driver. Checks whether the switcheroo
2171 * state can be changed.
2172 * Returns true if the state can be changed, false if not.
2173 */
2174 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2175 {
2176 struct drm_device *dev = pci_get_drvdata(pdev);
2177
2178 /*
2179 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2180 * locking inversion with the driver load path. And the access here is
2181 * completely racy anyway. So don't bother with locking for now.
2182 */
2183 return atomic_read(&dev->open_count) == 0;
2184 }
2185
2186 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2187 .set_gpu_state = amdgpu_switcheroo_set_state,
2188 .reprobe = NULL,
2189 .can_switch = amdgpu_switcheroo_can_switch,
2190 };
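/*
 * Illustrative sketch, not part of the driver: the ops table above is handed
 * to vga_switcheroo during device init, roughly along these lines (the px
 * variable and the surrounding conditions are assumptions for the example).
 *
 *	bool px = amdgpu_device_supports_px(adev_to_drm(adev));
 *
 *	if (px)
 *		vga_switcheroo_register_client(adev->pdev,
 *					       &amdgpu_switcheroo_ops, px);
 */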
2191
2192 /**
2193 * amdgpu_device_ip_set_clockgating_state - set the CG state
2194 *
2195 * @dev: amdgpu_device pointer
2196 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2197 * @state: clockgating state (gate or ungate)
2198 *
2199 * Sets the requested clockgating state for all instances of
2200 * the hardware IP specified.
2201 * Returns the error code from the last instance.
2202 */
2203 int amdgpu_device_ip_set_clockgating_state(void *dev,
2204 enum amd_ip_block_type block_type,
2205 enum amd_clockgating_state state)
2206 {
2207 struct amdgpu_device *adev = dev;
2208 int i, r = 0;
2209
2210 for (i = 0; i < adev->num_ip_blocks; i++) {
2211 if (!adev->ip_blocks[i].status.valid)
2212 continue;
2213 if (adev->ip_blocks[i].version->type != block_type)
2214 continue;
2215 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2216 continue;
2217 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2218 &adev->ip_blocks[i], state);
2219 if (r)
2220 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2221 adev->ip_blocks[i].version->funcs->name, r);
2222 }
2223 return r;
2224 }
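/*
 * Usage note (illustration only): IP code uses this entry point to gate or
 * ungate a whole block type, e.g.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GMC,
 *					       AMD_CG_STATE_GATE);
 */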
2225
2226 /**
2227 * amdgpu_device_ip_set_powergating_state - set the PG state
2228 *
2229 * @dev: amdgpu_device pointer
2230 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2231 * @state: powergating state (gate or ungate)
2232 *
2233 * Sets the requested powergating state for all instances of
2234 * the hardware IP specified.
2235 * Returns the error code from the last instance.
2236 */
2237 int amdgpu_device_ip_set_powergating_state(void *dev,
2238 enum amd_ip_block_type block_type,
2239 enum amd_powergating_state state)
2240 {
2241 struct amdgpu_device *adev = dev;
2242 int i, r = 0;
2243
2244 for (i = 0; i < adev->num_ip_blocks; i++) {
2245 if (!adev->ip_blocks[i].status.valid)
2246 continue;
2247 if (adev->ip_blocks[i].version->type != block_type)
2248 continue;
2249 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2250 continue;
2251 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2252 &adev->ip_blocks[i], state);
2253 if (r)
2254 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2255 adev->ip_blocks[i].version->funcs->name, r);
2256 }
2257 return r;
2258 }
2259
2260 /**
2261 * amdgpu_device_ip_get_clockgating_state - get the CG state
2262 *
2263 * @adev: amdgpu_device pointer
2264 * @flags: clockgating feature flags
2265 *
2266 * Walks the list of IPs on the device and updates the clockgating
2267 * flags for each IP.
2268 * Updates @flags with the feature flags for each hardware IP where
2269 * clockgating is enabled.
2270 */
2271 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2272 u64 *flags)
2273 {
2274 int i;
2275
2276 for (i = 0; i < adev->num_ip_blocks; i++) {
2277 if (!adev->ip_blocks[i].status.valid)
2278 continue;
2279 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2280 adev->ip_blocks[i].version->funcs->get_clockgating_state(
2281 &adev->ip_blocks[i], flags);
2282 }
2283 }
2284
2285 /**
2286 * amdgpu_device_ip_wait_for_idle - wait for idle
2287 *
2288 * @adev: amdgpu_device pointer
2289 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2290 *
2291 * Waits for the requested hardware IP to be idle.
2292 * Returns 0 for success or a negative error code on failure.
2293 */
2294 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2295 enum amd_ip_block_type block_type)
2296 {
2297 int i, r;
2298
2299 for (i = 0; i < adev->num_ip_blocks; i++) {
2300 if (!adev->ip_blocks[i].status.valid)
2301 continue;
2302 if (adev->ip_blocks[i].version->type == block_type) {
2303 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2304 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2305 &adev->ip_blocks[i]);
2306 if (r)
2307 return r;
2308 }
2309 break;
2310 }
2311 }
2312 return 0;
2313
2314 }
2315
2316 /**
2317 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2318 *
2319 * @adev: amdgpu_device pointer
2320 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2321 *
2322 * Check if the hardware IP is enabled or not.
2323 * Returns true if the IP is enabled, false if not.
2324 */
2325 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2326 enum amd_ip_block_type block_type)
2327 {
2328 int i;
2329
2330 for (i = 0; i < adev->num_ip_blocks; i++) {
2331 if (adev->ip_blocks[i].version->type == block_type)
2332 return adev->ip_blocks[i].status.valid;
2333 }
2334 return false;
2335
2336 }
2337
2338 /**
2339 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2340 *
2341 * @adev: amdgpu_device pointer
2342 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2343 *
2344 * Returns a pointer to the hardware IP block structure
2345 * if it exists for the asic, otherwise NULL.
2346 */
2347 struct amdgpu_ip_block *
2348 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2349 enum amd_ip_block_type type)
2350 {
2351 int i;
2352
2353 for (i = 0; i < adev->num_ip_blocks; i++)
2354 if (adev->ip_blocks[i].version->type == type)
2355 return &adev->ip_blocks[i];
2356
2357 return NULL;
2358 }
2359
2360 /**
2361 * amdgpu_device_ip_block_version_cmp
2362 *
2363 * @adev: amdgpu_device pointer
2364 * @type: enum amd_ip_block_type
2365 * @major: major version
2366 * @minor: minor version
2367 *
2368 * return 0 if equal or greater
2369 * return 1 if smaller or the ip_block doesn't exist
2370 */
2371 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2372 enum amd_ip_block_type type,
2373 u32 major, u32 minor)
2374 {
2375 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2376
2377 if (ip_block && ((ip_block->version->major > major) ||
2378 ((ip_block->version->major == major) &&
2379 (ip_block->version->minor >= minor))))
2380 return 0;
2381
2382 return 1;
2383 }
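/*
 * Usage note (illustration only): a return value of 0 means "at least this
 * version", so a caller checking for SMC 7.0 or newer would do:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 0) == 0) {
 *		... SMC 7.0+ specific path ...
 *	}
 */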
2384
2385 /**
2386 * amdgpu_device_ip_block_add
2387 *
2388 * @adev: amdgpu_device pointer
2389 * @ip_block_version: pointer to the IP to add
2390 *
2391 * Adds the IP block driver information to the collection of IPs
2392 * on the asic.
2393 */
2394 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2395 const struct amdgpu_ip_block_version *ip_block_version)
2396 {
2397 if (!ip_block_version)
2398 return -EINVAL;
2399
2400 switch (ip_block_version->type) {
2401 case AMD_IP_BLOCK_TYPE_VCN:
2402 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2403 return 0;
2404 break;
2405 case AMD_IP_BLOCK_TYPE_JPEG:
2406 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2407 return 0;
2408 break;
2409 default:
2410 break;
2411 }
2412
2413 dev_info(adev->dev, "detected ip block number %d <%s>\n",
2414 adev->num_ip_blocks, ip_block_version->funcs->name);
2415
2416 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2417
2418 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2419
2420 return 0;
2421 }
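/*
 * Illustrative sketch, not part of the driver: the per-ASIC set_ip_blocks()
 * helpers call this repeatedly in initialization order. The block names
 * below are placeholders for the example.
 *
 *	r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
 *	if (r)
 *		return r;
 *	r = amdgpu_device_ip_block_add(adev, &example_gmc_ip_block);
 *	if (r)
 *		return r;
 */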
2422
2423 /**
2424 * amdgpu_device_enable_virtual_display - enable virtual display feature
2425 *
2426 * @adev: amdgpu_device pointer
2427 *
2428 * Enables the virtual display feature if the user has enabled it via
2429 * the module parameter virtual_display. This feature provides a virtual
2430 * display hardware on headless boards or in virtualized environments.
2431 * This function parses and validates the configuration string specified by
2432 * the user and configures the virtual display configuration (number of
2433 * virtual connectors, crtcs, etc.) specified.
2434 */
2435 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2436 {
2437 adev->enable_virtual_display = false;
2438
2439 if (amdgpu_virtual_display) {
2440 const char *pci_address_name = pci_name(adev->pdev);
2441 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2442
2443 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2444 pciaddstr_tmp = pciaddstr;
2445 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2446 pciaddname = strsep(&pciaddname_tmp, ",");
2447 if (!strcmp("all", pciaddname)
2448 || !strcmp(pci_address_name, pciaddname)) {
2449 long num_crtc;
2450 int res = -1;
2451
2452 adev->enable_virtual_display = true;
2453
2454 if (pciaddname_tmp)
2455 res = kstrtol(pciaddname_tmp, 10,
2456 &num_crtc);
2457
2458 if (!res) {
2459 if (num_crtc < 1)
2460 num_crtc = 1;
2461 if (num_crtc > 6)
2462 num_crtc = 6;
2463 adev->mode_info.num_crtc = num_crtc;
2464 } else {
2465 adev->mode_info.num_crtc = 1;
2466 }
2467 break;
2468 }
2469 }
2470
2471 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2472 amdgpu_virtual_display, pci_address_name,
2473 adev->enable_virtual_display, adev->mode_info.num_crtc);
2474
2475 kfree(pciaddstr);
2476 }
2477 }
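/*
 * Example parameter strings (illustration only), matching the parsing above:
 * entries are separated by ';', each entry is "<pci address>[,<num_crtc>]",
 * and "all" matches every device.
 *
 *	amdgpu.virtual_display=0000:01:00.0,2
 *	amdgpu.virtual_display=all
 */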
2478
2479 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2480 {
2481 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2482 adev->mode_info.num_crtc = 1;
2483 adev->enable_virtual_display = true;
2484 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2485 adev->enable_virtual_display, adev->mode_info.num_crtc);
2486 }
2487 }
2488
2489 /**
2490 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2491 *
2492 * @adev: amdgpu_device pointer
2493 *
2494 * Parses the asic configuration parameters specified in the gpu info
2495 * firmware and makes them available to the driver for use in configuring
2496 * the asic.
2497 * Returns 0 on success, -EINVAL on failure.
2498 */
2499 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2500 {
2501 const char *chip_name;
2502 int err;
2503 const struct gpu_info_firmware_header_v1_0 *hdr;
2504
2505 adev->firmware.gpu_info_fw = NULL;
2506
2507 if (adev->mman.discovery_bin)
2508 return 0;
2509
2510 switch (adev->asic_type) {
2511 default:
2512 return 0;
2513 case CHIP_VEGA10:
2514 chip_name = "vega10";
2515 break;
2516 case CHIP_VEGA12:
2517 chip_name = "vega12";
2518 break;
2519 case CHIP_RAVEN:
2520 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2521 chip_name = "raven2";
2522 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2523 chip_name = "picasso";
2524 else
2525 chip_name = "raven";
2526 break;
2527 case CHIP_ARCTURUS:
2528 chip_name = "arcturus";
2529 break;
2530 case CHIP_NAVI12:
2531 chip_name = "navi12";
2532 break;
2533 }
2534
2535 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2536 AMDGPU_UCODE_OPTIONAL,
2537 "amdgpu/%s_gpu_info.bin", chip_name);
2538 if (err) {
2539 dev_err(adev->dev,
2540 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2541 chip_name);
2542 goto out;
2543 }
2544
2545 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2546 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2547
2548 switch (hdr->version_major) {
2549 case 1:
2550 {
2551 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2552 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2553 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2554
2555 /*
2556 * Should be dropped when DAL no longer needs it.
2557 */
2558 if (adev->asic_type == CHIP_NAVI12)
2559 goto parse_soc_bounding_box;
2560
2561 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2562 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2563 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2564 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2565 adev->gfx.config.max_texture_channel_caches =
2566 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2567 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2568 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2569 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2570 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2571 adev->gfx.config.double_offchip_lds_buf =
2572 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2573 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2574 adev->gfx.cu_info.max_waves_per_simd =
2575 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2576 adev->gfx.cu_info.max_scratch_slots_per_cu =
2577 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2578 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2579 if (hdr->version_minor >= 1) {
2580 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2581 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2582 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2583 adev->gfx.config.num_sc_per_sh =
2584 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2585 adev->gfx.config.num_packer_per_sc =
2586 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2587 }
2588
2589 parse_soc_bounding_box:
2590 /*
2591 * soc bounding box info is not integrated into the discovery table,
2592 * we always need to parse it from gpu info firmware if needed.
2593 */
2594 if (hdr->version_minor == 2) {
2595 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2596 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2597 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2598 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2599 }
2600 break;
2601 }
2602 default:
2603 dev_err(adev->dev,
2604 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2605 err = -EINVAL;
2606 goto out;
2607 }
2608 out:
2609 return err;
2610 }
2611
2612 /**
2613 * amdgpu_device_ip_early_init - run early init for hardware IPs
2614 *
2615 * @adev: amdgpu_device pointer
2616 *
2617 * Early initialization pass for hardware IPs. The hardware IPs that make
2618 * up each asic are discovered and each IP's early_init callback is run. This
2619 * is the first stage in initializing the asic.
2620 * Returns 0 on success, negative error code on failure.
2621 */
2622 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2623 {
2624 struct amdgpu_ip_block *ip_block;
2625 struct pci_dev *parent;
2626 bool total, skip_bios;
2627 uint32_t bios_flags;
2628 int i, r;
2629
2630 amdgpu_device_enable_virtual_display(adev);
2631
2632 if (amdgpu_sriov_vf(adev)) {
2633 r = amdgpu_virt_request_full_gpu(adev, true);
2634 if (r)
2635 return r;
2636 }
2637
2638 switch (adev->asic_type) {
2639 #ifdef CONFIG_DRM_AMDGPU_SI
2640 case CHIP_VERDE:
2641 case CHIP_TAHITI:
2642 case CHIP_PITCAIRN:
2643 case CHIP_OLAND:
2644 case CHIP_HAINAN:
2645 adev->family = AMDGPU_FAMILY_SI;
2646 r = si_set_ip_blocks(adev);
2647 if (r)
2648 return r;
2649 break;
2650 #endif
2651 #ifdef CONFIG_DRM_AMDGPU_CIK
2652 case CHIP_BONAIRE:
2653 case CHIP_HAWAII:
2654 case CHIP_KAVERI:
2655 case CHIP_KABINI:
2656 case CHIP_MULLINS:
2657 if (adev->flags & AMD_IS_APU)
2658 adev->family = AMDGPU_FAMILY_KV;
2659 else
2660 adev->family = AMDGPU_FAMILY_CI;
2661
2662 r = cik_set_ip_blocks(adev);
2663 if (r)
2664 return r;
2665 break;
2666 #endif
2667 case CHIP_TOPAZ:
2668 case CHIP_TONGA:
2669 case CHIP_FIJI:
2670 case CHIP_POLARIS10:
2671 case CHIP_POLARIS11:
2672 case CHIP_POLARIS12:
2673 case CHIP_VEGAM:
2674 case CHIP_CARRIZO:
2675 case CHIP_STONEY:
2676 if (adev->flags & AMD_IS_APU)
2677 adev->family = AMDGPU_FAMILY_CZ;
2678 else
2679 adev->family = AMDGPU_FAMILY_VI;
2680
2681 r = vi_set_ip_blocks(adev);
2682 if (r)
2683 return r;
2684 break;
2685 default:
2686 r = amdgpu_discovery_set_ip_blocks(adev);
2687 if (r)
2688 return r;
2689 break;
2690 }
2691
2692 if (amdgpu_has_atpx() &&
2693 (amdgpu_is_atpx_hybrid() ||
2694 amdgpu_has_atpx_dgpu_power_cntl()) &&
2695 ((adev->flags & AMD_IS_APU) == 0) &&
2696 !dev_is_removable(&adev->pdev->dev))
2697 adev->flags |= AMD_IS_PX;
2698
2699 if (!(adev->flags & AMD_IS_APU)) {
2700 parent = pcie_find_root_port(adev->pdev);
2701 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2702 }
2703
2704
2705 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2706 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2707 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2708 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2709 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2710 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2711 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2712
2713 total = true;
2714 for (i = 0; i < adev->num_ip_blocks; i++) {
2715 ip_block = &adev->ip_blocks[i];
2716
2717 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2718 DRM_WARN("disabled ip block: %d <%s>\n",
2719 i, adev->ip_blocks[i].version->funcs->name);
2720 adev->ip_blocks[i].status.valid = false;
2721 } else if (ip_block->version->funcs->early_init) {
2722 r = ip_block->version->funcs->early_init(ip_block);
2723 if (r == -ENOENT) {
2724 adev->ip_blocks[i].status.valid = false;
2725 } else if (r) {
2726 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2727 adev->ip_blocks[i].version->funcs->name, r);
2728 total = false;
2729 } else {
2730 adev->ip_blocks[i].status.valid = true;
2731 }
2732 } else {
2733 adev->ip_blocks[i].status.valid = true;
2734 }
2735 /* get the vbios after the asic_funcs are set up */
2736 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2737 r = amdgpu_device_parse_gpu_info_fw(adev);
2738 if (r)
2739 return r;
2740
2741 bios_flags = amdgpu_device_get_vbios_flags(adev);
2742 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP);
2743 /* Read BIOS */
2744 if (!skip_bios) {
2745 bool optional =
2746 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL);
2747 if (!amdgpu_get_bios(adev) && !optional)
2748 return -EINVAL;
2749
2750 if (optional && !adev->bios)
2751 dev_info(
2752 adev->dev,
2753 "VBIOS image optional, proceeding without VBIOS image");
2754
2755 if (adev->bios) {
2756 r = amdgpu_atombios_init(adev);
2757 if (r) {
2758 dev_err(adev->dev,
2759 "amdgpu_atombios_init failed\n");
2760 amdgpu_vf_error_put(
2761 adev,
2762 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL,
2763 0, 0);
2764 return r;
2765 }
2766 }
2767 }
2768
2769 		/* get pf2vf msg info at its earliest time */
2770 if (amdgpu_sriov_vf(adev))
2771 amdgpu_virt_init_data_exchange(adev);
2772
2773 }
2774 }
2775 if (!total)
2776 return -ENODEV;
2777
2778 if (adev->gmc.xgmi.supported)
2779 amdgpu_xgmi_early_init(adev);
2780
2781 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2782 if (ip_block->status.valid != false)
2783 amdgpu_amdkfd_device_probe(adev);
2784
2785 adev->cg_flags &= amdgpu_cg_mask;
2786 adev->pg_flags &= amdgpu_pg_mask;
2787
2788 return 0;
2789 }
2790
2791 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2792 {
2793 int i, r;
2794
2795 for (i = 0; i < adev->num_ip_blocks; i++) {
2796 if (!adev->ip_blocks[i].status.sw)
2797 continue;
2798 if (adev->ip_blocks[i].status.hw)
2799 continue;
2800 if (!amdgpu_ip_member_of_hwini(
2801 adev, adev->ip_blocks[i].version->type))
2802 continue;
2803 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2804 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2805 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2806 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2807 if (r) {
2808 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2810 return r;
2811 }
2812 adev->ip_blocks[i].status.hw = true;
2813 }
2814 }
2815
2816 return 0;
2817 }
2818
2819 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2820 {
2821 int i, r;
2822
2823 for (i = 0; i < adev->num_ip_blocks; i++) {
2824 if (!adev->ip_blocks[i].status.sw)
2825 continue;
2826 if (adev->ip_blocks[i].status.hw)
2827 continue;
2828 if (!amdgpu_ip_member_of_hwini(
2829 adev, adev->ip_blocks[i].version->type))
2830 continue;
2831 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2832 if (r) {
2833 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2834 adev->ip_blocks[i].version->funcs->name, r);
2835 return r;
2836 }
2837 adev->ip_blocks[i].status.hw = true;
2838 }
2839
2840 return 0;
2841 }
2842
2843 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2844 {
2845 int r = 0;
2846 int i;
2847 uint32_t smu_version;
2848
2849 if (adev->asic_type >= CHIP_VEGA10) {
2850 for (i = 0; i < adev->num_ip_blocks; i++) {
2851 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2852 continue;
2853
2854 if (!amdgpu_ip_member_of_hwini(adev,
2855 AMD_IP_BLOCK_TYPE_PSP))
2856 break;
2857
2858 if (!adev->ip_blocks[i].status.sw)
2859 continue;
2860
2861 /* no need to do the fw loading again if already done*/
2862 if (adev->ip_blocks[i].status.hw == true)
2863 break;
2864
2865 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2866 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2867 if (r)
2868 return r;
2869 } else {
2870 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2871 if (r) {
2872 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2873 adev->ip_blocks[i].version->funcs->name, r);
2874 return r;
2875 }
2876 adev->ip_blocks[i].status.hw = true;
2877 }
2878 break;
2879 }
2880 }
2881
2882 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2883 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2884
2885 return r;
2886 }
2887
2888 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2889 {
2890 struct drm_sched_init_args args = {
2891 .ops = &amdgpu_sched_ops,
2892 .num_rqs = DRM_SCHED_PRIORITY_COUNT,
2893 .timeout_wq = adev->reset_domain->wq,
2894 .dev = adev->dev,
2895 };
2896 long timeout;
2897 int r, i;
2898
2899 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2900 struct amdgpu_ring *ring = adev->rings[i];
2901
2902 		/* No need to set up the GPU scheduler for rings that don't need it */
2903 if (!ring || ring->no_scheduler)
2904 continue;
2905
2906 switch (ring->funcs->type) {
2907 case AMDGPU_RING_TYPE_GFX:
2908 timeout = adev->gfx_timeout;
2909 break;
2910 case AMDGPU_RING_TYPE_COMPUTE:
2911 timeout = adev->compute_timeout;
2912 break;
2913 case AMDGPU_RING_TYPE_SDMA:
2914 timeout = adev->sdma_timeout;
2915 break;
2916 default:
2917 timeout = adev->video_timeout;
2918 break;
2919 }
2920
2921 args.timeout = timeout;
2922 args.credit_limit = ring->num_hw_submission;
2923 args.score = ring->sched_score;
2924 args.name = ring->name;
2925
2926 r = drm_sched_init(&ring->sched, &args);
2927 if (r) {
2928 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2929 ring->name);
2930 return r;
2931 }
2932 r = amdgpu_uvd_entity_init(adev, ring);
2933 if (r) {
2934 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2935 ring->name);
2936 return r;
2937 }
2938 r = amdgpu_vce_entity_init(adev, ring);
2939 if (r) {
2940 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2941 ring->name);
2942 return r;
2943 }
2944 }
2945
2946 amdgpu_xcp_update_partition_sched_list(adev);
2947
2948 return 0;
2949 }
2950
2951
2952 /**
2953 * amdgpu_device_ip_init - run init for hardware IPs
2954 *
2955 * @adev: amdgpu_device pointer
2956 *
2957 * Main initialization pass for hardware IPs. The list of all the hardware
2958 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2959 * are run. sw_init initializes the software state associated with each IP
2960 * and hw_init initializes the hardware associated with each IP.
2961 * Returns 0 on success, negative error code on failure.
2962 */
2963 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2964 {
2965 bool init_badpage;
2966 int i, r;
2967
2968 r = amdgpu_ras_init(adev);
2969 if (r)
2970 return r;
2971
2972 for (i = 0; i < adev->num_ip_blocks; i++) {
2973 if (!adev->ip_blocks[i].status.valid)
2974 continue;
2975 if (adev->ip_blocks[i].version->funcs->sw_init) {
2976 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
2977 if (r) {
2978 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2979 adev->ip_blocks[i].version->funcs->name, r);
2980 goto init_failed;
2981 }
2982 }
2983 adev->ip_blocks[i].status.sw = true;
2984
2985 if (!amdgpu_ip_member_of_hwini(
2986 adev, adev->ip_blocks[i].version->type))
2987 continue;
2988
2989 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2990 /* need to do common hw init early so everything is set up for gmc */
2991 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2992 if (r) {
2993 DRM_ERROR("hw_init %d failed %d\n", i, r);
2994 goto init_failed;
2995 }
2996 adev->ip_blocks[i].status.hw = true;
2997 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2998 /* need to do gmc hw init early so we can allocate gpu mem */
2999 /* Try to reserve bad pages early */
3000 if (amdgpu_sriov_vf(adev))
3001 amdgpu_virt_exchange_data(adev);
3002
3003 r = amdgpu_device_mem_scratch_init(adev);
3004 if (r) {
3005 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
3006 goto init_failed;
3007 }
3008 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3009 if (r) {
3010 DRM_ERROR("hw_init %d failed %d\n", i, r);
3011 goto init_failed;
3012 }
3013 r = amdgpu_device_wb_init(adev);
3014 if (r) {
3015 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
3016 goto init_failed;
3017 }
3018 adev->ip_blocks[i].status.hw = true;
3019
3020 /* right after GMC hw init, we create CSA */
3021 if (adev->gfx.mcbp) {
3022 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
3023 AMDGPU_GEM_DOMAIN_VRAM |
3024 AMDGPU_GEM_DOMAIN_GTT,
3025 AMDGPU_CSA_SIZE);
3026 if (r) {
3027 DRM_ERROR("allocate CSA failed %d\n", r);
3028 goto init_failed;
3029 }
3030 }
3031
3032 r = amdgpu_seq64_init(adev);
3033 if (r) {
3034 DRM_ERROR("allocate seq64 failed %d\n", r);
3035 goto init_failed;
3036 }
3037 }
3038 }
3039
3040 if (amdgpu_sriov_vf(adev))
3041 amdgpu_virt_init_data_exchange(adev);
3042
3043 r = amdgpu_ib_pool_init(adev);
3044 if (r) {
3045 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
3046 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
3047 goto init_failed;
3048 }
3049
3050 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
3051 if (r)
3052 goto init_failed;
3053
3054 r = amdgpu_device_ip_hw_init_phase1(adev);
3055 if (r)
3056 goto init_failed;
3057
3058 r = amdgpu_device_fw_loading(adev);
3059 if (r)
3060 goto init_failed;
3061
3062 r = amdgpu_device_ip_hw_init_phase2(adev);
3063 if (r)
3064 goto init_failed;
3065
3066 /*
3067 * retired pages will be loaded from eeprom and reserved here,
3068 	 * it should be called after amdgpu_device_ip_hw_init_phase2 since
3069 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
3070 	 * functional for I2C communication, which is only true at this point.
3071 	 *
3072 	 * amdgpu_ras_recovery_init may fail, but the caller only cares about a
3073 	 * failure caused by a bad gpu situation and stops the amdgpu init
3074 	 * process accordingly. For other failures, it still releases all the
3075 	 * resources and prints an error message rather than returning a
3076 	 * negative value to the upper level.
3077 	 *
3078 	 * Note: theoretically, this should be called before all vram allocations
3079 	 * to protect retired pages from being reused.
3080 */
3081 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3082 r = amdgpu_ras_recovery_init(adev, init_badpage);
3083 if (r)
3084 goto init_failed;
3085
3086 /**
3087 * In case of XGMI grab extra reference for reset domain for this device
3088 */
3089 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3090 if (amdgpu_xgmi_add_device(adev) == 0) {
3091 if (!amdgpu_sriov_vf(adev)) {
3092 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3093
3094 if (WARN_ON(!hive)) {
3095 r = -ENOENT;
3096 goto init_failed;
3097 }
3098
3099 if (!hive->reset_domain ||
3100 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3101 r = -ENOENT;
3102 amdgpu_put_xgmi_hive(hive);
3103 goto init_failed;
3104 }
3105
3106 /* Drop the early temporary reset domain we created for device */
3107 amdgpu_reset_put_reset_domain(adev->reset_domain);
3108 adev->reset_domain = hive->reset_domain;
3109 amdgpu_put_xgmi_hive(hive);
3110 }
3111 }
3112 }
3113
3114 r = amdgpu_device_init_schedulers(adev);
3115 if (r)
3116 goto init_failed;
3117
3118 if (adev->mman.buffer_funcs_ring->sched.ready)
3119 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3120
3121 /* Don't init kfd if whole hive need to be reset during init */
3122 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3123 kgd2kfd_init_zone_device(adev);
3124 amdgpu_amdkfd_device_init(adev);
3125 }
3126
3127 amdgpu_fru_get_product_info(adev);
3128
3129 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
3130 r = amdgpu_cper_init(adev);
3131
3132 init_failed:
3133
3134 return r;
3135 }
3136
3137 /**
3138 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3139 *
3140 * @adev: amdgpu_device pointer
3141 *
3142 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3143 * this function before a GPU reset. If the value is retained after a
3144 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3145 */
3146 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3147 {
3148 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3149 }
3150
3151 /**
3152 * amdgpu_device_check_vram_lost - check if vram is valid
3153 *
3154 * @adev: amdgpu_device pointer
3155 *
3156 * Checks the reset magic value written to the gart pointer in VRAM.
3157 * The driver calls this after a GPU reset to see if the contents of
3158 * VRAM have been lost or not.
3159 * Returns true if vram is lost, false if not.
3160 */
3161 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3162 {
3163 if (memcmp(adev->gart.ptr, adev->reset_magic,
3164 AMDGPU_RESET_MAGIC_NUM))
3165 return true;
3166
3167 if (!amdgpu_in_reset(adev))
3168 return false;
3169
3170 /*
3171 * For all ASICs with baco/mode1 reset, the VRAM is
3172 * always assumed to be lost.
3173 */
3174 switch (amdgpu_asic_reset_method(adev)) {
3175 case AMD_RESET_METHOD_BACO:
3176 case AMD_RESET_METHOD_MODE1:
3177 return true;
3178 default:
3179 return false;
3180 }
3181 }
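/*
 * Illustrative sketch, not part of the driver: the two helpers above form a
 * pair around a GPU reset, roughly like this.
 *
 *	amdgpu_device_fill_reset_magic(adev);	... before the reset
 *	... perform the ASIC reset ...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost)
 *		... re-upload VRAM contents / buffer objects ...
 */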
3182
3183 /**
3184 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3185 *
3186 * @adev: amdgpu_device pointer
3187 * @state: clockgating state (gate or ungate)
3188 *
3189 * The list of all the hardware IPs that make up the asic is walked and the
3190 * set_clockgating_state callbacks are run.
3191 * During late init this is used to enable clockgating for hardware IPs;
3192 * during fini or suspend it is used to disable clockgating for hardware IPs.
3193 * Returns 0 on success, negative error code on failure.
3194 */
3195
3196 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3197 enum amd_clockgating_state state)
3198 {
3199 int i, j, r;
3200
3201 if (amdgpu_emu_mode == 1)
3202 return 0;
3203
3204 for (j = 0; j < adev->num_ip_blocks; j++) {
3205 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3206 if (!adev->ip_blocks[i].status.late_initialized)
3207 continue;
3208 /* skip CG for GFX, SDMA on S0ix */
3209 if (adev->in_s0ix &&
3210 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3211 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3212 continue;
3213 /* skip CG for VCE/UVD, it's handled specially */
3214 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3215 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3216 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3217 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3218 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3219 /* enable clockgating to save power */
3220 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3221 state);
3222 if (r) {
3223 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3224 adev->ip_blocks[i].version->funcs->name, r);
3225 return r;
3226 }
3227 }
3228 }
3229
3230 return 0;
3231 }
3232
3233 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3234 enum amd_powergating_state state)
3235 {
3236 int i, j, r;
3237
3238 if (amdgpu_emu_mode == 1)
3239 return 0;
3240
3241 for (j = 0; j < adev->num_ip_blocks; j++) {
3242 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3243 if (!adev->ip_blocks[i].status.late_initialized)
3244 continue;
3245 /* skip PG for GFX, SDMA on S0ix */
3246 if (adev->in_s0ix &&
3247 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3248 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3249 continue;
3250 		/* skip PG for VCE/UVD, it's handled specially */
3251 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3252 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3253 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3254 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3255 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3256 /* enable powergating to save power */
3257 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3258 state);
3259 if (r) {
3260 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3261 adev->ip_blocks[i].version->funcs->name, r);
3262 return r;
3263 }
3264 }
3265 }
3266 return 0;
3267 }
3268
3269 static int amdgpu_device_enable_mgpu_fan_boost(void)
3270 {
3271 struct amdgpu_gpu_instance *gpu_ins;
3272 struct amdgpu_device *adev;
3273 int i, ret = 0;
3274
3275 mutex_lock(&mgpu_info.mutex);
3276
3277 /*
3278 * MGPU fan boost feature should be enabled
3279 * only when there are two or more dGPUs in
3280 * the system
3281 */
3282 if (mgpu_info.num_dgpu < 2)
3283 goto out;
3284
3285 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3286 gpu_ins = &(mgpu_info.gpu_ins[i]);
3287 adev = gpu_ins->adev;
3288 if (!(adev->flags & AMD_IS_APU) &&
3289 !gpu_ins->mgpu_fan_enabled) {
3290 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3291 if (ret)
3292 break;
3293
3294 gpu_ins->mgpu_fan_enabled = 1;
3295 }
3296 }
3297
3298 out:
3299 mutex_unlock(&mgpu_info.mutex);
3300
3301 return ret;
3302 }
3303
3304 /**
3305 * amdgpu_device_ip_late_init - run late init for hardware IPs
3306 *
3307 * @adev: amdgpu_device pointer
3308 *
3309 * Late initialization pass for hardware IPs. The list of all the hardware
3310 * IPs that make up the asic is walked and the late_init callbacks are run.
3311 * late_init covers any special initialization that an IP requires
3312 * after all of the IPs have been initialized or something that needs to happen
3313 * late in the init process.
3314 * Returns 0 on success, negative error code on failure.
3315 */
3316 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3317 {
3318 struct amdgpu_gpu_instance *gpu_instance;
3319 int i = 0, r;
3320
3321 for (i = 0; i < adev->num_ip_blocks; i++) {
3322 if (!adev->ip_blocks[i].status.hw)
3323 continue;
3324 if (adev->ip_blocks[i].version->funcs->late_init) {
3325 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3326 if (r) {
3327 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3328 adev->ip_blocks[i].version->funcs->name, r);
3329 return r;
3330 }
3331 }
3332 adev->ip_blocks[i].status.late_initialized = true;
3333 }
3334
3335 r = amdgpu_ras_late_init(adev);
3336 if (r) {
3337 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3338 return r;
3339 }
3340
3341 if (!amdgpu_reset_in_recovery(adev))
3342 amdgpu_ras_set_error_query_ready(adev, true);
3343
3344 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3345 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3346
3347 amdgpu_device_fill_reset_magic(adev);
3348
3349 r = amdgpu_device_enable_mgpu_fan_boost();
3350 if (r)
3351 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3352
3353 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3354 if (amdgpu_passthrough(adev) &&
3355 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3356 adev->asic_type == CHIP_ALDEBARAN))
3357 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3358
3359 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3360 mutex_lock(&mgpu_info.mutex);
3361
3362 /*
3363 		 * Reset the device p-state to low as it was booted with high.
3364 		 *
3365 		 * This should be performed only after all devices from the same
3366 		 * hive get initialized.
3367 		 *
3368 		 * However, the number of devices in the hive is not known in
3369 		 * advance; it is counted one by one as the devices are initialized.
3370 		 *
3371 		 * So we wait until all XGMI interlinked devices are initialized.
3372 		 * This may bring some delay as those devices may come from
3373 		 * different hives. But that should be OK.
3374 */
3375 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3376 for (i = 0; i < mgpu_info.num_gpu; i++) {
3377 gpu_instance = &(mgpu_info.gpu_ins[i]);
3378 if (gpu_instance->adev->flags & AMD_IS_APU)
3379 continue;
3380
3381 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3382 AMDGPU_XGMI_PSTATE_MIN);
3383 if (r) {
3384 DRM_ERROR("pstate setting failed (%d).\n", r);
3385 break;
3386 }
3387 }
3388 }
3389
3390 mutex_unlock(&mgpu_info.mutex);
3391 }
3392
3393 return 0;
3394 }
3395
3396 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3397 {
3398 int r;
3399
3400 if (!ip_block->version->funcs->hw_fini) {
3401 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3402 ip_block->version->funcs->name);
3403 } else {
3404 r = ip_block->version->funcs->hw_fini(ip_block);
3405 /* XXX handle errors */
3406 if (r) {
3407 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3408 ip_block->version->funcs->name, r);
3409 }
3410 }
3411
3412 ip_block->status.hw = false;
3413 }
3414
3415 /**
3416 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3417 *
3418 * @adev: amdgpu_device pointer
3419 *
3420 * For ASICs that need to disable the SMC first
3421 */
3422 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3423 {
3424 int i;
3425
3426 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3427 return;
3428
3429 for (i = 0; i < adev->num_ip_blocks; i++) {
3430 if (!adev->ip_blocks[i].status.hw)
3431 continue;
3432 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3433 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3434 break;
3435 }
3436 }
3437 }
3438
3439 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3440 {
3441 int i, r;
3442
3443 for (i = 0; i < adev->num_ip_blocks; i++) {
3444 if (!adev->ip_blocks[i].version->funcs->early_fini)
3445 continue;
3446
3447 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3448 if (r) {
3449 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3450 adev->ip_blocks[i].version->funcs->name, r);
3451 }
3452 }
3453
3454 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3455 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3456
3457 amdgpu_amdkfd_suspend(adev, false);
3458
3459 /* Workaround for ASICs need to disable SMC first */
3460 amdgpu_device_smu_fini_early(adev);
3461
3462 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3463 if (!adev->ip_blocks[i].status.hw)
3464 continue;
3465
3466 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3467 }
3468
3469 if (amdgpu_sriov_vf(adev)) {
3470 if (amdgpu_virt_release_full_gpu(adev, false))
3471 DRM_ERROR("failed to release exclusive mode on fini\n");
3472 }
3473
3474 return 0;
3475 }
3476
3477 /**
3478 * amdgpu_device_ip_fini - run fini for hardware IPs
3479 *
3480 * @adev: amdgpu_device pointer
3481 *
3482 * Main teardown pass for hardware IPs. The list of all the hardware
3483 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3484 * are run. hw_fini tears down the hardware associated with each IP
3485 * and sw_fini tears down any software state associated with each IP.
3486 * Returns 0 on success, negative error code on failure.
3487 */
3488 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3489 {
3490 int i, r;
3491
3492 amdgpu_cper_fini(adev);
3493
3494 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3495 amdgpu_virt_release_ras_err_handler_data(adev);
3496
3497 if (adev->gmc.xgmi.num_physical_nodes > 1)
3498 amdgpu_xgmi_remove_device(adev);
3499
3500 amdgpu_amdkfd_device_fini_sw(adev);
3501
3502 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3503 if (!adev->ip_blocks[i].status.sw)
3504 continue;
3505
3506 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3507 amdgpu_ucode_free_bo(adev);
3508 amdgpu_free_static_csa(&adev->virt.csa_obj);
3509 amdgpu_device_wb_fini(adev);
3510 amdgpu_device_mem_scratch_fini(adev);
3511 amdgpu_ib_pool_fini(adev);
3512 amdgpu_seq64_fini(adev);
3513 amdgpu_doorbell_fini(adev);
3514 }
3515 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3516 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3517 /* XXX handle errors */
3518 if (r) {
3519 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3520 adev->ip_blocks[i].version->funcs->name, r);
3521 }
3522 }
3523 adev->ip_blocks[i].status.sw = false;
3524 adev->ip_blocks[i].status.valid = false;
3525 }
3526
3527 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3528 if (!adev->ip_blocks[i].status.late_initialized)
3529 continue;
3530 if (adev->ip_blocks[i].version->funcs->late_fini)
3531 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3532 adev->ip_blocks[i].status.late_initialized = false;
3533 }
3534
3535 amdgpu_ras_fini(adev);
3536
3537 return 0;
3538 }
3539
3540 /**
3541 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3542 *
3543 * @work: work_struct.
3544 */
3545 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3546 {
3547 struct amdgpu_device *adev =
3548 container_of(work, struct amdgpu_device, delayed_init_work.work);
3549 int r;
3550
3551 r = amdgpu_ib_ring_tests(adev);
3552 if (r)
3553 DRM_ERROR("ib ring test failed (%d).\n", r);
3554 }
3555
3556 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3557 {
3558 struct amdgpu_device *adev =
3559 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3560
3561 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3562 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3563
3564 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3565 adev->gfx.gfx_off_state = true;
3566 }
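
/*
 * Pairing sketch (illustrative only; the request counting lives in
 * amdgpu_gfx.c): amdgpu_gfx_off_ctrl(adev, true) drops gfx_off_req_count and,
 * once it reaches zero, schedules the delayed work above to actually enter
 * GFXOFF, while amdgpu_gfx_off_ctrl(adev, false) cancels it and brings the
 * GFX block back out of GFXOFF.
 */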
3567
3568 /**
3569 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3570 *
3571 * @adev: amdgpu_device pointer
3572 *
3573 * Main suspend function for hardware IPs. The list of all the hardware
3574 * IPs that make up the asic is walked, clockgating is disabled and the
3575 * suspend callbacks are run. suspend puts the hardware and software state
3576 * in each IP into a state suitable for suspend.
3577 * Returns 0 on success, negative error code on failure.
3578 */
3579 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3580 {
3581 int i, r;
3582
3583 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3584 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3585
3586 /*
3587 * Per PMFW team's suggestion, driver needs to handle gfxoff
3588 * and df cstate feature disablement for the gpu reset (e.g. Mode1Reset)
3589 * scenario. Add the missing df cstate disablement here.
3590 */
3591 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3592 dev_warn(adev->dev, "Failed to disallow df cstate");
3593
3594 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3595 if (!adev->ip_blocks[i].status.valid)
3596 continue;
3597
3598 /* displays are handled separately */
3599 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3600 continue;
3601
3602 /* XXX handle errors */
3603 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3604 if (r)
3605 return r;
3606 }
3607
3608 return 0;
3609 }
3610
3611 /**
3612 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3613 *
3614 * @adev: amdgpu_device pointer
3615 *
3616 * Main suspend function for hardware IPs. The list of all the hardware
3617 * IPs that make up the asic is walked, clockgating is disabled and the
3618 * suspend callbacks are run. suspend puts the hardware and software state
3619 * in each IP into a state suitable for suspend.
3620 * Returns 0 on success, negative error code on failure.
3621 */
3622 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3623 {
3624 int i, r;
3625
3626 if (adev->in_s0ix)
3627 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3628
3629 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3630 if (!adev->ip_blocks[i].status.valid)
3631 continue;
3632 /* displays are handled in phase1 */
3633 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3634 continue;
3635 /* PSP lost connection when err_event_athub occurs */
3636 if (amdgpu_ras_intr_triggered() &&
3637 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3638 adev->ip_blocks[i].status.hw = false;
3639 continue;
3640 }
3641
3642 /* skip unnecessary suspend if we did not initialize them yet */
3643 if (!amdgpu_ip_member_of_hwini(
3644 adev, adev->ip_blocks[i].version->type))
3645 continue;
3646
3647 /* Since we skip suspend for S0i3, we need to cancel the delayed
3648 * idle work here as the suspend callback never gets called.
3649 */
3650 if (adev->in_s0ix &&
3651 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX &&
3652 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0))
3653 cancel_delayed_work_sync(&adev->gfx.idle_work);
3654 /* skip suspend of gfx/mes and psp for S0ix
3655 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3656 * like at runtime. PSP is also part of the always on hardware
3657 * so no need to suspend it.
3658 */
3659 if (adev->in_s0ix &&
3660 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3661 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3662 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3663 continue;
3664
3665 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3666 if (adev->in_s0ix &&
3667 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3668 IP_VERSION(5, 0, 0)) &&
3669 (adev->ip_blocks[i].version->type ==
3670 AMD_IP_BLOCK_TYPE_SDMA))
3671 continue;
3672
3673 /* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3674 * These live in the TMR and are expected to be reused by PSP-TOS to reload
3675 * from that location; RLC autoload also gets loaded from there based on the
3676 * PMFW -> PSP message during the re-init sequence.
3677 * Therefore, PSP suspend & resume should be skipped to avoid destroying
3678 * the TMR and reloading the FWs again on IMU enabled APU ASICs.
3679 */
3680 if (amdgpu_in_reset(adev) &&
3681 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3682 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3683 continue;
3684
3685 /* XXX handle errors */
3686 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3687 adev->ip_blocks[i].status.hw = false;
3688
3689 /* handle putting the SMC in the appropriate state */
3690 if (!amdgpu_sriov_vf(adev)) {
3691 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3692 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3693 if (r) {
3694 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3695 adev->mp1_state, r);
3696 return r;
3697 }
3698 }
3699 }
3700 }
3701
3702 return 0;
3703 }
3704
3705 /**
3706 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3707 *
3708 * @adev: amdgpu_device pointer
3709 *
3710 * Main suspend function for hardware IPs. The list of all the hardware
3711 * IPs that make up the asic is walked, clockgating is disabled and the
3712 * suspend callbacks are run. suspend puts the hardware and software state
3713 * in each IP into a state suitable for suspend.
3714 * Returns 0 on success, negative error code on failure.
3715 */
3716 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3717 {
3718 int r;
3719
3720 if (amdgpu_sriov_vf(adev)) {
3721 amdgpu_virt_fini_data_exchange(adev);
3722 amdgpu_virt_request_full_gpu(adev, false);
3723 }
3724
3725 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3726
3727 r = amdgpu_device_ip_suspend_phase1(adev);
3728 if (r)
3729 return r;
3730 r = amdgpu_device_ip_suspend_phase2(adev);
3731
3732 if (amdgpu_sriov_vf(adev))
3733 amdgpu_virt_release_full_gpu(adev, false);
3734
3735 return r;
3736 }
3737
3738 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3739 {
3740 int i, r;
3741
3742 static enum amd_ip_block_type ip_order[] = {
3743 AMD_IP_BLOCK_TYPE_COMMON,
3744 AMD_IP_BLOCK_TYPE_GMC,
3745 AMD_IP_BLOCK_TYPE_PSP,
3746 AMD_IP_BLOCK_TYPE_IH,
3747 };
3748
3749 for (i = 0; i < adev->num_ip_blocks; i++) {
3750 int j;
3751 struct amdgpu_ip_block *block;
3752
3753 block = &adev->ip_blocks[i];
3754 block->status.hw = false;
3755
3756 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3757
3758 if (block->version->type != ip_order[j] ||
3759 !block->status.valid)
3760 continue;
3761
3762 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3763 if (r) {
3764 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3765 block->version->funcs->name);
3766 return r;
3767 }
3768 block->status.hw = true;
3769 }
3770 }
3771
3772 return 0;
3773 }
3774
3775 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3776 {
3777 struct amdgpu_ip_block *block;
3778 int i, r = 0;
3779
3780 static enum amd_ip_block_type ip_order[] = {
3781 AMD_IP_BLOCK_TYPE_SMC,
3782 AMD_IP_BLOCK_TYPE_DCE,
3783 AMD_IP_BLOCK_TYPE_GFX,
3784 AMD_IP_BLOCK_TYPE_SDMA,
3785 AMD_IP_BLOCK_TYPE_MES,
3786 AMD_IP_BLOCK_TYPE_UVD,
3787 AMD_IP_BLOCK_TYPE_VCE,
3788 AMD_IP_BLOCK_TYPE_VCN,
3789 AMD_IP_BLOCK_TYPE_JPEG
3790 };
3791
3792 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3793 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3794
3795 if (!block)
3796 continue;
3797
3798 if (block->status.valid && !block->status.hw) {
3799 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3800 r = amdgpu_ip_block_resume(block);
3801 } else {
3802 r = block->version->funcs->hw_init(block);
3803 }
3804
3805 if (r) {
3806 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3807 block->version->funcs->name);
3808 break;
3809 }
3810 block->status.hw = true;
3811 }
3812 }
3813
3814 return r;
3815 }
3816
3817 /**
3818 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3819 *
3820 * @adev: amdgpu_device pointer
3821 *
3822 * First resume function for hardware IPs. The list of all the hardware
3823 * IPs that make up the asic is walked and the resume callbacks are run for
3824 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3825 * after a suspend and updates the software state as necessary. This
3826 * function is also used for restoring the GPU after a GPU reset.
3827 * Returns 0 on success, negative error code on failure.
3828 */
3829 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3830 {
3831 int i, r;
3832
3833 for (i = 0; i < adev->num_ip_blocks; i++) {
3834 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3835 continue;
3836 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3837 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3838 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3839 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3840
3841 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3842 if (r)
3843 return r;
3844 }
3845 }
3846
3847 return 0;
3848 }
3849
3850 /**
3851 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3852 *
3853 * @adev: amdgpu_device pointer
3854 *
3855 * Second resume function for hardware IPs. The list of all the hardware
3856 * IPs that make up the asic is walked and the resume callbacks are run for
3857 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3858 * functional state after a suspend and updates the software state as
3859 * necessary. This function is also used for restoring the GPU after a GPU
3860 * reset.
3861 * Returns 0 on success, negative error code on failure.
3862 */
3863 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3864 {
3865 int i, r;
3866
3867 for (i = 0; i < adev->num_ip_blocks; i++) {
3868 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3869 continue;
3870 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3871 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3872 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3873 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3874 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3875 continue;
3876 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3877 if (r)
3878 return r;
3879 }
3880
3881 return 0;
3882 }
3883
3884 /**
3885 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3886 *
3887 * @adev: amdgpu_device pointer
3888 *
3889 * Third resume function for hardware IPs. The list of all the hardware
3890 * IPs that make up the asic is walked and the resume callbacks are run for
3891 * all DCE. resume puts the hardware into a functional state after a suspend
3892 * and updates the software state as necessary. This function is also used
3893 * for restoring the GPU after a GPU reset.
3894 *
3895 * Returns 0 on success, negative error code on failure.
3896 */
3897 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3898 {
3899 int i, r;
3900
3901 for (i = 0; i < adev->num_ip_blocks; i++) {
3902 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3903 continue;
3904 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3905 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3906 if (r)
3907 return r;
3908 }
3909 }
3910
3911 return 0;
3912 }
3913
3914 /**
3915 * amdgpu_device_ip_resume - run resume for hardware IPs
3916 *
3917 * @adev: amdgpu_device pointer
3918 *
3919 * Main resume function for hardware IPs. The hardware IPs
3920 * are split into two resume functions because they are
3921 * also used in recovering from a GPU reset and some additional
3922 * steps need to be taken between them. In this case (S3/S4) they are
3923 * run sequentially.
3924 * Returns 0 on success, negative error code on failure.
3925 */
3926 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3927 {
3928 int r;
3929
3930 r = amdgpu_device_ip_resume_phase1(adev);
3931 if (r)
3932 return r;
3933
3934 r = amdgpu_device_fw_loading(adev);
3935 if (r)
3936 return r;
3937
3938 r = amdgpu_device_ip_resume_phase2(adev);
3939
3940 if (adev->mman.buffer_funcs_ring->sched.ready)
3941 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3942
3943 if (r)
3944 return r;
3945
3946 amdgpu_fence_driver_hw_init(adev);
3947
3948 r = amdgpu_device_ip_resume_phase3(adev);
3949
3950 return r;
3951 }
3952
3953 /**
3954 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3955 *
3956 * @adev: amdgpu_device pointer
3957 *
3958 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3959 */
3960 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3961 {
3962 if (amdgpu_sriov_vf(adev)) {
3963 if (adev->is_atom_fw) {
3964 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3965 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3966 } else {
3967 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3968 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3969 }
3970
3971 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3972 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3973 }
3974 }
3975
3976 /**
3977 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3978 *
3979 * @asic_type: AMD asic type
3980 *
3981 * Check if there is DC (new modesetting infrastructure) support for an asic.
3982 * returns true if DC has support, false if not.
3983 */
3984 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3985 {
3986 switch (asic_type) {
3987 #ifdef CONFIG_DRM_AMDGPU_SI
3988 case CHIP_HAINAN:
3989 #endif
3990 case CHIP_TOPAZ:
3991 /* chips with no display hardware */
3992 return false;
3993 #if defined(CONFIG_DRM_AMD_DC)
3994 case CHIP_TAHITI:
3995 case CHIP_PITCAIRN:
3996 case CHIP_VERDE:
3997 case CHIP_OLAND:
3998 /*
3999 * We have systems in the wild with these ASICs that require
4000 * LVDS and VGA support which is not supported with DC.
4001 *
4002 * Fallback to the non-DC driver here by default so as not to
4003 * cause regressions.
4004 */
4005 #if defined(CONFIG_DRM_AMD_DC_SI)
4006 return amdgpu_dc > 0;
4007 #else
4008 return false;
4009 #endif
4010 case CHIP_BONAIRE:
4011 case CHIP_KAVERI:
4012 case CHIP_KABINI:
4013 case CHIP_MULLINS:
4014 /*
4015 * We have systems in the wild with these ASICs that require
4016 * VGA support which is not supported with DC.
4017 *
4018 * Fallback to the non-DC driver here by default so as not to
4019 * cause regressions.
4020 */
4021 return amdgpu_dc > 0;
4022 default:
4023 return amdgpu_dc != 0;
4024 #else
4025 default:
4026 if (amdgpu_dc > 0)
4027 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4028 return false;
4029 #endif
4030 }
4031 }
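
/*
 * Illustrative note (not additional driver logic): the choice above can be
 * steered with the amdgpu "dc" module parameter. For example, on a Bonaire
 * board, amdgpu.dc=1 opts into DC despite the VGA caveat described above,
 * amdgpu.dc=0 forces the legacy display path, and leaving the parameter at
 * its auto default falls through to the per-ASIC decision implemented here.
 */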
4032
4033 /**
4034 * amdgpu_device_has_dc_support - check if dc is supported
4035 *
4036 * @adev: amdgpu_device pointer
4037 *
4038 * Returns true for supported, false for not supported
4039 */
4040 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
4041 {
4042 if (adev->enable_virtual_display ||
4043 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
4044 return false;
4045
4046 return amdgpu_device_asic_has_dc_support(adev->asic_type);
4047 }
4048
4049 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
4050 {
4051 struct amdgpu_device *adev =
4052 container_of(__work, struct amdgpu_device, xgmi_reset_work);
4053 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
4054
4055 /* It's a bug to not have a hive within this function */
4056 if (WARN_ON(!hive))
4057 return;
4058
4059 /*
4060 * Use task barrier to synchronize all xgmi reset works across the
4061 * hive. task_barrier_enter and task_barrier_exit will block
4062 * until all the threads running the xgmi reset works reach
4063 * those points. task_barrier_full will do both blocks.
4064 */
4065 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
4066
4067 task_barrier_enter(&hive->tb);
4068 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
4069
4070 if (adev->asic_reset_res)
4071 goto fail;
4072
4073 task_barrier_exit(&hive->tb);
4074 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
4075
4076 if (adev->asic_reset_res)
4077 goto fail;
4078
4079 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
4080 } else {
4081
4082 task_barrier_full(&hive->tb);
4083 adev->asic_reset_res = amdgpu_asic_reset(adev);
4084 }
4085
4086 fail:
4087 if (adev->asic_reset_res)
4088 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4089 adev->asic_reset_res, adev_to_drm(adev)->unique);
4090 amdgpu_put_xgmi_hive(hive);
4091 }
4092
4093 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4094 {
4095 char *input = amdgpu_lockup_timeout;
4096 char *timeout_setting = NULL;
4097 int index = 0;
4098 long timeout;
4099 int ret = 0;
4100
4101 /*
4102 * By default the timeout for non-compute jobs is 10000
4103 * and 60000 for compute jobs.
4104 * In SR-IOV or passthrough mode, the timeout for compute
4105 * jobs is 60000 by default.
4106 */
4107 adev->gfx_timeout = msecs_to_jiffies(10000);
4108 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4109 if (amdgpu_sriov_vf(adev))
4110 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4111 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4112 else
4113 adev->compute_timeout = msecs_to_jiffies(60000);
4114
4115 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4116 while ((timeout_setting = strsep(&input, ",")) &&
4117 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4118 ret = kstrtol(timeout_setting, 0, &timeout);
4119 if (ret)
4120 return ret;
4121
4122 if (timeout == 0) {
4123 index++;
4124 continue;
4125 } else if (timeout < 0) {
4126 timeout = MAX_SCHEDULE_TIMEOUT;
4127 dev_warn(adev->dev, "lockup timeout disabled");
4128 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4129 } else {
4130 timeout = msecs_to_jiffies(timeout);
4131 }
4132
4133 switch (index++) {
4134 case 0:
4135 adev->gfx_timeout = timeout;
4136 break;
4137 case 1:
4138 adev->compute_timeout = timeout;
4139 break;
4140 case 2:
4141 adev->sdma_timeout = timeout;
4142 break;
4143 case 3:
4144 adev->video_timeout = timeout;
4145 break;
4146 default:
4147 break;
4148 }
4149 }
4150 /*
4151 * There is only one value specified and
4152 * it should apply to all non-compute jobs.
4153 */
4154 if (index == 1) {
4155 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4156 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4157 adev->compute_timeout = adev->gfx_timeout;
4158 }
4159 }
4160
4161 return ret;
4162 }
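
/*
 * Usage sketch for the parsing above (the values are purely illustrative):
 * passing amdgpu.lockup_timeout=5000,120000,0,-1 on the kernel command line
 * sets the gfx timeout to 5000 ms, the compute timeout to 120000 ms, keeps
 * the SDMA default (0 means "leave unchanged") and disables the video
 * timeout (negative values map to MAX_SCHEDULE_TIMEOUT). A single value,
 * e.g. amdgpu.lockup_timeout=8000, applies to all non-compute jobs.
 */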
4163
4164 /**
4165 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4166 *
4167 * @adev: amdgpu_device pointer
4168 *
4169 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4170 */
4171 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4172 {
4173 struct iommu_domain *domain;
4174
4175 domain = iommu_get_domain_for_dev(adev->dev);
4176 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4177 adev->ram_is_direct_mapped = true;
4178 }
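
/*
 * For example (illustrative note): booting with the IOMMU disabled, or in
 * passthrough mode (iommu=pt on x86), typically leaves the device in an
 * identity domain, so ram_is_direct_mapped ends up true; a translating DMA
 * domain leaves it false.
 */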
4179
4180 #if defined(CONFIG_HSA_AMD_P2P)
4181 /**
4182 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4183 *
4184 * @adev: amdgpu_device pointer
4185 *
4186 * Returns true if the IOMMU is remapping the BAR address
4187 */
4188 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4189 {
4190 struct iommu_domain *domain;
4191
4192 domain = iommu_get_domain_for_dev(adev->dev);
4193 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4194 domain->type == IOMMU_DOMAIN_DMA_FQ))
4195 return true;
4196
4197 return false;
4198 }
4199 #endif
4200
4201 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4202 {
4203 if (amdgpu_mcbp == 1)
4204 adev->gfx.mcbp = true;
4205 else if (amdgpu_mcbp == 0)
4206 adev->gfx.mcbp = false;
4207
4208 if (amdgpu_sriov_vf(adev))
4209 adev->gfx.mcbp = true;
4210
4211 if (adev->gfx.mcbp)
4212 DRM_INFO("MCBP is enabled\n");
4213 }
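
/*
 * Illustrative usage (not additional driver logic): the behaviour above can
 * be steered with the amdgpu "mcbp" module parameter, e.g. amdgpu.mcbp=1
 * forces mid-command-buffer preemption on, amdgpu.mcbp=0 forces it off, and
 * any other value keeps the per-ASIC default; under SR-IOV it is always
 * enabled regardless of the parameter.
 */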
4214
4215 /**
4216 * amdgpu_device_init - initialize the driver
4217 *
4218 * @adev: amdgpu_device pointer
4219 * @flags: driver flags
4220 *
4221 * Initializes the driver info and hw (all asics).
4222 * Returns 0 for success or an error on failure.
4223 * Called at driver startup.
4224 */
4225 int amdgpu_device_init(struct amdgpu_device *adev,
4226 uint32_t flags)
4227 {
4228 struct drm_device *ddev = adev_to_drm(adev);
4229 struct pci_dev *pdev = adev->pdev;
4230 int r, i;
4231 bool px = false;
4232 u32 max_MBps;
4233 int tmp;
4234
4235 adev->shutdown = false;
4236 adev->flags = flags;
4237
4238 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4239 adev->asic_type = amdgpu_force_asic_type;
4240 else
4241 adev->asic_type = flags & AMD_ASIC_MASK;
4242
4243 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4244 if (amdgpu_emu_mode == 1)
4245 adev->usec_timeout *= 10;
4246 adev->gmc.gart_size = 512 * 1024 * 1024;
4247 adev->accel_working = false;
4248 adev->num_rings = 0;
4249 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4250 adev->mman.buffer_funcs = NULL;
4251 adev->mman.buffer_funcs_ring = NULL;
4252 adev->vm_manager.vm_pte_funcs = NULL;
4253 adev->vm_manager.vm_pte_num_scheds = 0;
4254 adev->gmc.gmc_funcs = NULL;
4255 adev->harvest_ip_mask = 0x0;
4256 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4257 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4258
4259 adev->smc_rreg = &amdgpu_invalid_rreg;
4260 adev->smc_wreg = &amdgpu_invalid_wreg;
4261 adev->pcie_rreg = &amdgpu_invalid_rreg;
4262 adev->pcie_wreg = &amdgpu_invalid_wreg;
4263 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4264 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4265 adev->pciep_rreg = &amdgpu_invalid_rreg;
4266 adev->pciep_wreg = &amdgpu_invalid_wreg;
4267 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4268 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4269 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4270 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4271 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4272 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4273 adev->didt_rreg = &amdgpu_invalid_rreg;
4274 adev->didt_wreg = &amdgpu_invalid_wreg;
4275 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4276 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4277 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4278 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4279
4280 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4281 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4282 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4283
4284 /* mutex initialization are all done here so we
4285 * can recall function without having locking issues
4286 */
4287 mutex_init(&adev->firmware.mutex);
4288 mutex_init(&adev->pm.mutex);
4289 mutex_init(&adev->gfx.gpu_clock_mutex);
4290 mutex_init(&adev->srbm_mutex);
4291 mutex_init(&adev->gfx.pipe_reserve_mutex);
4292 mutex_init(&adev->gfx.gfx_off_mutex);
4293 mutex_init(&adev->gfx.partition_mutex);
4294 mutex_init(&adev->grbm_idx_mutex);
4295 mutex_init(&adev->mn_lock);
4296 mutex_init(&adev->virt.vf_errors.lock);
4297 hash_init(adev->mn_hash);
4298 mutex_init(&adev->psp.mutex);
4299 mutex_init(&adev->notifier_lock);
4300 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4301 mutex_init(&adev->benchmark_mutex);
4302 mutex_init(&adev->gfx.reset_sem_mutex);
4303 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4304 mutex_init(&adev->enforce_isolation_mutex);
4305 for (i = 0; i < MAX_XCP; ++i) {
4306 adev->isolation[i].spearhead = dma_fence_get_stub();
4307 amdgpu_sync_create(&adev->isolation[i].active);
4308 amdgpu_sync_create(&adev->isolation[i].prev);
4309 }
4310 mutex_init(&adev->gfx.kfd_sch_mutex);
4311 mutex_init(&adev->gfx.workload_profile_mutex);
4312 mutex_init(&adev->vcn.workload_profile_mutex);
4313
4314 amdgpu_device_init_apu_flags(adev);
4315
4316 r = amdgpu_device_check_arguments(adev);
4317 if (r)
4318 return r;
4319
4320 spin_lock_init(&adev->mmio_idx_lock);
4321 spin_lock_init(&adev->smc_idx_lock);
4322 spin_lock_init(&adev->pcie_idx_lock);
4323 spin_lock_init(&adev->uvd_ctx_idx_lock);
4324 spin_lock_init(&adev->didt_idx_lock);
4325 spin_lock_init(&adev->gc_cac_idx_lock);
4326 spin_lock_init(&adev->se_cac_idx_lock);
4327 spin_lock_init(&adev->audio_endpt_idx_lock);
4328 spin_lock_init(&adev->mm_stats.lock);
4329 spin_lock_init(&adev->virt.rlcg_reg_lock);
4330 spin_lock_init(&adev->wb.lock);
4331
4332 INIT_LIST_HEAD(&adev->reset_list);
4333
4334 INIT_LIST_HEAD(&adev->ras_list);
4335
4336 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4337
4338 INIT_DELAYED_WORK(&adev->delayed_init_work,
4339 amdgpu_device_delayed_init_work_handler);
4340 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4341 amdgpu_device_delay_enable_gfx_off);
4342 /*
4343 * Initialize the enforce_isolation work structures for each XCP
4344 * partition. This work handler is responsible for enforcing shader
4345 * isolation on AMD GPUs. It counts the number of emitted fences for
4346 * each GFX and compute ring. If there are any fences, it schedules
4347 * the `enforce_isolation_work` to be run after a delay. If there are
4348 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4349 * runqueue.
4350 */
4351 for (i = 0; i < MAX_XCP; i++) {
4352 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4353 amdgpu_gfx_enforce_isolation_handler);
4354 adev->gfx.enforce_isolation[i].adev = adev;
4355 adev->gfx.enforce_isolation[i].xcp_id = i;
4356 }
4357
4358 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4359
4360 adev->gfx.gfx_off_req_count = 1;
4361 adev->gfx.gfx_off_residency = 0;
4362 adev->gfx.gfx_off_entrycount = 0;
4363 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4364
4365 atomic_set(&adev->throttling_logging_enabled, 1);
4366 /*
4367 * If throttling continues, logging will be performed every minute
4368 * to avoid log flooding. "-1" is subtracted since the thermal
4369 * throttling interrupt comes every second. Thus, the total logging
4370 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4371 * for the throttling interrupt) = 60 seconds.
4372 */
4373 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4374
4375 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4376
4377 /* Registers mapping */
4378 /* TODO: block userspace mapping of io register */
4379 if (adev->asic_type >= CHIP_BONAIRE) {
4380 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4381 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4382 } else {
4383 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4384 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4385 }
4386
4387 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4388 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4389
4390 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4391 if (!adev->rmmio)
4392 return -ENOMEM;
4393
4394 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4395 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4396
4397 /*
4398 * Reset domain needs to be present early, before the XGMI hive (if any)
4399 * is discovered and initialized, so that the reset sem and in_gpu_reset
4400 * flag can be used early during init and before calling RREG32.
4401 */
4402 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4403 if (!adev->reset_domain)
4404 return -ENOMEM;
4405
4406 /* detect hw virtualization here */
4407 amdgpu_virt_init(adev);
4408
4409 amdgpu_device_get_pcie_info(adev);
4410
4411 r = amdgpu_device_get_job_timeout_settings(adev);
4412 if (r) {
4413 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4414 return r;
4415 }
4416
4417 amdgpu_device_set_mcbp(adev);
4418
4419 /*
4420 * By default, use default mode where all blocks are expected to be
4421 * initialized. At present a 'swinit' of blocks is required to be
4422 * completed before the need for a different level is detected.
4423 */
4424 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4425 /* early init functions */
4426 r = amdgpu_device_ip_early_init(adev);
4427 if (r)
4428 return r;
4429
4430 /*
4431 * No need to remove conflicting FBs for non-display class devices.
4432 * This prevents the sysfb from being freed accidentally.
4433 */
4434 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4435 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4436 /* Get rid of things like offb */
4437 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4438 if (r)
4439 return r;
4440 }
4441
4442 /* Enable TMZ based on IP_VERSION */
4443 amdgpu_gmc_tmz_set(adev);
4444
4445 if (amdgpu_sriov_vf(adev) &&
4446 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4447 /* VF MMIO access (except mailbox range) from the CPU
4448 * will be blocked during SR-IOV runtime
4449 */
4450 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4451
4452 amdgpu_gmc_noretry_set(adev);
4453 /* Need to get xgmi info early to decide the reset behavior*/
4454 if (adev->gmc.xgmi.supported) {
4455 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4456 if (r)
4457 return r;
4458 }
4459
4460 /* enable PCIE atomic ops */
4461 if (amdgpu_sriov_vf(adev)) {
4462 if (adev->virt.fw_reserve.p_pf2vf)
4463 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4464 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4465 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4466 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an
4467 * internal path natively supports atomics, so set have_atomics_support to true.
4468 */
4469 } else if ((adev->flags & AMD_IS_APU) &&
4470 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4471 IP_VERSION(9, 0, 0))) {
4472 adev->have_atomics_support = true;
4473 } else {
4474 adev->have_atomics_support =
4475 !pci_enable_atomic_ops_to_root(adev->pdev,
4476 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4477 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4478 }
4479
4480 if (!adev->have_atomics_support)
4481 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4482
4483 /* doorbell bar mapping and doorbell index init*/
4484 amdgpu_doorbell_init(adev);
4485
4486 if (amdgpu_emu_mode == 1) {
4487 /* post the asic on emulation mode */
4488 emu_soc_asic_init(adev);
4489 goto fence_driver_init;
4490 }
4491
4492 amdgpu_reset_init(adev);
4493
4494 /* detect if we are with an SRIOV vbios */
4495 if (adev->bios)
4496 amdgpu_device_detect_sriov_bios(adev);
4497
4498 /* check if we need to reset the asic
4499 * E.g., driver was not cleanly unloaded previously, etc.
4500 */
4501 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4502 if (adev->gmc.xgmi.num_physical_nodes) {
4503 dev_info(adev->dev, "Pending hive reset.\n");
4504 amdgpu_set_init_level(adev,
4505 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4506 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4507 !amdgpu_device_has_display_hardware(adev)) {
4508 r = psp_gpu_reset(adev);
4509 } else {
4510 tmp = amdgpu_reset_method;
4511 /* It should do a default reset when loading or reloading the driver,
4512 * regardless of the module parameter reset_method.
4513 */
4514 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4515 r = amdgpu_asic_reset(adev);
4516 amdgpu_reset_method = tmp;
4517 }
4518
4519 if (r) {
4520 dev_err(adev->dev, "asic reset on init failed\n");
4521 goto failed;
4522 }
4523 }
4524
4525 /* Post card if necessary */
4526 if (amdgpu_device_need_post(adev)) {
4527 if (!adev->bios) {
4528 dev_err(adev->dev, "no vBIOS found\n");
4529 r = -EINVAL;
4530 goto failed;
4531 }
4532 DRM_INFO("GPU posting now...\n");
4533 r = amdgpu_device_asic_init(adev);
4534 if (r) {
4535 dev_err(adev->dev, "gpu post error!\n");
4536 goto failed;
4537 }
4538 }
4539
4540 if (adev->bios) {
4541 if (adev->is_atom_fw) {
4542 /* Initialize clocks */
4543 r = amdgpu_atomfirmware_get_clock_info(adev);
4544 if (r) {
4545 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4546 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4547 goto failed;
4548 }
4549 } else {
4550 /* Initialize clocks */
4551 r = amdgpu_atombios_get_clock_info(adev);
4552 if (r) {
4553 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4554 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4555 goto failed;
4556 }
4557 /* init i2c buses */
4558 amdgpu_i2c_init(adev);
4559 }
4560 }
4561
4562 fence_driver_init:
4563 /* Fence driver */
4564 r = amdgpu_fence_driver_sw_init(adev);
4565 if (r) {
4566 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4567 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4568 goto failed;
4569 }
4570
4571 /* init the mode config */
4572 drm_mode_config_init(adev_to_drm(adev));
4573
4574 r = amdgpu_device_ip_init(adev);
4575 if (r) {
4576 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4577 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4578 goto release_ras_con;
4579 }
4580
4581 amdgpu_fence_driver_hw_init(adev);
4582
4583 dev_info(adev->dev,
4584 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4585 adev->gfx.config.max_shader_engines,
4586 adev->gfx.config.max_sh_per_se,
4587 adev->gfx.config.max_cu_per_sh,
4588 adev->gfx.cu_info.number);
4589
4590 adev->accel_working = true;
4591
4592 amdgpu_vm_check_compute_bug(adev);
4593
4594 /* Initialize the buffer migration limit. */
4595 if (amdgpu_moverate >= 0)
4596 max_MBps = amdgpu_moverate;
4597 else
4598 max_MBps = 8; /* Allow 8 MB/s. */
4599 /* Get a log2 for easy divisions. */
4600 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
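/* Worked example: with the default max_MBps = 8, ilog2() yields 3, so the
 * buffer-migration throttling code can later convert with a shift by
 * log2_max_MBps instead of a division.
 */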
4601
4602 /*
4603 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4604 * Otherwise the mgpu fan boost feature will be skipped because the
4605 * gpu instance count would be too low.
4606 */
4607 amdgpu_register_gpu_instance(adev);
4608
4609 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4610 * explicit gating rather than handling it automatically.
4611 */
4612 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4613 r = amdgpu_device_ip_late_init(adev);
4614 if (r) {
4615 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4616 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4617 goto release_ras_con;
4618 }
4619 /* must succeed. */
4620 amdgpu_ras_resume(adev);
4621 queue_delayed_work(system_wq, &adev->delayed_init_work,
4622 msecs_to_jiffies(AMDGPU_RESUME_MS));
4623 }
4624
4625 if (amdgpu_sriov_vf(adev)) {
4626 amdgpu_virt_release_full_gpu(adev, true);
4627 flush_delayed_work(&adev->delayed_init_work);
4628 }
4629
4630 /*
4631 * Place the sysfs registration after `late_init`, as some of the
4632 * operations performed in `late_init` might affect the creation of
4633 * the sysfs interfaces.
4634 */
4635 r = amdgpu_atombios_sysfs_init(adev);
4636 if (r)
4637 drm_err(&adev->ddev,
4638 "registering atombios sysfs failed (%d).\n", r);
4639
4640 r = amdgpu_pm_sysfs_init(adev);
4641 if (r)
4642 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4643
4644 r = amdgpu_ucode_sysfs_init(adev);
4645 if (r) {
4646 adev->ucode_sysfs_en = false;
4647 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4648 } else
4649 adev->ucode_sysfs_en = true;
4650
4651 r = amdgpu_device_attr_sysfs_init(adev);
4652 if (r)
4653 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4654
4655 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4656 if (r)
4657 dev_err(adev->dev,
4658 "Could not create amdgpu board attributes\n");
4659
4660 amdgpu_fru_sysfs_init(adev);
4661 amdgpu_reg_state_sysfs_init(adev);
4662 amdgpu_xcp_cfg_sysfs_init(adev);
4663
4664 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4665 r = amdgpu_pmu_init(adev);
4666 if (r)
4667 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4668
4669 /* Have stored pci confspace at hand for restore in sudden PCI error */
4670 if (amdgpu_device_cache_pci_state(adev->pdev))
4671 pci_restore_state(pdev);
4672
4673 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4674 /* this will fail for cards that aren't VGA class devices, just
4675 * ignore it
4676 */
4677 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4678 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4679
4680 px = amdgpu_device_supports_px(ddev);
4681
4682 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4683 apple_gmux_detect(NULL, NULL)))
4684 vga_switcheroo_register_client(adev->pdev,
4685 &amdgpu_switcheroo_ops, px);
4686
4687 if (px)
4688 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4689
4690 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4691 amdgpu_xgmi_reset_on_init(adev);
4692
4693 amdgpu_device_check_iommu_direct_map(adev);
4694
4695 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4696 r = register_pm_notifier(&adev->pm_nb);
4697 if (r)
4698 goto failed;
4699
4700 return 0;
4701
4702 release_ras_con:
4703 if (amdgpu_sriov_vf(adev))
4704 amdgpu_virt_release_full_gpu(adev, true);
4705
4706 /* failed in exclusive mode due to timeout */
4707 if (amdgpu_sriov_vf(adev) &&
4708 !amdgpu_sriov_runtime(adev) &&
4709 amdgpu_virt_mmio_blocked(adev) &&
4710 !amdgpu_virt_wait_reset(adev)) {
4711 dev_err(adev->dev, "VF exclusive mode timeout\n");
4712 /* Don't send request since VF is inactive. */
4713 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4714 adev->virt.ops = NULL;
4715 r = -EAGAIN;
4716 }
4717 amdgpu_release_ras_context(adev);
4718
4719 failed:
4720 amdgpu_vf_error_trans_all(adev);
4721
4722 return r;
4723 }
4724
4725 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4726 {
4727
4728 /* Clear all CPU mappings pointing to this device */
4729 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4730
4731 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4732 amdgpu_doorbell_fini(adev);
4733
4734 iounmap(adev->rmmio);
4735 adev->rmmio = NULL;
4736 if (adev->mman.aper_base_kaddr)
4737 iounmap(adev->mman.aper_base_kaddr);
4738 adev->mman.aper_base_kaddr = NULL;
4739
4740 /* Memory manager related */
4741 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4742 arch_phys_wc_del(adev->gmc.vram_mtrr);
4743 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4744 }
4745 }
4746
4747 /**
4748 * amdgpu_device_fini_hw - tear down the driver
4749 *
4750 * @adev: amdgpu_device pointer
4751 *
4752 * Tear down the driver info (all asics).
4753 * Called at driver shutdown.
4754 */
4755 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4756 {
4757 dev_info(adev->dev, "amdgpu: finishing device.\n");
4758 flush_delayed_work(&adev->delayed_init_work);
4759
4760 if (adev->mman.initialized)
4761 drain_workqueue(adev->mman.bdev.wq);
4762 adev->shutdown = true;
4763
4764 unregister_pm_notifier(&adev->pm_nb);
4765
4766 /* make sure the IB tests are finished before entering exclusive mode
4767 * to avoid preemption on the IB tests
4768 */
4769 if (amdgpu_sriov_vf(adev)) {
4770 amdgpu_virt_request_full_gpu(adev, false);
4771 amdgpu_virt_fini_data_exchange(adev);
4772 }
4773
4774 /* disable all interrupts */
4775 amdgpu_irq_disable_all(adev);
4776 if (adev->mode_info.mode_config_initialized) {
4777 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4778 drm_helper_force_disable_all(adev_to_drm(adev));
4779 else
4780 drm_atomic_helper_shutdown(adev_to_drm(adev));
4781 }
4782 amdgpu_fence_driver_hw_fini(adev);
4783
4784 if (adev->pm.sysfs_initialized)
4785 amdgpu_pm_sysfs_fini(adev);
4786 if (adev->ucode_sysfs_en)
4787 amdgpu_ucode_sysfs_fini(adev);
4788 amdgpu_device_attr_sysfs_fini(adev);
4789 amdgpu_fru_sysfs_fini(adev);
4790
4791 amdgpu_reg_state_sysfs_fini(adev);
4792 amdgpu_xcp_cfg_sysfs_fini(adev);
4793
4794 /* disable ras feature must before hw fini */
4795 amdgpu_ras_pre_fini(adev);
4796
4797 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4798
4799 amdgpu_device_ip_fini_early(adev);
4800
4801 amdgpu_irq_fini_hw(adev);
4802
4803 if (adev->mman.initialized)
4804 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4805
4806 amdgpu_gart_dummy_page_fini(adev);
4807
4808 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4809 amdgpu_device_unmap_mmio(adev);
4810
4811 }
4812
4813 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4814 {
4815 int i, idx;
4816 bool px;
4817
4818 amdgpu_device_ip_fini(adev);
4819 amdgpu_fence_driver_sw_fini(adev);
4820 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4821 adev->accel_working = false;
4822 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4823 for (i = 0; i < MAX_XCP; ++i) {
4824 dma_fence_put(adev->isolation[i].spearhead);
4825 amdgpu_sync_free(&adev->isolation[i].active);
4826 amdgpu_sync_free(&adev->isolation[i].prev);
4827 }
4828
4829 amdgpu_reset_fini(adev);
4830
4831 /* free i2c buses */
4832 amdgpu_i2c_fini(adev);
4833
4834 if (adev->bios) {
4835 if (amdgpu_emu_mode != 1)
4836 amdgpu_atombios_fini(adev);
4837 amdgpu_bios_release(adev);
4838 }
4839
4840 kfree(adev->fru_info);
4841 adev->fru_info = NULL;
4842
4843 kfree(adev->xcp_mgr);
4844 adev->xcp_mgr = NULL;
4845
4846 px = amdgpu_device_supports_px(adev_to_drm(adev));
4847
4848 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4849 apple_gmux_detect(NULL, NULL)))
4850 vga_switcheroo_unregister_client(adev->pdev);
4851
4852 if (px)
4853 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4854
4855 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4856 vga_client_unregister(adev->pdev);
4857
4858 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4859
4860 iounmap(adev->rmmio);
4861 adev->rmmio = NULL;
4862 drm_dev_exit(idx);
4863 }
4864
4865 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4866 amdgpu_pmu_fini(adev);
4867 if (adev->mman.discovery_bin)
4868 amdgpu_discovery_fini(adev);
4869
4870 amdgpu_reset_put_reset_domain(adev->reset_domain);
4871 adev->reset_domain = NULL;
4872
4873 kfree(adev->pci_state);
4874
4875 }
4876
4877 /**
4878 * amdgpu_device_evict_resources - evict device resources
4879 * @adev: amdgpu device object
4880 *
4881 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4882 * of the vram memory type. Mainly used for evicting device resources
4883 * at suspend time.
4884 *
4885 */
4886 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4887 {
4888 int ret;
4889
4890 /* No need to evict vram on APUs unless going to S4 */
4891 if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
4892 return 0;
4893
4894 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4895 if (ret)
4896 DRM_WARN("evicting device resources failed\n");
4897 return ret;
4898 }
4899
4900 /*
4901 * Suspend & resume.
4902 */
4903 /**
4904 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4905 * @nb: notifier block
4906 * @mode: suspend mode
4907 * @data: data
4908 *
4909 * This function is called when the system is about to suspend or hibernate.
4910 * It is used to set the appropriate flags so that eviction can be optimized
4911 * in the pm prepare callback.
4912 */
4913 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4914 void *data)
4915 {
4916 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4917
4918 switch (mode) {
4919 case PM_HIBERNATION_PREPARE:
4920 adev->in_s4 = true;
4921 break;
4922 case PM_POST_HIBERNATION:
4923 adev->in_s4 = false;
4924 break;
4925 }
4926
4927 return NOTIFY_DONE;
4928 }
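
/*
 * Registration sketch (the actual calls appear in amdgpu_device_init() and
 * amdgpu_device_fini_hw() earlier in this file); the generic pattern is:
 *
 *	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
 *	register_pm_notifier(&adev->pm_nb);
 *	...
 *	unregister_pm_notifier(&adev->pm_nb);
 */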
4929
4930 /**
4931 * amdgpu_device_prepare - prepare for device suspend
4932 *
4933 * @dev: drm dev pointer
4934 *
4935 * Prepare to put the hw in the suspend state (all asics).
4936 * Returns 0 for success or an error on failure.
4937 * Called at driver suspend.
4938 */
4939 int amdgpu_device_prepare(struct drm_device *dev)
4940 {
4941 struct amdgpu_device *adev = drm_to_adev(dev);
4942 int i, r;
4943
4944 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4945 return 0;
4946
4947 /* Evict the majority of BOs before starting suspend sequence */
4948 r = amdgpu_device_evict_resources(adev);
4949 if (r)
4950 return r;
4951
4952 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4953
4954 for (i = 0; i < adev->num_ip_blocks; i++) {
4955 if (!adev->ip_blocks[i].status.valid)
4956 continue;
4957 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4958 continue;
4959 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
4960 if (r)
4961 return r;
4962 }
4963
4964 return 0;
4965 }
4966
4967 /**
4968 * amdgpu_device_suspend - initiate device suspend
4969 *
4970 * @dev: drm dev pointer
4971 * @notify_clients: notify in-kernel DRM clients
4972 *
4973 * Puts the hw in the suspend state (all asics).
4974 * Returns 0 for success or an error on failure.
4975 * Called at driver suspend.
4976 */
4977 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
4978 {
4979 struct amdgpu_device *adev = drm_to_adev(dev);
4980 int r = 0;
4981
4982 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4983 return 0;
4984
4985 adev->in_suspend = true;
4986
4987 if (amdgpu_sriov_vf(adev)) {
4988 amdgpu_virt_fini_data_exchange(adev);
4989 r = amdgpu_virt_request_full_gpu(adev, false);
4990 if (r)
4991 return r;
4992 }
4993
4994 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4995 DRM_WARN("smart shift update failed\n");
4996
4997 if (notify_clients)
4998 drm_client_dev_suspend(adev_to_drm(adev), false);
4999
5000 cancel_delayed_work_sync(&adev->delayed_init_work);
5001
5002 amdgpu_ras_suspend(adev);
5003
5004 amdgpu_device_ip_suspend_phase1(adev);
5005
5006 if (!adev->in_s0ix)
5007 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
5008
5009 r = amdgpu_device_evict_resources(adev);
5010 if (r)
5011 return r;
5012
5013 amdgpu_ttm_set_buffer_funcs_status(adev, false);
5014
5015 amdgpu_fence_driver_hw_fini(adev);
5016
5017 amdgpu_device_ip_suspend_phase2(adev);
5018
5019 if (amdgpu_sriov_vf(adev))
5020 amdgpu_virt_release_full_gpu(adev, false);
5021
5022 r = amdgpu_dpm_notify_rlc_state(adev, false);
5023 if (r)
5024 return r;
5025
5026 return 0;
5027 }
5028
5029 /**
5030 * amdgpu_device_resume - initiate device resume
5031 *
5032 * @dev: drm dev pointer
5033 * @notify_clients: notify in-kernel DRM clients
5034 *
5035 * Bring the hw back to operating state (all asics).
5036 * Returns 0 for success or an error on failure.
5037 * Called at driver resume.
5038 */
5039 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
5040 {
5041 struct amdgpu_device *adev = drm_to_adev(dev);
5042 int r = 0;
5043
5044 if (amdgpu_sriov_vf(adev)) {
5045 r = amdgpu_virt_request_full_gpu(adev, true);
5046 if (r)
5047 return r;
5048 }
5049
5050 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5051 return 0;
5052
5053 if (adev->in_s0ix)
5054 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
5055
5056 /* post card */
5057 if (amdgpu_device_need_post(adev)) {
5058 r = amdgpu_device_asic_init(adev);
5059 if (r)
5060 dev_err(adev->dev, "amdgpu asic init failed\n");
5061 }
5062
5063 r = amdgpu_device_ip_resume(adev);
5064
5065 if (r) {
5066 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
5067 goto exit;
5068 }
5069
5070 if (!adev->in_s0ix) {
5071 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
5072 if (r)
5073 goto exit;
5074 }
5075
5076 r = amdgpu_device_ip_late_init(adev);
5077 if (r)
5078 goto exit;
5079
5080 queue_delayed_work(system_wq, &adev->delayed_init_work,
5081 msecs_to_jiffies(AMDGPU_RESUME_MS));
5082 exit:
5083 if (amdgpu_sriov_vf(adev)) {
5084 amdgpu_virt_init_data_exchange(adev);
5085 amdgpu_virt_release_full_gpu(adev, true);
5086 }
5087
5088 if (r)
5089 return r;
5090
5091 /* Make sure IB tests flushed */
5092 flush_delayed_work(&adev->delayed_init_work);
5093
5094 if (notify_clients)
5095 drm_client_dev_resume(adev_to_drm(adev), false);
5096
5097 amdgpu_ras_resume(adev);
5098
5099 if (adev->mode_info.num_crtc) {
5100 /*
5101 * Most of the connector probing functions try to acquire runtime pm
5102 * refs to ensure that the GPU is powered on when connector polling is
5103 * performed. Since we're calling this from a runtime PM callback,
5104 * trying to acquire rpm refs will cause us to deadlock.
5105 *
5106 * Since we're guaranteed to be holding the rpm lock, it's safe to
5107 * temporarily disable the rpm helpers so this doesn't deadlock us.
5108 */
5109 #ifdef CONFIG_PM
5110 dev->dev->power.disable_depth++;
5111 #endif
5112 if (!adev->dc_enabled)
5113 drm_helper_hpd_irq_event(dev);
5114 else
5115 drm_kms_helper_hotplug_event(dev);
5116 #ifdef CONFIG_PM
5117 dev->dev->power.disable_depth--;
5118 #endif
5119 }
5120 adev->in_suspend = false;
5121
5122 if (adev->enable_mes)
5123 amdgpu_mes_self_test(adev);
5124
5125 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5126 DRM_WARN("smart shift update failed\n");
5127
5128 return 0;
5129 }
5130
5131 /**
5132 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5133 *
5134 * @adev: amdgpu_device pointer
5135 *
5136 * The list of all the hardware IPs that make up the asic is walked and
5137 * the check_soft_reset callbacks are run. check_soft_reset determines
5138 * if the asic is still hung or not.
5139 * Returns true if any of the IPs are still in a hung state, false if not.
5140 */
5141 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5142 {
5143 int i;
5144 bool asic_hang = false;
5145
5146 if (amdgpu_sriov_vf(adev))
5147 return true;
5148
5149 if (amdgpu_asic_need_full_reset(adev))
5150 return true;
5151
5152 for (i = 0; i < adev->num_ip_blocks; i++) {
5153 if (!adev->ip_blocks[i].status.valid)
5154 continue;
5155 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5156 adev->ip_blocks[i].status.hang =
5157 adev->ip_blocks[i].version->funcs->check_soft_reset(
5158 &adev->ip_blocks[i]);
5159 if (adev->ip_blocks[i].status.hang) {
5160 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5161 asic_hang = true;
5162 }
5163 }
5164 return asic_hang;
5165 }
5166
5167 /**
5168 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5169 *
5170 * @adev: amdgpu_device pointer
5171 *
5172 * The list of all the hardware IPs that make up the asic is walked and the
5173 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5174 * handles any IP specific hardware or software state changes that are
5175 * necessary for a soft reset to succeed.
5176 * Returns 0 on success, negative error code on failure.
5177 */
5178 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5179 {
5180 int i, r = 0;
5181
5182 for (i = 0; i < adev->num_ip_blocks; i++) {
5183 if (!adev->ip_blocks[i].status.valid)
5184 continue;
5185 if (adev->ip_blocks[i].status.hang &&
5186 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5187 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5188 if (r)
5189 return r;
5190 }
5191 }
5192
5193 return 0;
5194 }
5195
5196 /**
5197 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5198 *
5199 * @adev: amdgpu_device pointer
5200 *
5201 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5202 * reset is necessary to recover.
5203 * Returns true if a full asic reset is required, false if not.
5204 */
5205 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5206 {
5207 int i;
5208
5209 if (amdgpu_asic_need_full_reset(adev))
5210 return true;
5211
5212 for (i = 0; i < adev->num_ip_blocks; i++) {
5213 if (!adev->ip_blocks[i].status.valid)
5214 continue;
5215 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5216 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5217 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5218 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5220 if (adev->ip_blocks[i].status.hang) {
5221 dev_info(adev->dev, "Some block need full reset!\n");
5222 return true;
5223 }
5224 }
5225 }
5226 return false;
5227 }
5228
5229 /**
5230 * amdgpu_device_ip_soft_reset - do a soft reset
5231 *
5232 * @adev: amdgpu_device pointer
5233 *
5234 * The list of all the hardware IPs that make up the asic is walked and the
5235 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5236 * IP specific hardware or software state changes that are necessary to soft
5237 * reset the IP.
5238 * Returns 0 on success, negative error code on failure.
5239 */
5240 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5241 {
5242 int i, r = 0;
5243
5244 for (i = 0; i < adev->num_ip_blocks; i++) {
5245 if (!adev->ip_blocks[i].status.valid)
5246 continue;
5247 if (adev->ip_blocks[i].status.hang &&
5248 adev->ip_blocks[i].version->funcs->soft_reset) {
5249 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5250 if (r)
5251 return r;
5252 }
5253 }
5254
5255 return 0;
5256 }
5257
5258 /**
5259 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5260 *
5261 * @adev: amdgpu_device pointer
5262 *
5263 * The list of all the hardware IPs that make up the asic is walked and the
5264 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5265 * handles any IP specific hardware or software state changes that are
5266 * necessary after the IP has been soft reset.
5267 * Returns 0 on success, negative error code on failure.
5268 */
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5270 {
5271 int i, r = 0;
5272
5273 for (i = 0; i < adev->num_ip_blocks; i++) {
5274 if (!adev->ip_blocks[i].status.valid)
5275 continue;
5276 if (adev->ip_blocks[i].status.hang &&
5277 adev->ip_blocks[i].version->funcs->post_soft_reset)
5278 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5279 if (r)
5280 return r;
5281 }
5282
5283 return 0;
5284 }
5285
5286 /**
5287 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5288 *
5289 * @adev: amdgpu_device pointer
5290 * @reset_context: amdgpu reset context pointer
5291 *
 * Do a VF FLR and reinitialize the ASIC.
 * Returns 0 on success, negative error code on failure.
5294 */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5296 struct amdgpu_reset_context *reset_context)
5297 {
5298 int r;
5299 struct amdgpu_hive_info *hive = NULL;
5300
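/*
 * Two entry paths: for a host-initiated FLR, signal readiness (unless a
 * fatal RAS error was flagged), wait for the host to complete the FLR and
 * then request full GPU access; otherwise ask the host to reset the GPU
 * on our behalf.
 */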
5301 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5302 if (!amdgpu_ras_get_fed_status(adev))
5303 amdgpu_virt_ready_to_reset(adev);
5304 amdgpu_virt_wait_reset(adev);
5305 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5306 r = amdgpu_virt_request_full_gpu(adev, true);
5307 } else {
5308 r = amdgpu_virt_reset_gpu(adev);
5309 }
5310 if (r)
5311 return r;
5312
5313 amdgpu_ras_clear_err_state(adev);
5314 amdgpu_irq_gpu_reset_resume_helper(adev);
5315
/* some SW cleanup the VF needs to do before recovery */
5317 amdgpu_virt_post_reset(adev);
5318
5319 /* Resume IP prior to SMC */
5320 r = amdgpu_device_ip_reinit_early_sriov(adev);
5321 if (r)
5322 return r;
5323
5324 amdgpu_virt_init_data_exchange(adev);
5325
5326 r = amdgpu_device_fw_loading(adev);
5327 if (r)
5328 return r;
5329
5330 /* now we are okay to resume SMC/CP/SDMA */
5331 r = amdgpu_device_ip_reinit_late_sriov(adev);
5332 if (r)
5333 return r;
5334
5335 hive = amdgpu_get_xgmi_hive(adev);
5336 /* Update PSP FW topology after reset */
5337 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5338 r = amdgpu_xgmi_update_topology(hive, adev);
5339 if (hive)
5340 amdgpu_put_xgmi_hive(hive);
5341 if (r)
5342 return r;
5343
5344 r = amdgpu_ib_ring_tests(adev);
5345 if (r)
5346 return r;
5347
5348 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5349 amdgpu_inc_vram_lost(adev);
5350
/* Needs to be called while we have full GPU access, so we can't do it
 * later like bare metal does.
 */
5354 amdgpu_amdkfd_post_reset(adev);
5355 amdgpu_virt_release_full_gpu(adev, true);
5356
/* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so resume RAS during reset */
5358 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5359 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5360 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5361 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
5362 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5363 amdgpu_ras_resume(adev);
5364
5365 amdgpu_virt_ras_telemetry_post_reset(adev);
5366
5367 return 0;
5368 }
5369
5370 /**
5371 * amdgpu_device_has_job_running - check if there is any unfinished job
5372 *
5373 * @adev: amdgpu_device pointer
5374 *
 * Check if there is any job running on the device when the guest driver
 * receives an FLR notification from the host driver. If there are still
 * jobs running, the guest driver will not respond to the FLR reset.
 * Instead, let the jobs hit their timeout and have the guest driver issue
 * the reset request then.
5379 */
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5381 {
5382 int i;
5383
5384 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5385 struct amdgpu_ring *ring = adev->rings[i];
5386
5387 if (!amdgpu_ring_sched_ready(ring))
5388 continue;
5389
5390 if (amdgpu_fence_count_emitted(ring))
5391 return true;
5392 }
5393 return false;
5394 }
5395
5396 /**
5397 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5398 *
5399 * @adev: amdgpu_device pointer
5400 *
5401 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5402 * a hung GPU.
5403 */
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5405 {
5406
5407 if (amdgpu_gpu_recovery == 0)
5408 goto disabled;
5409
5410 /* Skip soft reset check in fatal error mode */
5411 if (!amdgpu_ras_is_poison_mode_supported(adev))
5412 return true;
5413
5414 if (amdgpu_sriov_vf(adev))
5415 return true;
5416
5417 if (amdgpu_gpu_recovery == -1) {
5418 switch (adev->asic_type) {
5419 #ifdef CONFIG_DRM_AMDGPU_SI
5420 case CHIP_VERDE:
5421 case CHIP_TAHITI:
5422 case CHIP_PITCAIRN:
5423 case CHIP_OLAND:
5424 case CHIP_HAINAN:
5425 #endif
5426 #ifdef CONFIG_DRM_AMDGPU_CIK
5427 case CHIP_KAVERI:
5428 case CHIP_KABINI:
5429 case CHIP_MULLINS:
5430 #endif
5431 case CHIP_CARRIZO:
5432 case CHIP_STONEY:
5433 case CHIP_CYAN_SKILLFISH:
5434 goto disabled;
5435 default:
5436 break;
5437 }
5438 }
5439
5440 return true;
5441
5442 disabled:
5443 dev_info(adev->dev, "GPU recovery disabled.\n");
5444 return false;
5445 }
5446
int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5448 {
5449 u32 i;
5450 int ret = 0;
5451
5452 if (adev->bios)
5453 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5454
5455 dev_info(adev->dev, "GPU mode1 reset\n");
5456
5457 /* Cache the state before bus master disable. The saved config space
5458 * values are used in other cases like restore after mode-2 reset.
5459 */
5460 amdgpu_device_cache_pci_state(adev->pdev);
5461
5462 /* disable BM */
5463 pci_clear_master(adev->pdev);
5464
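/*
 * Prefer the SMU-based mode1 reset when the SMU reports support for it;
 * otherwise fall back to a PSP-initiated mode1 reset.
 */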
5465 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5466 dev_info(adev->dev, "GPU smu mode1 reset\n");
5467 ret = amdgpu_dpm_mode1_reset(adev);
5468 } else {
5469 dev_info(adev->dev, "GPU psp mode1 reset\n");
5470 ret = psp_gpu_reset(adev);
5471 }
5472
5473 if (ret)
5474 goto mode1_reset_failed;
5475
5476 amdgpu_device_load_pci_state(adev->pdev);
5477 ret = amdgpu_psp_wait_for_bootloader(adev);
5478 if (ret)
5479 goto mode1_reset_failed;
5480
5481 /* wait for asic to come out of reset */
5482 for (i = 0; i < adev->usec_timeout; i++) {
5483 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5484
5485 if (memsize != 0xffffffff)
5486 break;
5487 udelay(1);
5488 }
5489
5490 if (i >= adev->usec_timeout) {
5491 ret = -ETIMEDOUT;
5492 goto mode1_reset_failed;
5493 }
5494
5495 if (adev->bios)
5496 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5497
5498 return 0;
5499
5500 mode1_reset_failed:
5501 dev_err(adev->dev, "GPU mode1 reset failed\n");
5502 return ret;
5503 }
5504
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5506 struct amdgpu_reset_context *reset_context)
5507 {
5508 int i, r = 0;
5509 struct amdgpu_job *job = NULL;
5510 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5511 bool need_full_reset =
5512 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5513
5514 if (reset_context->reset_req_dev == adev)
5515 job = reset_context->job;
5516
5517 if (amdgpu_sriov_vf(adev))
5518 amdgpu_virt_pre_reset(adev);
5519
5520 amdgpu_fence_driver_isr_toggle(adev, true);
5521
5522 /* block all schedulers and reset given job's ring */
5523 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5524 struct amdgpu_ring *ring = adev->rings[i];
5525
5526 if (!amdgpu_ring_sched_ready(ring))
5527 continue;
5528
/* Clear the job fences from the fence driver to avoid force_completion
 * on them; this leaves only the NULL and VM flush fences in the fence
 * driver.
 */
5532 amdgpu_fence_driver_clear_job_fences(ring);
5533
5534 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5535 amdgpu_fence_driver_force_completion(ring);
5536 }
5537
5538 amdgpu_fence_driver_isr_toggle(adev, false);
5539
5540 if (job && job->vm)
5541 drm_sched_increase_karma(&job->base);
5542
5543 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5544 /* If reset handler not implemented, continue; otherwise return */
5545 if (r == -EOPNOTSUPP)
5546 r = 0;
5547 else
5548 return r;
5549
5550 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5551 if (!amdgpu_sriov_vf(adev)) {
5552
5553 if (!need_full_reset)
5554 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5555
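/*
 * If a full reset is not already required, try an IP-level soft reset
 * first; fall back to a full reset if the soft reset fails or blocks are
 * still hung afterwards.
 */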
5556 if (!need_full_reset && amdgpu_gpu_recovery &&
5557 amdgpu_device_ip_check_soft_reset(adev)) {
5558 amdgpu_device_ip_pre_soft_reset(adev);
5559 r = amdgpu_device_ip_soft_reset(adev);
5560 amdgpu_device_ip_post_soft_reset(adev);
5561 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5562 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5563 need_full_reset = true;
5564 }
5565 }
5566
5567 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5568 dev_info(tmp_adev->dev, "Dumping IP State\n");
5569 /* Trigger ip dump before we reset the asic */
5570 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5571 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5572 tmp_adev->ip_blocks[i].version->funcs
5573 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5574 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5575 }
5576
5577 if (need_full_reset)
5578 r = amdgpu_device_ip_suspend(adev);
5579 if (need_full_reset)
5580 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5581 else
5582 clear_bit(AMDGPU_NEED_FULL_RESET,
5583 &reset_context->flags);
5584 }
5585
5586 return r;
5587 }
5588
int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5590 {
5591 struct list_head *device_list_handle;
5592 bool full_reset, vram_lost = false;
5593 struct amdgpu_device *tmp_adev;
5594 int r, init_level;
5595
5596 device_list_handle = reset_context->reset_device_list;
5597
5598 if (!device_list_handle)
5599 return -EINVAL;
5600
5601 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5602
/*
 * If this is a reset on init, use the default init level; otherwise keep
 * the level as the recovery level.
 */
5607 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5608 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5609 else
5610 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5611
5612 r = 0;
5613 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5614 amdgpu_set_init_level(tmp_adev, init_level);
5615 if (full_reset) {
5616 /* post card */
5617 amdgpu_ras_clear_err_state(tmp_adev);
5618 r = amdgpu_device_asic_init(tmp_adev);
5619 if (r) {
5620 dev_warn(tmp_adev->dev, "asic atom init failed!");
5621 } else {
5622 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5623
5624 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5625 if (r)
5626 goto out;
5627
5628 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5629
5630 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5631 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5632
5633 if (vram_lost) {
5634 DRM_INFO("VRAM is lost due to GPU reset!\n");
5635 amdgpu_inc_vram_lost(tmp_adev);
5636 }
5637
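/* Reload the firmware before bringing up the remaining IP blocks. */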
5638 r = amdgpu_device_fw_loading(tmp_adev);
5639 if (r)
5640 return r;
5641
5642 r = amdgpu_xcp_restore_partition_mode(
5643 tmp_adev->xcp_mgr);
5644 if (r)
5645 goto out;
5646
5647 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5648 if (r)
5649 goto out;
5650
5651 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5652 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5653
5654 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5655 if (r)
5656 goto out;
5657
5658 if (vram_lost)
5659 amdgpu_device_fill_reset_magic(tmp_adev);
5660
/*
 * Add this ASIC back as tracked since the reset
 * completed successfully.
 */
5665 amdgpu_register_gpu_instance(tmp_adev);
5666
5667 if (!reset_context->hive &&
5668 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5669 amdgpu_xgmi_add_device(tmp_adev);
5670
5671 r = amdgpu_device_ip_late_init(tmp_adev);
5672 if (r)
5673 goto out;
5674
5675 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5676
/*
 * The GPU enters a bad state once the number of faulty pages
 * detected by ECC reaches the threshold, and RAS recovery is
 * scheduled next. Add a check here to break recovery if the
 * bad page threshold has indeed been exceeded, and remind the
 * user to retire this GPU or to set a bigger bad_page_threshold
 * value to get past this when probing the driver again.
 */
5687 if (!amdgpu_ras_is_rma(tmp_adev)) {
5688 /* must succeed. */
5689 amdgpu_ras_resume(tmp_adev);
5690 } else {
5691 r = -EINVAL;
5692 goto out;
5693 }
5694
5695 /* Update PSP FW topology after reset */
5696 if (reset_context->hive &&
5697 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5698 r = amdgpu_xgmi_update_topology(
5699 reset_context->hive, tmp_adev);
5700 }
5701 }
5702
5703 out:
5704 if (!r) {
5705 /* IP init is complete now, set level as default */
5706 amdgpu_set_init_level(tmp_adev,
5707 AMDGPU_INIT_LEVEL_DEFAULT);
5708 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5709 r = amdgpu_ib_ring_tests(tmp_adev);
5710 if (r) {
5711 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5712 r = -EAGAIN;
5713 goto end;
5714 }
5715 }
5716
5717 if (r)
5718 tmp_adev->asic_reset_res = r;
5719 }
5720
5721 end:
5722 return r;
5723 }
5724
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5726 struct amdgpu_reset_context *reset_context)
5727 {
5728 struct amdgpu_device *tmp_adev = NULL;
5729 bool need_full_reset, skip_hw_reset;
5730 int r = 0;
5731
5732 /* Try reset handler method first */
5733 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5734 reset_list);
5735
5736 reset_context->reset_device_list = device_list_handle;
5737 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5738 /* If reset handler not implemented, continue; otherwise return */
5739 if (r == -EOPNOTSUPP)
5740 r = 0;
5741 else
5742 return r;
5743
5744 /* Reset handler not implemented, use the default method */
5745 need_full_reset =
5746 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5747 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5748
/*
 * ASIC reset has to be done on all XGMI hive nodes ASAP
 * to allow proper link negotiation in FW (within 1 sec)
 */
5753 if (!skip_hw_reset && need_full_reset) {
5754 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5755 /* For XGMI run all resets in parallel to speed up the process */
5756 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5757 if (!queue_work(system_unbound_wq,
5758 &tmp_adev->xgmi_reset_work))
5759 r = -EALREADY;
5760 } else
5761 r = amdgpu_asic_reset(tmp_adev);
5762
5763 if (r) {
5764 dev_err(tmp_adev->dev,
5765 "ASIC reset failed with error, %d for drm dev, %s",
5766 r, adev_to_drm(tmp_adev)->unique);
5767 goto out;
5768 }
5769 }
5770
5771 /* For XGMI wait for all resets to complete before proceed */
5772 if (!r) {
5773 list_for_each_entry(tmp_adev, device_list_handle,
5774 reset_list) {
5775 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5776 flush_work(&tmp_adev->xgmi_reset_work);
5777 r = tmp_adev->asic_reset_res;
5778 if (r)
5779 break;
5780 }
5781 }
5782 }
5783 }
5784
5785 if (!r && amdgpu_ras_intr_triggered()) {
5786 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5787 amdgpu_ras_reset_error_count(tmp_adev,
5788 AMDGPU_RAS_BLOCK__MMHUB);
5789 }
5790
5791 amdgpu_ras_intr_cleared();
5792 }
5793
5794 r = amdgpu_device_reinit_after_reset(reset_context);
5795 if (r == -EAGAIN)
5796 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5797 else
5798 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5799
5800 out:
5801 return r;
5802 }
5803
static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5805 {
5806
5807 switch (amdgpu_asic_reset_method(adev)) {
5808 case AMD_RESET_METHOD_MODE1:
5809 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5810 break;
5811 case AMD_RESET_METHOD_MODE2:
5812 adev->mp1_state = PP_MP1_STATE_RESET;
5813 break;
5814 default:
5815 adev->mp1_state = PP_MP1_STATE_NONE;
5816 break;
5817 }
5818 }
5819
static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5821 {
5822 amdgpu_vf_error_trans_all(adev);
5823 adev->mp1_state = PP_MP1_STATE_NONE;
5824 }
5825
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5827 {
5828 struct pci_dev *p = NULL;
5829
5830 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5831 adev->pdev->bus->number, 1);
5832 if (p) {
5833 pm_runtime_enable(&(p->dev));
5834 pm_runtime_resume(&(p->dev));
5835 }
5836
5837 pci_dev_put(p);
5838 }
5839
static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5841 {
5842 enum amd_reset_method reset_method;
5843 struct pci_dev *p = NULL;
5844 u64 expires;
5845
/*
 * For now, only BACO and mode1 reset are confirmed to
 * suffer from the audio issue if the audio device is not
 * properly suspended first.
 */
5850 reset_method = amdgpu_asic_reset_method(adev);
5851 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5852 (reset_method != AMD_RESET_METHOD_MODE1))
5853 return -EINVAL;
5854
5855 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5856 adev->pdev->bus->number, 1);
5857 if (!p)
5858 return -ENODEV;
5859
5860 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5861 if (!expires)
/*
 * If we cannot get the audio device autosuspend delay,
 * use a fixed 4s interval. The audio controller's default
 * autosuspend delay is 3s, so 4s is guaranteed to cover it.
 */
5868 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5869
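/*
 * Poll until the audio function reports runtime-suspended, giving up
 * once the autosuspend deadline passes.
 */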
5870 while (!pm_runtime_status_suspended(&(p->dev))) {
5871 if (!pm_runtime_suspend(&(p->dev)))
5872 break;
5873
5874 if (expires < ktime_get_mono_fast_ns()) {
5875 dev_warn(adev->dev, "failed to suspend display audio\n");
5876 pci_dev_put(p);
5877 /* TODO: abort the succeeding gpu reset? */
5878 return -ETIMEDOUT;
5879 }
5880 }
5881
5882 pm_runtime_disable(&(p->dev));
5883
5884 pci_dev_put(p);
5885 return 0;
5886 }
5887
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5889 {
5890 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5891
5892 #if defined(CONFIG_DEBUG_FS)
5893 if (!amdgpu_sriov_vf(adev))
5894 cancel_work(&adev->reset_work);
5895 #endif
5896
5897 if (adev->kfd.dev)
5898 cancel_work(&adev->kfd.reset_work);
5899
5900 if (amdgpu_sriov_vf(adev))
5901 cancel_work(&adev->virt.flr_work);
5902
5903 if (con && adev->ras_enabled)
5904 cancel_work(&con->recovery_work);
5905
5906 }
5907
static int amdgpu_device_health_check(struct list_head *device_list_handle)
5909 {
5910 struct amdgpu_device *tmp_adev;
5911 int ret = 0;
5912 u32 status;
5913
5914 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5915 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5916 if (PCI_POSSIBLE_ERROR(status)) {
5917 dev_err(tmp_adev->dev, "device lost from bus!");
5918 ret = -ENODEV;
5919 }
5920 }
5921
5922 return ret;
5923 }
5924
5925 /**
5926 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5927 *
5928 * @adev: amdgpu_device pointer
 * @job: the job which triggered the hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all ASICs).
 * Attempt a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
5935 */
5936
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5938 struct amdgpu_job *job,
5939 struct amdgpu_reset_context *reset_context)
5940 {
5941 struct list_head device_list, *device_list_handle = NULL;
5942 bool job_signaled = false;
5943 struct amdgpu_hive_info *hive = NULL;
5944 struct amdgpu_device *tmp_adev = NULL;
5945 int i, r = 0;
5946 bool need_emergency_restart = false;
5947 bool audio_suspended = false;
5948 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5949
5950 /*
5951 * If it reaches here because of hang/timeout and a RAS error is
5952 * detected at the same time, let RAS recovery take care of it.
5953 */
5954 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
5955 !amdgpu_sriov_vf(adev) &&
5956 reset_context->src != AMDGPU_RESET_SRC_RAS) {
5957 dev_dbg(adev->dev,
5958 "Gpu recovery from source: %d yielding to RAS error recovery handling",
5959 reset_context->src);
5960 return 0;
5961 }
5962 /*
5963 * Special case: RAS triggered and full reset isn't supported
5964 */
5965 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5966
/*
 * Flush RAM to disk so that after reboot
 * the user can read the log and see why the system rebooted.
 */
5971 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5972 amdgpu_ras_get_context(adev)->reboot) {
5973 DRM_WARN("Emergency reboot.");
5974
5975 ksys_sync_helper();
5976 emergency_restart();
5977 }
5978
5979 dev_info(adev->dev, "GPU %s begin!\n",
5980 need_emergency_restart ? "jobs stop":"reset");
5981
5982 if (!amdgpu_sriov_vf(adev))
5983 hive = amdgpu_get_xgmi_hive(adev);
5984 if (hive)
5985 mutex_lock(&hive->hive_lock);
5986
5987 reset_context->job = job;
5988 reset_context->hive = hive;
5989 /*
5990 * Build list of devices to reset.
5991 * In case we are in XGMI hive mode, resort the device list
5992 * to put adev in the 1st position.
5993 */
5994 INIT_LIST_HEAD(&device_list);
5995 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5996 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5997 list_add_tail(&tmp_adev->reset_list, &device_list);
5998 if (adev->shutdown)
5999 tmp_adev->shutdown = true;
6000 }
6001 if (!list_is_first(&adev->reset_list, &device_list))
6002 list_rotate_to_front(&adev->reset_list, &device_list);
6003 device_list_handle = &device_list;
6004 } else {
6005 list_add_tail(&adev->reset_list, &device_list);
6006 device_list_handle = &device_list;
6007 }
6008
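/*
 * Bail out early if any device in the reset list has dropped off the
 * bus; a reset cannot succeed in that case.
 */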
6009 if (!amdgpu_sriov_vf(adev)) {
6010 r = amdgpu_device_health_check(device_list_handle);
6011 if (r)
6012 goto end_reset;
6013 }
6014
6015 /* We need to lock reset domain only once both for XGMI and single device */
6016 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6017 reset_list);
6018 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6019
6020 /* block all schedulers and reset given job's ring */
6021 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6022
6023 amdgpu_device_set_mp1_state(tmp_adev);
6024
/*
 * Try to put the audio codec into suspend state
 * before the GPU reset starts.
 *
 * The power domain of the graphics device is shared
 * with the AZ (audio) power domain. Without this, we
 * may change the audio hardware from behind the audio
 * driver's back, which would trigger audio codec errors.
 */
6035 if (!amdgpu_device_suspend_display_audio(tmp_adev))
6036 audio_suspended = true;
6037
6038 amdgpu_ras_set_error_query_ready(tmp_adev, false);
6039
6040 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
6041
6042 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
6043
/*
 * Mark these ASICs as untracked first,
 * and add them back after the reset completes.
 */
6048 amdgpu_unregister_gpu_instance(tmp_adev);
6049
6050 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
6051
6052 /* disable ras on ALL IPs */
6053 if (!need_emergency_restart &&
6054 amdgpu_device_ip_need_full_reset(tmp_adev))
6055 amdgpu_ras_suspend(tmp_adev);
6056
6057 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6058 struct amdgpu_ring *ring = tmp_adev->rings[i];
6059
6060 if (!amdgpu_ring_sched_ready(ring))
6061 continue;
6062
6063 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
6064
6065 if (need_emergency_restart)
6066 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
6067 }
6068 atomic_inc(&tmp_adev->gpu_reset_counter);
6069 }
6070
6071 if (need_emergency_restart)
6072 goto skip_sched_resume;
6073
6074 /*
6075 * Must check guilty signal here since after this point all old
6076 * HW fences are force signaled.
6077 *
6078 * job->base holds a reference to parent fence
6079 */
6080 if (job && dma_fence_is_signaled(&job->hw_fence)) {
6081 job_signaled = true;
6082 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6083 goto skip_hw_reset;
6084 }
6085
6086 retry: /* Rest of adevs pre asic reset from XGMI hive. */
6087 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6088 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6089 /*TODO Should we stop ?*/
6090 if (r) {
6091 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6092 r, adev_to_drm(tmp_adev)->unique);
6093 tmp_adev->asic_reset_res = r;
6094 }
6095 }
6096
/* Actual ASIC resets if needed. */
6098 /* Host driver will handle XGMI hive reset for SRIOV */
6099 if (amdgpu_sriov_vf(adev)) {
6100 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6101 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6102 amdgpu_ras_set_fed(adev, true);
6103 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6104 }
6105
6106 r = amdgpu_device_reset_sriov(adev, reset_context);
6107 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6108 amdgpu_virt_release_full_gpu(adev, true);
6109 goto retry;
6110 }
6111 if (r)
6112 adev->asic_reset_res = r;
6113 } else {
6114 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
6115 if (r && r == -EAGAIN)
6116 goto retry;
6117 }
6118
6119 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/*
 * Drop any pending non-scheduler resets queued before the reset is done.
 * Any reset scheduled after this point is valid. Scheduler resets were
 * already dropped during drm_sched_stop and no new ones can come in
 * before drm_sched_start.
 */
6126 amdgpu_device_stop_pending_resets(tmp_adev);
6127 }
6128
6129 skip_hw_reset:
6130
/* Post ASIC reset for all devs. */
6132 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6133
6134 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6135 struct amdgpu_ring *ring = tmp_adev->rings[i];
6136
6137 if (!amdgpu_ring_sched_ready(ring))
6138 continue;
6139
6140 drm_sched_start(&ring->sched, 0);
6141 }
6142
6143 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6144 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6145
6146 if (tmp_adev->asic_reset_res)
6147 r = tmp_adev->asic_reset_res;
6148
6149 tmp_adev->asic_reset_res = 0;
6150
6151 if (r) {
/* Bad news: how do we tell userspace?
 * For a RAS error, we should report GPU bad status instead of
 * a reset failure.
 */
6156 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6157 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6158 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6159 atomic_read(&tmp_adev->gpu_reset_counter));
6160 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6161 } else {
6162 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6163 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6164 DRM_WARN("smart shift update failed\n");
6165 }
6166 }
6167
6168 skip_sched_resume:
6169 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6170 /* unlock kfd: SRIOV would do it separately */
6171 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6172 amdgpu_amdkfd_post_reset(tmp_adev);
6173
/* kfd_post_reset will do nothing if the kfd device is not initialized,
 * so bring up kfd here if it was not initialized before
 */
6177 if (!adev->kfd.init_complete)
6178 amdgpu_amdkfd_device_init(adev);
6179
6180 if (audio_suspended)
6181 amdgpu_device_resume_display_audio(tmp_adev);
6182
6183 amdgpu_device_unset_mp1_state(tmp_adev);
6184
6185 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6186 }
6187
6188 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6189 reset_list);
6190 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6191
6192 end_reset:
6193 if (hive) {
6194 mutex_unlock(&hive->hive_lock);
6195 amdgpu_put_xgmi_hive(hive);
6196 }
6197
6198 if (r)
6199 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6200
6201 atomic_set(&adev->reset_domain->reset_res, r);
6202
6203 if (!r)
6204 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
6205
6206 return r;
6207 }
6208
6209 /**
6210 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6211 *
6212 * @adev: amdgpu_device pointer
6213 * @speed: pointer to the speed of the link
6214 * @width: pointer to the width of the link
6215 *
6216 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6217 * first physical partner to an AMD dGPU.
6218 * This will exclude any virtual switches and links.
6219 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6221 enum pci_bus_speed *speed,
6222 enum pcie_link_width *width)
6223 {
6224 struct pci_dev *parent = adev->pdev;
6225
6226 if (!speed || !width)
6227 return;
6228
6229 *speed = PCI_SPEED_UNKNOWN;
6230 *width = PCIE_LNK_WIDTH_UNKNOWN;
6231
6232 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6233 while ((parent = pci_upstream_bridge(parent))) {
/* skip upstream/downstream switches internal to dGPU */
6235 if (parent->vendor == PCI_VENDOR_ID_ATI)
6236 continue;
6237 *speed = pcie_get_speed_cap(parent);
6238 *width = pcie_get_width_cap(parent);
6239 break;
6240 }
6241 } else {
6242 /* use the current speeds rather than max if switching is not supported */
6243 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6244 }
6245 }
6246
6247 /**
6248 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6249 *
6250 * @adev: amdgpu_device pointer
6251 * @speed: pointer to the speed of the link
6252 * @width: pointer to the width of the link
6253 *
6254 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6255 * AMD dGPU which may be a virtual upstream bridge.
6256 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6258 enum pci_bus_speed *speed,
6259 enum pcie_link_width *width)
6260 {
6261 struct pci_dev *parent = adev->pdev;
6262
6263 if (!speed || !width)
6264 return;
6265
6266 parent = pci_upstream_bridge(parent);
6267 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6268 /* use the upstream/downstream switches internal to dGPU */
6269 *speed = pcie_get_speed_cap(parent);
6270 *width = pcie_get_width_cap(parent);
6271 while ((parent = pci_upstream_bridge(parent))) {
6272 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6273 /* use the upstream/downstream switches internal to dGPU */
6274 *speed = pcie_get_speed_cap(parent);
6275 *width = pcie_get_width_cap(parent);
6276 }
6277 }
6278 } else {
6279 /* use the device itself */
6280 *speed = pcie_get_speed_cap(adev->pdev);
6281 *width = pcie_get_width_cap(adev->pdev);
6282 }
6283 }
6284
6285 /**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
6287 *
6288 * @adev: amdgpu_device pointer
6289 *
6290 * Fetches and stores in the driver the PCIE capabilities (gen speed
6291 * and lanes) of the slot the device is in. Handles APUs and
6292 * virtualized environments where PCIE config space may not be available.
6293 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6295 {
6296 enum pci_bus_speed speed_cap, platform_speed_cap;
6297 enum pcie_link_width platform_link_width, link_width;
6298
6299 if (amdgpu_pcie_gen_cap)
6300 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6301
6302 if (amdgpu_pcie_lane_cap)
6303 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6304
6305 /* covers APUs as well */
6306 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6307 if (adev->pm.pcie_gen_mask == 0)
6308 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6309 if (adev->pm.pcie_mlw_mask == 0)
6310 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6311 return;
6312 }
6313
6314 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6315 return;
6316
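/*
 * Query both the platform (first non-AMD upstream partner) and the
 * GPU-side link capabilities, then translate them into the CAIL
 * speed/width masks consumed by the power-management code.
 */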
6317 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6318 &platform_link_width);
6319 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6320
6321 if (adev->pm.pcie_gen_mask == 0) {
6322 /* asic caps */
6323 if (speed_cap == PCI_SPEED_UNKNOWN) {
6324 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6325 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6326 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6327 } else {
6328 if (speed_cap == PCIE_SPEED_32_0GT)
6329 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6330 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6331 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6332 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6333 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6334 else if (speed_cap == PCIE_SPEED_16_0GT)
6335 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6336 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6337 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6338 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6339 else if (speed_cap == PCIE_SPEED_8_0GT)
6340 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6341 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6342 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6343 else if (speed_cap == PCIE_SPEED_5_0GT)
6344 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6345 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6346 else
6347 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6348 }
6349 /* platform caps */
6350 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6351 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6352 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6353 } else {
6354 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6355 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6356 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6357 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6358 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6359 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6360 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6361 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6362 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6363 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6364 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6365 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6366 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6367 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6368 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6369 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6370 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6371 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6372 else
6373 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6374
6375 }
6376 }
6377 if (adev->pm.pcie_mlw_mask == 0) {
6378 /* asic caps */
6379 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6380 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6381 } else {
6382 switch (link_width) {
6383 case PCIE_LNK_X32:
6384 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6385 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6386 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6387 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6388 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6389 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6390 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6391 break;
6392 case PCIE_LNK_X16:
6393 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6394 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6395 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6396 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6397 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6398 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6399 break;
6400 case PCIE_LNK_X12:
6401 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6402 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6403 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6404 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6405 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6406 break;
6407 case PCIE_LNK_X8:
6408 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6409 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6410 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6411 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6412 break;
6413 case PCIE_LNK_X4:
6414 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6415 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6416 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6417 break;
6418 case PCIE_LNK_X2:
6419 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6420 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6421 break;
6422 case PCIE_LNK_X1:
6423 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6424 break;
6425 default:
6426 break;
6427 }
6428 }
6429 /* platform caps */
6430 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6431 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6432 } else {
6433 switch (platform_link_width) {
6434 case PCIE_LNK_X32:
6435 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6436 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6438 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6439 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6440 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6441 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6442 break;
6443 case PCIE_LNK_X16:
6444 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6447 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6448 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6449 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6450 break;
6451 case PCIE_LNK_X12:
6452 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6453 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6454 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6456 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6457 break;
6458 case PCIE_LNK_X8:
6459 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6460 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6461 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6462 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6463 break;
6464 case PCIE_LNK_X4:
6465 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6466 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6467 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6468 break;
6469 case PCIE_LNK_X2:
6470 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6471 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6472 break;
6473 case PCIE_LNK_X1:
6474 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6475 break;
6476 default:
6477 break;
6478 }
6479 }
6480 }
6481 }
6482
6483 /**
6484 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6485 *
6486 * @adev: amdgpu_device pointer
6487 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6488 *
6489 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6490 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6491 * @peer_adev.
6492 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6494 struct amdgpu_device *peer_adev)
6495 {
6496 #ifdef CONFIG_HSA_AMD_P2P
6497 bool p2p_access =
6498 !adev->gmc.xgmi.connected_to_cpu &&
6499 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6500 if (!p2p_access)
6501 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6502 pci_name(peer_adev->pdev));
6503
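/*
 * Besides a P2P-capable path, peer access requires the whole VRAM to be
 * CPU-visible (large BAR) and the aperture to fit within the peer's DMA
 * mask (or be reachable through IOMMU remapping).
 */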
6504 bool is_large_bar = adev->gmc.visible_vram_size &&
6505 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6506 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6507
6508 if (!p2p_addressable) {
6509 uint64_t address_mask = peer_adev->dev->dma_mask ?
6510 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6511 resource_size_t aper_limit =
6512 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6513
6514 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6515 aper_limit & address_mask);
6516 }
6517 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6518 #else
6519 return false;
6520 #endif
6521 }
6522
int amdgpu_device_baco_enter(struct drm_device *dev)
6524 {
6525 struct amdgpu_device *adev = drm_to_adev(dev);
6526 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6527
6528 if (!amdgpu_device_supports_baco(dev))
6529 return -ENOTSUPP;
6530
6531 if (ras && adev->ras_enabled &&
6532 adev->nbio.funcs->enable_doorbell_interrupt)
6533 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6534
6535 return amdgpu_dpm_baco_enter(adev);
6536 }
6537
int amdgpu_device_baco_exit(struct drm_device *dev)
6539 {
6540 struct amdgpu_device *adev = drm_to_adev(dev);
6541 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6542 int ret = 0;
6543
6544 if (!amdgpu_device_supports_baco(dev))
6545 return -ENOTSUPP;
6546
6547 ret = amdgpu_dpm_baco_exit(adev);
6548 if (ret)
6549 return ret;
6550
6551 if (ras && adev->ras_enabled &&
6552 adev->nbio.funcs->enable_doorbell_interrupt)
6553 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6554
6555 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6556 adev->nbio.funcs->clear_doorbell_interrupt)
6557 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6558
6559 return 0;
6560 }
6561
6562 /**
6563 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6564 * @pdev: PCI device struct
6565 * @state: PCI channel state
6566 *
6567 * Description: Called when a PCI error is detected.
6568 *
6569 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6570 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6572 {
6573 struct drm_device *dev = pci_get_drvdata(pdev);
6574 struct amdgpu_device *adev = drm_to_adev(dev);
6575 int i;
6576
6577 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6578
6579 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6580 DRM_WARN("No support for XGMI hive yet...");
6581 return PCI_ERS_RESULT_DISCONNECT;
6582 }
6583
6584 adev->pci_channel_state = state;
6585
6586 switch (state) {
6587 case pci_channel_io_normal:
6588 return PCI_ERS_RESULT_CAN_RECOVER;
6589 /* Fatal error, prepare for slot reset */
6590 case pci_channel_io_frozen:
6591 /*
6592 * Locking adev->reset_domain->sem will prevent any external access
6593 * to GPU during PCI error recovery
6594 */
6595 amdgpu_device_lock_reset_domain(adev->reset_domain);
6596 amdgpu_device_set_mp1_state(adev);
6597
6598 /*
6599 * Block any work scheduling as we do for regular GPU reset
6600 * for the duration of the recovery
6601 */
6602 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6603 struct amdgpu_ring *ring = adev->rings[i];
6604
6605 if (!amdgpu_ring_sched_ready(ring))
6606 continue;
6607
6608 drm_sched_stop(&ring->sched, NULL);
6609 }
6610 atomic_inc(&adev->gpu_reset_counter);
6611 return PCI_ERS_RESULT_NEED_RESET;
6612 case pci_channel_io_perm_failure:
6613 /* Permanent error, prepare for device removal */
6614 return PCI_ERS_RESULT_DISCONNECT;
6615 }
6616
6617 return PCI_ERS_RESULT_NEED_RESET;
6618 }
6619
6620 /**
6621 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6622 * @pdev: pointer to PCI device
6623 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6625 {
6626
6627 DRM_INFO("PCI error: mmio enabled callback!!\n");
6628
6629 /* TODO - dump whatever for debugging purposes */
6630
/* This is called only if amdgpu_pci_error_detected returns
 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
 * works, so there is no need to reset the slot.
 */
6635
6636 return PCI_ERS_RESULT_RECOVERED;
6637 }
6638
6639 /**
6640 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6641 * @pdev: PCI device struct
6642 *
6643 * Description: This routine is called by the pci error recovery
6644 * code after the PCI slot has been reset, just before we
6645 * should resume normal operations.
6646 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6648 {
6649 struct drm_device *dev = pci_get_drvdata(pdev);
6650 struct amdgpu_device *adev = drm_to_adev(dev);
6651 int r, i;
6652 struct amdgpu_reset_context reset_context;
6653 u32 memsize;
6654 struct list_head device_list;
6655
/* PCI error slot reset should be skipped during RAS recovery */
6657 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6658 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6659 amdgpu_ras_in_recovery(adev))
6660 return PCI_ERS_RESULT_RECOVERED;
6661
6662 DRM_INFO("PCI error: slot reset callback!!\n");
6663
6664 memset(&reset_context, 0, sizeof(reset_context));
6665
6666 INIT_LIST_HEAD(&device_list);
6667 list_add_tail(&adev->reset_list, &device_list);
6668
6669 /* wait for asic to come out of reset */
6670 msleep(500);
6671
/* Restore PCI config space */
6673 amdgpu_device_load_pci_state(pdev);
6674
6675 /* confirm ASIC came out of reset */
6676 for (i = 0; i < adev->usec_timeout; i++) {
6677 memsize = amdgpu_asic_get_config_memsize(adev);
6678
6679 if (memsize != 0xffffffff)
6680 break;
6681 udelay(1);
6682 }
6683 if (memsize == 0xffffffff) {
6684 r = -ETIME;
6685 goto out;
6686 }
6687
6688 reset_context.method = AMD_RESET_METHOD_NONE;
6689 reset_context.reset_req_dev = adev;
6690 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6691 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6692
6693 adev->no_hw_access = true;
6694 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6695 adev->no_hw_access = false;
6696 if (r)
6697 goto out;
6698
6699 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6700
6701 out:
6702 if (!r) {
6703 if (amdgpu_device_cache_pci_state(adev->pdev))
6704 pci_restore_state(adev->pdev);
6705
6706 DRM_INFO("PCIe error recovery succeeded\n");
6707 } else {
6708 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6709 amdgpu_device_unset_mp1_state(adev);
6710 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6711 }
6712
6713 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6714 }
6715
6716 /**
6717 * amdgpu_pci_resume() - resume normal ops after PCI reset
6718 * @pdev: pointer to PCI device
6719 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
6722 */
void amdgpu_pci_resume(struct pci_dev *pdev)
6724 {
6725 struct drm_device *dev = pci_get_drvdata(pdev);
6726 struct amdgpu_device *adev = drm_to_adev(dev);
6727 int i;
6728
6729
6730 DRM_INFO("PCI error: resume callback!!\n");
6731
6732 /* Only continue execution for the case of pci_channel_io_frozen */
6733 if (adev->pci_channel_state != pci_channel_io_frozen)
6734 return;
6735
6736 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6737 struct amdgpu_ring *ring = adev->rings[i];
6738
6739 if (!amdgpu_ring_sched_ready(ring))
6740 continue;
6741
6742 drm_sched_start(&ring->sched, 0);
6743 }
6744
6745 amdgpu_device_unset_mp1_state(adev);
6746 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6747 }
6748
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6750 {
6751 struct drm_device *dev = pci_get_drvdata(pdev);
6752 struct amdgpu_device *adev = drm_to_adev(dev);
6753 int r;
6754
6755 if (amdgpu_sriov_vf(adev))
6756 return false;
6757
6758 r = pci_save_state(pdev);
6759 if (!r) {
6760 kfree(adev->pci_state);
6761
6762 adev->pci_state = pci_store_saved_state(pdev);
6763
6764 if (!adev->pci_state) {
6765 DRM_ERROR("Failed to store PCI saved state");
6766 return false;
6767 }
6768 } else {
6769 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6770 return false;
6771 }
6772
6773 return true;
6774 }
6775
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6777 {
6778 struct drm_device *dev = pci_get_drvdata(pdev);
6779 struct amdgpu_device *adev = drm_to_adev(dev);
6780 int r;
6781
6782 if (!adev->pci_state)
6783 return false;
6784
6785 r = pci_load_saved_state(pdev, adev->pci_state);
6786
6787 if (!r) {
6788 pci_restore_state(pdev);
6789 } else {
6790 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6791 return false;
6792 }
6793
6794 return true;
6795 }
6796
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6798 struct amdgpu_ring *ring)
6799 {
6800 #ifdef CONFIG_X86_64
6801 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6802 return;
6803 #endif
6804 if (adev->gmc.xgmi.connected_to_cpu)
6805 return;
6806
6807 if (ring && ring->funcs->emit_hdp_flush)
6808 amdgpu_ring_emit_hdp_flush(ring);
6809 else
6810 amdgpu_asic_flush_hdp(adev, ring);
6811 }
6812
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6814 struct amdgpu_ring *ring)
6815 {
6816 #ifdef CONFIG_X86_64
6817 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6818 return;
6819 #endif
6820 if (adev->gmc.xgmi.connected_to_cpu)
6821 return;
6822
6823 amdgpu_asic_invalidate_hdp(adev, ring);
6824 }
6825
int amdgpu_in_reset(struct amdgpu_device *adev)
6827 {
6828 return atomic_read(&adev->reset_domain->in_gpu_reset);
6829 }
6830
6831 /**
6832 * amdgpu_device_halt() - bring hardware to some kind of halt state
6833 *
6834 * @adev: amdgpu_device pointer
6835 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. It should then be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 * etc), clears all CPU mappings to the device, and disallows remappings
 * through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in-flight DMA operations
6850 */
void amdgpu_device_halt(struct amdgpu_device *adev)
6852 {
6853 struct pci_dev *pdev = adev->pdev;
6854 struct drm_device *ddev = adev_to_drm(adev);
6855
6856 amdgpu_xcp_dev_unplug(adev);
6857 drm_dev_unplug(ddev);
6858
6859 amdgpu_irq_disable_all(adev);
6860
6861 amdgpu_fence_driver_hw_fini(adev);
6862
6863 adev->no_hw_access = true;
6864
6865 amdgpu_device_unmap_mmio(adev);
6866
6867 pci_disable_device(pdev);
6868 pci_wait_for_pending_transaction(pdev);
6869 }
6870
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6872 u32 reg)
6873 {
6874 unsigned long flags, address, data;
6875 u32 r;
6876
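/*
 * PCIe port registers are accessed indirectly through an index/data
 * register pair; the dummy read after writing the index posts the write
 * before the data register is accessed.
 */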
6877 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6878 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6879
6880 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6881 WREG32(address, reg * 4);
6882 (void)RREG32(address);
6883 r = RREG32(data);
6884 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6885 return r;
6886 }
6887
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6889 u32 reg, u32 v)
6890 {
6891 unsigned long flags, address, data;
6892
6893 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6894 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6895
6896 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6897 WREG32(address, reg * 4);
6898 (void)RREG32(address);
6899 WREG32(data, v);
6900 (void)RREG32(data);
6901 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6902 }
6903
6904 /**
6905 * amdgpu_device_get_gang - return a reference to the current gang
6906 * @adev: amdgpu_device pointer
6907 *
6908 * Returns: A new reference to the current gang leader.
6909 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6911 {
6912 struct dma_fence *fence;
6913
6914 rcu_read_lock();
6915 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6916 rcu_read_unlock();
6917 return fence;
6918 }
6919
6920 /**
6921 * amdgpu_device_switch_gang - switch to a new gang
6922 * @adev: amdgpu_device pointer
6923 * @gang: the gang to switch to
6924 *
6925 * Try to switch to a new gang.
6926 * Returns: NULL if we switched to the new gang or a reference to the current
6927 * gang leader.
6928 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6930 struct dma_fence *gang)
6931 {
6932 struct dma_fence *old = NULL;
6933
6934 dma_fence_get(gang);
6935 do {
6936 dma_fence_put(old);
6937 old = amdgpu_device_get_gang(adev);
6938 if (old == gang)
6939 break;
6940
6941 if (!dma_fence_is_signaled(old)) {
6942 dma_fence_put(gang);
6943 return old;
6944 }
6945
6946 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6947 old, gang) != old);
6948
6949 /*
6950 * Drop it once for the exchanged reference in adev and once for the
6951 * thread local reference acquired in amdgpu_device_get_gang().
6952 */
6953 dma_fence_put(old);
6954 dma_fence_put(old);
6955 return NULL;
6956 }
6957
6958 /**
6959 * amdgpu_device_enforce_isolation - enforce HW isolation
6960 * @adev: the amdgpu device pointer
6961 * @ring: the HW ring the job is supposed to run on
6962 * @job: the job which is about to be pushed to the HW ring
6963 *
6964 * Makes sure that only one client at a time can use the GFX block.
6965 * Returns: The dependency to wait on before the job can be pushed to the HW.
6966 * The function is called multiple times until NULL is returned.
6967 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
6969 struct amdgpu_ring *ring,
6970 struct amdgpu_job *job)
6971 {
6972 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
6973 struct drm_sched_fence *f = job->base.s_fence;
6974 struct dma_fence *dep;
6975 void *owner;
6976 int r;
6977
6978 /*
6979 * For now enforce isolation only for the GFX block since we only need
6980 * the cleaner shader on those rings.
6981 */
6982 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
6983 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
6984 return NULL;
6985
/*
 * All submissions where enforce isolation is false are handled as if
 * they come from a single client. Use ~0l as the owner to distinguish
 * it from kernel submissions where the owner is NULL.
 */
6991 owner = job->enforce_isolation ? f->owner : (void *)~0l;
6992
6993 mutex_lock(&adev->enforce_isolation_mutex);
6994
6995 /*
6996 * The "spearhead" submission is the first one which changes the
6997 * ownership to its client. We always need to wait for it to be
6998 * pushed to the HW before proceeding with anything.
6999 */
7000 if (&f->scheduled != isolation->spearhead &&
7001 !dma_fence_is_signaled(isolation->spearhead)) {
7002 dep = isolation->spearhead;
7003 goto out_grab_ref;
7004 }
7005
7006 if (isolation->owner != owner) {
7007
7008 /*
7009 * Wait for any gang to be assembled before switching to a
7010 * different owner or otherwise we could deadlock the
7011 * submissions.
7012 */
7013 if (!job->gang_submit) {
7014 dep = amdgpu_device_get_gang(adev);
7015 if (!dma_fence_is_signaled(dep))
7016 goto out_return_dep;
7017 dma_fence_put(dep);
7018 }
7019
7020 dma_fence_put(isolation->spearhead);
7021 isolation->spearhead = dma_fence_get(&f->scheduled);
7022 amdgpu_sync_move(&isolation->active, &isolation->prev);
7023 trace_amdgpu_isolation(isolation->owner, owner);
7024 isolation->owner = owner;
7025 }
7026
7027 /*
7028 * Specifying the ring here helps to pipeline submissions even when
7029 * isolation is enabled. If that is not desired for testing NULL can be
7030 * used instead of the ring to enforce a CPU round trip while switching
7031 * between clients.
7032 */
7033 dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7034 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7035 if (r)
7036 DRM_WARN("OOM tracking isolation\n");
7037
7038 out_grab_ref:
7039 dma_fence_get(dep);
7040 out_return_dep:
7041 mutex_unlock(&adev->enforce_isolation_mutex);
7042 return dep;
7043 }
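/*
 * Illustrative sketch (not part of the driver): as noted in the kerneldoc,
 * the function is meant to be called repeatedly from the job preparation
 * path, returning each dependency to the scheduler until NULL comes back.
 */
#if 0
static struct dma_fence *example_prepare_job(struct amdgpu_ring *ring,
					     struct amdgpu_job *job)
{
	/* the returned reference is dropped by whoever waits on it */
	return amdgpu_device_enforce_isolation(ring->adev, ring, job);
}
#endif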
7044
7045 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7046 {
7047 switch (adev->asic_type) {
7048 #ifdef CONFIG_DRM_AMDGPU_SI
7049 case CHIP_HAINAN:
7050 #endif
7051 case CHIP_TOPAZ:
7052 /* chips with no display hardware */
7053 return false;
7054 #ifdef CONFIG_DRM_AMDGPU_SI
7055 case CHIP_TAHITI:
7056 case CHIP_PITCAIRN:
7057 case CHIP_VERDE:
7058 case CHIP_OLAND:
7059 #endif
7060 #ifdef CONFIG_DRM_AMDGPU_CIK
7061 case CHIP_BONAIRE:
7062 case CHIP_HAWAII:
7063 case CHIP_KAVERI:
7064 case CHIP_KABINI:
7065 case CHIP_MULLINS:
7066 #endif
7067 case CHIP_TONGA:
7068 case CHIP_FIJI:
7069 case CHIP_POLARIS10:
7070 case CHIP_POLARIS11:
7071 case CHIP_POLARIS12:
7072 case CHIP_VEGAM:
7073 case CHIP_CARRIZO:
7074 case CHIP_STONEY:
7075 /* chips with display hardware */
7076 return true;
7077 default:
7078 /* IP discovery */
7079 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7080 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7081 return false;
7082 return true;
7083 }
7084 }
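/*
 * Illustrative sketch (not part of the driver): a probe path could use the
 * predicate above to skip display setup on headless or DMU-harvested parts.
 */
#if 0
static void example_init_display(struct amdgpu_device *adev)
{
	if (!amdgpu_device_has_display_hardware(adev))
		return;	/* nothing to bring up on this SKU */

	/* ... display IP initialization would go here ... */
}
#endif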
7085
7086 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7087 uint32_t inst, uint32_t reg_addr, char reg_name[],
7088 uint32_t expected_value, uint32_t mask)
7089 {
7090 uint32_t ret = 0;
7091 uint32_t old_ = 0;
7092 uint32_t tmp_ = RREG32(reg_addr);
7093 uint32_t loop = adev->usec_timeout;
7094
7095 while ((tmp_ & (mask)) != (expected_value)) {
7096 if (old_ != tmp_) {
7097 loop = adev->usec_timeout;
7098 old_ = tmp_;
7099 } else
7100 udelay(1);
7101 tmp_ = RREG32(reg_addr);
7102 loop--;
7103 if (!loop) {
7104 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
7105 inst, reg_name, (uint32_t)expected_value,
7106 (uint32_t)(tmp_ & (mask)));
7107 ret = -ETIMEDOUT;
7108 break;
7109 }
7110 }
7111 return ret;
7112 }
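/*
 * Illustrative sketch (not part of the driver): callers poll a register until
 * (value & mask) == expected_value or the adaptive timeout expires.  The
 * register offset and bit below are hypothetical.
 */
#if 0
static int example_wait_block_idle(struct amdgpu_device *adev)
{
	const uint32_t EXAMPLE_STATUS_REG = 0x1234;	/* hypothetical */
	const uint32_t EXAMPLE_IDLE_BIT = BIT(0);	/* hypothetical */

	/* 0 on success, -ETIMEDOUT (in an unsigned return) on timeout */
	return amdgpu_device_wait_on_rreg(adev, 0, EXAMPLE_STATUS_REG,
					  "EXAMPLE_STATUS", EXAMPLE_IDLE_BIT,
					  EXAMPLE_IDLE_BIT);
}
#endif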
7113
7114 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7115 {
7116 ssize_t size = 0;
7117
7118 if (!ring || !ring->adev)
7119 return size;
7120
7121 if (amdgpu_device_should_recover_gpu(ring->adev))
7122 size |= AMDGPU_RESET_TYPE_FULL;
7123
7124 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7125 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7126 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7127
7128 return size;
7129 }
7130
7131 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7132 {
7133 ssize_t size = 0;
7134
7135 if (supported_reset == 0) {
7136 size += sysfs_emit_at(buf, size, "unsupported");
7137 size += sysfs_emit_at(buf, size, "\n");
7138 return size;
7140 }
7141
7142 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7143 size += sysfs_emit_at(buf, size, "soft ");
7144
7145 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7146 size += sysfs_emit_at(buf, size, "queue ");
7147
7148 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7149 size += sysfs_emit_at(buf, size, "pipe ");
7150
7151 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7152 size += sysfs_emit_at(buf, size, "full ");
7153
7154 size += sysfs_emit_at(buf, size, "\n");
7155 return size;
7156 }
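/*
 * Illustrative sketch (not part of the driver): a sysfs "show" style helper
 * can combine the two functions above -- compute which reset types a ring
 * supports, then emit them in human readable form.
 */
#if 0
static ssize_t example_reset_mask_show(struct amdgpu_ring *ring, char *buf)
{
	return amdgpu_show_reset_mask(buf,
				      amdgpu_get_soft_full_reset_mask(ring));
}
#endif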
7157