Lines Matching +full:write +full:- +full:0 +full:- +full:bps
102 if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT || in get_ras_block_str()
103 ras_block->block >= ARRAY_SIZE(ras_block_string)) in get_ras_block_str()
106 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) in get_ras_block_str()
107 return ras_mca_block_string[ras_block->sub_block_index]; in get_ras_block_str()
109 return ras_block_string[ras_block->block]; in get_ras_block_str()
120 #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
137 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
155 amdgpu_ras_get_context(adev)->error_query_ready = ready; in amdgpu_ras_set_error_query_ready()
161 return amdgpu_ras_get_context(adev)->error_query_ready; in amdgpu_ras_get_error_query_ready()
172 if ((address >= adev->gmc.mc_vram_size) || in amdgpu_reserve_page_direct()
174 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
175 "RAS WARN: input address 0x%llx is invalid.\n", in amdgpu_reserve_page_direct()
177 return -EINVAL; in amdgpu_reserve_page_direct()
181 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
182 "RAS WARN: 0x%llx has already been marked as bad page!\n", in amdgpu_reserve_page_direct()
184 return 0; in amdgpu_reserve_page_direct()
191 memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); in amdgpu_reserve_page_direct()
193 amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); in amdgpu_reserve_page_direct()
195 if (amdgpu_bad_page_threshold != 0) { in amdgpu_reserve_page_direct()
203 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
204 dev_warn(adev->dev, "Clear EEPROM:\n"); in amdgpu_reserve_page_direct()
205 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
207 return 0; in amdgpu_reserve_page_direct()
213 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; in amdgpu_ras_debugfs_read()
215 .head = obj->head, in amdgpu_ras_debugfs_read()
220 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_debugfs_read()
221 return -EINVAL; in amdgpu_ras_debugfs_read()
224 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && in amdgpu_ras_debugfs_read()
225 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { in amdgpu_ras_debugfs_read()
226 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_debugfs_read()
227 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_debugfs_read()
234 return 0; in amdgpu_ras_debugfs_read()
236 s -= *pos; in amdgpu_ras_debugfs_read()
241 return -EINVAL; in amdgpu_ras_debugfs_read()
251 .write = NULL,
259 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { in amdgpu_ras_find_block_id_by_name()
261 if (strcmp(name, ras_block_string[i]) == 0) in amdgpu_ras_find_block_id_by_name()
262 return 0; in amdgpu_ras_find_block_id_by_name()
264 return -EINVAL; in amdgpu_ras_find_block_id_by_name()
275 int op = -1; in amdgpu_ras_debugfs_ctrl_parse_data()
279 /* default value is 0 if the mask is not set by user */ in amdgpu_ras_debugfs_ctrl_parse_data()
280 u32 instance_mask = 0; in amdgpu_ras_debugfs_ctrl_parse_data()
283 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
286 memset(str, 0, sizeof(str)); in amdgpu_ras_debugfs_ctrl_parse_data()
287 memset(data, 0, sizeof(*data)); in amdgpu_ras_debugfs_ctrl_parse_data()
290 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
293 op = 0; in amdgpu_ras_debugfs_ctrl_parse_data()
300 else if (str[0] && str[1] && str[2] && str[3]) in amdgpu_ras_debugfs_ctrl_parse_data()
302 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
304 if (op != -1) { in amdgpu_ras_debugfs_ctrl_parse_data()
306 if (sscanf(str, "%*s 0x%llx", &address) != 1 && in amdgpu_ras_debugfs_ctrl_parse_data()
308 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
310 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
311 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
313 return 0; in amdgpu_ras_debugfs_ctrl_parse_data()
317 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
319 data->head.block = block_id; in amdgpu_ras_debugfs_ctrl_parse_data()
322 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
324 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
326 data->head.type = AMDGPU_RAS_ERROR__POISON; in amdgpu_ras_debugfs_ctrl_parse_data()
328 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
330 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
333 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x", in amdgpu_ras_debugfs_ctrl_parse_data()
337 sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", in amdgpu_ras_debugfs_ctrl_parse_data()
341 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
342 data->head.sub_block_index = sub_block; in amdgpu_ras_debugfs_ctrl_parse_data()
343 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
344 data->inject.value = value; in amdgpu_ras_debugfs_ctrl_parse_data()
345 data->inject.instance_mask = instance_mask; in amdgpu_ras_debugfs_ctrl_parse_data()
349 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
352 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
355 return 0; in amdgpu_ras_debugfs_ctrl_parse_data()
361 int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; in amdgpu_ras_instance_mask_check()
362 uint32_t mask, inst_mask = data->inject.instance_mask; in amdgpu_ras_instance_mask_check()
366 data->inject.instance_mask = 0; in amdgpu_ras_instance_mask_check()
367 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
368 "RAS inject mask(0x%x) isn't supported and force it to 0.\n", in amdgpu_ras_instance_mask_check()
374 switch (data->head.block) { in amdgpu_ras_instance_mask_check()
376 mask = GENMASK(num_xcc - 1, 0); in amdgpu_ras_instance_mask_check()
379 mask = GENMASK(adev->sdma.num_instances - 1, 0); in amdgpu_ras_instance_mask_check()
383 mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); in amdgpu_ras_instance_mask_check()
391 data->inject.instance_mask &= mask; in amdgpu_ras_instance_mask_check()
392 if (inst_mask != data->inject.instance_mask) in amdgpu_ras_instance_mask_check()
393 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
394 "Adjust RAS inject mask 0x%x to 0x%x\n", in amdgpu_ras_instance_mask_check()
395 inst_mask, data->inject.instance_mask); in amdgpu_ras_instance_mask_check()
414 * As their names indicate, inject operation will write the
420 * - 0: disable RAS on the block. Take ::head as its data.
421 * - 1: enable RAS on the block. Take ::head as its data.
422 * - 2: inject errors on the block. Take ::inject as its data.
429 * Write the struct to the control interface.
433 * .. code-block:: bash
437 …* echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/…
449 * ue is multi-uncorrectable
450 * ce is single-correctable
453 * The sub-block is a the sub-block index, pass 0 if there is no sub-block.
454 * The address and value are hexadecimal numbers, leading 0x is optional.
455 * The mask means instance mask, is optional, default value is 0x1.
459 * .. code-block:: bash
461 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
462 * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
463 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
468 * /sys/class/drm/card[0/1/2...]/device/ras/features
471 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
483 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_ctrl_write()
485 int ret = 0; in amdgpu_ras_debugfs_ctrl_write()
488 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
506 return -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
509 case 0: in amdgpu_ras_debugfs_ctrl_write()
510 ret = amdgpu_ras_feature_enable(adev, &data.head, 0); in amdgpu_ras_debugfs_ctrl_write()
516 if ((data.inject.address >= adev->gmc.mc_vram_size && in amdgpu_ras_debugfs_ctrl_write()
517 adev->gmc.mc_vram_size) || in amdgpu_ras_debugfs_ctrl_write()
519 dev_warn(adev->dev, "RAS WARN: input address " in amdgpu_ras_debugfs_ctrl_write()
520 "0x%llx is invalid.", in amdgpu_ras_debugfs_ctrl_write()
522 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
529 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " in amdgpu_ras_debugfs_ctrl_write()
541 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
560 * .. code-block:: bash
564 * will reset EEPROM table to 0 entries.
572 (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_eeprom_write()
576 &(amdgpu_ras_get_context(adev)->eeprom_control)); in amdgpu_ras_debugfs_eeprom_write()
581 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_debugfs_eeprom_write()
591 .write = amdgpu_ras_debugfs_ctrl_write,
598 .write = amdgpu_ras_debugfs_eeprom_write,
606 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
617 * .. code-block:: bash
619 * ue: 0
628 .head = obj->head, in amdgpu_ras_sysfs_read()
631 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_sysfs_read()
634 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_sysfs_read()
635 return -EINVAL; in amdgpu_ras_sysfs_read()
637 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && in amdgpu_ras_sysfs_read()
638 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { in amdgpu_ras_sysfs_read()
639 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_sysfs_read()
640 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_sysfs_read()
653 #define get_obj(obj) do { (obj)->use++; } while (0)
654 #define alive_obj(obj) ((obj)->use)
658 if (obj && (--obj->use == 0)) { in put_obj()
659 list_del(&obj->node); in put_obj()
660 amdgpu_ras_error_data_fini(&obj->err_data); in put_obj()
663 if (obj && (obj->use < 0)) in put_obj()
664 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); in put_obj()
674 if (!adev->ras_enabled || !con) in amdgpu_ras_create_obj()
677 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_create_obj()
680 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_create_obj()
681 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_create_obj()
684 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_create_obj()
686 obj = &con->objs[head->block]; in amdgpu_ras_create_obj()
692 if (amdgpu_ras_error_data_init(&obj->err_data)) in amdgpu_ras_create_obj()
695 obj->head = *head; in amdgpu_ras_create_obj()
696 obj->adev = adev; in amdgpu_ras_create_obj()
697 list_add(&obj->node, &con->head); in amdgpu_ras_create_obj()
711 if (!adev->ras_enabled || !con) in amdgpu_ras_find_obj()
715 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_find_obj()
718 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_find_obj()
719 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_find_obj()
722 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_find_obj()
724 obj = &con->objs[head->block]; in amdgpu_ras_find_obj()
729 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { in amdgpu_ras_find_obj()
730 obj = &con->objs[i]; in amdgpu_ras_find_obj()
744 return adev->ras_hw_enabled & BIT(head->block); in amdgpu_ras_is_feature_allowed()
752 return con->features & BIT(head->block); in amdgpu_ras_is_feature_enabled()
767 * Ras framework checks con->hw_supported to see if it need do in __amdgpu_ras_feature_enable()
769 * IP checks con->support to see if it need disable ras. in __amdgpu_ras_feature_enable()
772 return 0; in __amdgpu_ras_feature_enable()
778 return -EINVAL; in __amdgpu_ras_feature_enable()
783 con->features |= BIT(head->block); in __amdgpu_ras_feature_enable()
786 con->features &= ~BIT(head->block); in __amdgpu_ras_feature_enable()
791 return 0; in __amdgpu_ras_feature_enable()
803 return -EINVAL; in amdgpu_ras_feature_enable()
805 /* For non-gfx ip, do not enable ras feature if it is not allowed */ in amdgpu_ras_feature_enable()
808 if (head->block != AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
810 return 0; in amdgpu_ras_feature_enable()
813 if (head->block == AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
818 return -ENOMEM; in amdgpu_ras_feature_enable()
821 info->disable_features = (struct ta_ras_disable_features_input) { in amdgpu_ras_feature_enable()
822 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
823 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
826 info->enable_features = (struct ta_ras_enable_features_input) { in amdgpu_ras_feature_enable()
827 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
828 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
832 ret = psp_ras_enable_features(&adev->psp, info, enable); in amdgpu_ras_feature_enable()
834 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", in amdgpu_ras_feature_enable()
848 return 0; in amdgpu_ras_feature_enable()
859 return -EINVAL; in amdgpu_ras_feature_enable_on_boot()
861 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_feature_enable_on_boot()
867 * with error code -EAGAIN. in amdgpu_ras_feature_enable_on_boot()
874 if (ret == -EINVAL) { in amdgpu_ras_feature_enable_on_boot()
877 dev_info(adev->dev, in amdgpu_ras_feature_enable_on_boot()
887 /* gfx block ras disable cmd must send to ras-ta */ in amdgpu_ras_feature_enable_on_boot()
888 if (head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
889 con->features |= BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
891 ret = amdgpu_ras_feature_enable(adev, head, 0); in amdgpu_ras_feature_enable_on_boot()
894 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
895 con->features &= ~BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
909 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_disable_all_features()
914 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
917 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
922 return con->features; in amdgpu_ras_disable_all_features()
932 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { in amdgpu_ras_enable_all_features()
936 .sub_block_index = 0, in amdgpu_ras_enable_all_features()
955 for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { in amdgpu_ras_enable_all_features()
975 return con->features; in amdgpu_ras_enable_all_features()
983 return -EINVAL; in amdgpu_ras_block_match_default()
985 if (block_obj->ras_comm.block == block) in amdgpu_ras_block_match_default()
986 return 0; in amdgpu_ras_block_match_default()
988 return -EINVAL; in amdgpu_ras_block_match_default()
1000 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_get_ras_block()
1001 if (!node->ras_obj) { in amdgpu_ras_get_ras_block()
1002 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_get_ras_block()
1006 obj = node->ras_obj; in amdgpu_ras_get_ras_block()
1007 if (obj->ras_block_match) { in amdgpu_ras_get_ras_block()
1008 if (obj->ras_block_match(obj, block, sub_block_index) == 0) in amdgpu_ras_get_ras_block()
1011 if (amdgpu_ras_block_match_default(obj, block) == 0) in amdgpu_ras_get_ras_block()
1022 int ret = 0; in amdgpu_ras_get_ecc_info()
1028 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); in amdgpu_ras_get_ecc_info()
1029 if (ret == -EOPNOTSUPP) { in amdgpu_ras_get_ecc_info()
1030 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1031 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_ras_get_ecc_info()
1032 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1037 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1038 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) in amdgpu_ras_get_ecc_info()
1039 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1041 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1042 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_ras_get_ecc_info()
1043 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1045 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1046 adev->umc.ras->ecc_info_query_ras_error_address) in amdgpu_ras_get_ecc_info()
1047 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1062 u64 event_id = qctx->evid.event_id; in amdgpu_ras_error_print_error_data()
1066 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1067 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1068 if (err_info->ue_count) { in amdgpu_ras_error_print_error_data()
1071 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1072 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1073 err_info->ue_count, in amdgpu_ras_error_print_error_data()
1078 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1079 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1080 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1083 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name); in amdgpu_ras_error_print_error_data()
1089 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1090 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1091 if (err_info->de_count) { in amdgpu_ras_error_print_error_data()
1094 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1095 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1096 err_info->de_count, in amdgpu_ras_error_print_error_data()
1101 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1102 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1103 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1106 mcm_info->socket_id, mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1107 err_info->de_count, blk_name); in amdgpu_ras_error_print_error_data()
1111 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1112 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1113 if (err_info->ce_count) { in amdgpu_ras_error_print_error_data()
1116 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1117 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1118 err_info->ce_count, in amdgpu_ras_error_print_error_data()
1123 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1124 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1125 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1128 mcm_info->socket_id, mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1129 err_info->ce_count, blk_name); in amdgpu_ras_error_print_error_data()
1137 return !list_empty(&data->err_node_list); in err_data_has_source_info()
1145 struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); in amdgpu_ras_error_generate_report()
1146 const char *blk_name = get_ras_block_str(&query_if->head); in amdgpu_ras_error_generate_report()
1147 u64 event_id = qctx->evid.event_id; in amdgpu_ras_error_generate_report()
1149 if (err_data->ce_count) { in amdgpu_ras_error_generate_report()
1153 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1154 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1155 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1156 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1160 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1161 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1162 ras_mgr->err_data.ce_count, in amdgpu_ras_error_generate_report()
1167 ras_mgr->err_data.ce_count, in amdgpu_ras_error_generate_report()
1172 if (err_data->ue_count) { in amdgpu_ras_error_generate_report()
1176 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1177 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1178 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1179 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1183 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1184 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1185 ras_mgr->err_data.ue_count, in amdgpu_ras_error_generate_report()
1190 ras_mgr->err_data.ue_count, in amdgpu_ras_error_generate_report()
1195 if (err_data->de_count) { in amdgpu_ras_error_generate_report()
1199 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1200 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1201 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1202 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1206 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1207 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1208 ras_mgr->err_data.de_count, in amdgpu_ras_error_generate_report()
1213 ras_mgr->err_data.de_count, in amdgpu_ras_error_generate_report()
1225 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head); in amdgpu_ras_virt_error_generate_report()
1226 const char *blk_name = get_ras_block_str(&query_if->head); in amdgpu_ras_virt_error_generate_report()
1227 u64 event_id = qctx->evid.event_id; in amdgpu_ras_virt_error_generate_report()
1229 new_ce = err_data->ce_count - obj->err_data.ce_count; in amdgpu_ras_virt_error_generate_report()
1230 new_ue = err_data->ue_count - obj->err_data.ue_count; in amdgpu_ras_virt_error_generate_report()
1231 new_de = err_data->de_count - obj->err_data.de_count; in amdgpu_ras_virt_error_generate_report()
1262 err_info = &err_node->err_info; in amdgpu_rasmgr_error_data_statistic_update()
1263 amdgpu_ras_error_statistic_de_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1264 &err_info->mcm_info, err_info->de_count); in amdgpu_rasmgr_error_data_statistic_update()
1265 amdgpu_ras_error_statistic_ce_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1266 &err_info->mcm_info, err_info->ce_count); in amdgpu_rasmgr_error_data_statistic_update()
1267 amdgpu_ras_error_statistic_ue_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1268 &err_info->mcm_info, err_info->ue_count); in amdgpu_rasmgr_error_data_statistic_update()
1272 obj->err_data.ue_count += err_data->ue_count; in amdgpu_rasmgr_error_data_statistic_update()
1273 obj->err_data.ce_count += err_data->ce_count; in amdgpu_rasmgr_error_data_statistic_update()
1274 obj->err_data.de_count += err_data->de_count; in amdgpu_rasmgr_error_data_statistic_update()
1282 obj->err_data.ue_count = err_data->ue_count; in amdgpu_ras_mgr_virt_error_data_statistics_update()
1283 obj->err_data.ce_count = err_data->ce_count; in amdgpu_ras_mgr_virt_error_data_statistics_update()
1284 obj->err_data.de_count = err_data->de_count; in amdgpu_ras_mgr_virt_error_data_statistics_update()
1291 memset(&head, 0, sizeof(head)); in get_ras_manager()
1303 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) in amdgpu_ras_bind_aca()
1304 return 0; in amdgpu_ras_bind_aca()
1308 return -EINVAL; in amdgpu_ras_bind_aca()
1310 return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data); in amdgpu_ras_bind_aca()
1319 return -EINVAL; in amdgpu_ras_unbind_aca()
1321 amdgpu_aca_remove_handle(&obj->aca_handle); in amdgpu_ras_unbind_aca()
1323 return 0; in amdgpu_ras_unbind_aca()
1334 return -EINVAL; in amdgpu_aca_log_ras_error_data()
1336 return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx); in amdgpu_aca_log_ras_error_data()
1344 .head = obj->head, in amdgpu_ras_aca_sysfs_read()
1347 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_aca_sysfs_read()
1350 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_aca_sysfs_read()
1351 return -EINVAL; in amdgpu_ras_aca_sysfs_read()
1363 enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT; in amdgpu_ras_query_error_status_helper()
1368 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1371 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1376 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { in amdgpu_ras_query_error_status_helper()
1379 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); in amdgpu_ras_query_error_status_helper()
1380 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_query_error_status_helper()
1381 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_query_error_status_helper()
1382 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status_helper()
1383 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1386 if (block_obj->hw_ops->query_ras_error_count) in amdgpu_ras_query_error_status_helper()
1387 block_obj->hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_query_error_status_helper()
1389 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || in amdgpu_ras_query_error_status_helper()
1390 (info->head.block == AMDGPU_RAS_BLOCK__GFX) || in amdgpu_ras_query_error_status_helper()
1391 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { in amdgpu_ras_query_error_status_helper()
1392 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_query_error_status_helper()
1393 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_query_error_status_helper()
1416 return 0; in amdgpu_ras_query_error_status_helper()
1424 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_query_error_status_with_event()
1431 return -EINVAL; in amdgpu_ras_query_error_status_with_event()
1438 return -EINVAL; in amdgpu_ras_query_error_status_with_event()
1440 memset(&qctx, 0, sizeof(qctx)); in amdgpu_ras_query_error_status_with_event()
1444 if (!down_read_trylock(&adev->reset_domain->sem)) { in amdgpu_ras_query_error_status_with_event()
1445 ret = -EIO; in amdgpu_ras_query_error_status_with_event()
1453 up_read(&adev->reset_domain->sem); in amdgpu_ras_query_error_status_with_event()
1469 info->ue_count = obj->err_data.ue_count; in amdgpu_ras_query_error_status_with_event()
1470 info->ce_count = obj->err_data.ce_count; in amdgpu_ras_query_error_status_with_event()
1471 info->de_count = obj->err_data.de_count; in amdgpu_ras_query_error_status_with_event()
1487 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); in amdgpu_ras_reset_error_count()
1488 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_reset_error_count()
1489 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; in amdgpu_ras_reset_error_count()
1491 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_reset_error_count()
1492 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_reset_error_count()
1494 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1499 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1503 ((smu_funcs && smu_funcs->set_debug_mode) || in amdgpu_ras_reset_error_count()
1504 (mca_funcs && mca_funcs->mca_set_debug_mode))) in amdgpu_ras_reset_error_count()
1505 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1507 if (block_obj->hw_ops->reset_ras_error_count) in amdgpu_ras_reset_error_count()
1508 block_obj->hw_ops->reset_ras_error_count(adev); in amdgpu_ras_reset_error_count()
1510 return 0; in amdgpu_ras_reset_error_count()
1516 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); in amdgpu_ras_reset_error_status()
1518 if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) in amdgpu_ras_reset_error_status()
1519 return 0; in amdgpu_ras_reset_error_status()
1523 if (block_obj->hw_ops->reset_ras_error_status) in amdgpu_ras_reset_error_status()
1524 block_obj->hw_ops->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
1527 return 0; in amdgpu_ras_reset_error_status()
1534 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_error_inject()
1536 .block_id = amdgpu_ras_block_to_ta(info->head.block), in amdgpu_ras_error_inject()
1537 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), in amdgpu_ras_error_inject()
1538 .sub_block_index = info->head.sub_block_index, in amdgpu_ras_error_inject()
1539 .address = info->address, in amdgpu_ras_error_inject()
1540 .value = info->value, in amdgpu_ras_error_inject()
1542 int ret = -EINVAL; in amdgpu_ras_error_inject()
1544 info->head.block, in amdgpu_ras_error_inject()
1545 info->head.sub_block_index); in amdgpu_ras_error_inject()
1549 return 0; in amdgpu_ras_error_inject()
1552 return -EINVAL; in amdgpu_ras_error_inject()
1554 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_inject()
1555 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_inject()
1556 get_ras_block_str(&info->head)); in amdgpu_ras_error_inject()
1557 return -EINVAL; in amdgpu_ras_error_inject()
1561 if (adev->gmc.xgmi.num_physical_nodes > 1 && in amdgpu_ras_error_inject()
1562 info->head.block != AMDGPU_RAS_BLOCK__GFX) { in amdgpu_ras_error_inject()
1568 if (block_obj->hw_ops->ras_error_inject) { in amdgpu_ras_error_inject()
1569 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_error_inject()
1570 ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); in amdgpu_ras_error_inject()
1572 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, in amdgpu_ras_error_inject()
1573 info->instance_mask); in amdgpu_ras_error_inject()
1576 ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); in amdgpu_ras_error_inject()
1580 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1581 get_ras_block_str(&info->head), ret); in amdgpu_ras_error_inject()
1587 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
1593 * Return 0 for query success or do nothing, otherwise return an error
1605 return 0; in amdgpu_ras_query_error_count_helper()
1611 *ce_count += query_info->ce_count; in amdgpu_ras_query_error_count_helper()
1612 *ue_count += query_info->ue_count; in amdgpu_ras_query_error_count_helper()
1616 if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && in amdgpu_ras_query_error_count_helper()
1617 amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { in amdgpu_ras_query_error_count_helper()
1618 if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) in amdgpu_ras_query_error_count_helper()
1619 dev_warn(adev->dev, in amdgpu_ras_query_error_count_helper()
1623 return 0; in amdgpu_ras_query_error_count_helper()
1627 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1637 * error counts in those integer pointers. Return 0 if the device
1638 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
1650 if (!adev->ras_enabled || !con) in amdgpu_ras_query_error_count()
1651 return -EOPNOTSUPP; in amdgpu_ras_query_error_count()
1656 return 0; in amdgpu_ras_query_error_count()
1658 ce = 0; in amdgpu_ras_query_error_count()
1659 ue = 0; in amdgpu_ras_query_error_count()
1662 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_error_count()
1664 .head = obj->head, in amdgpu_ras_query_error_count()
1683 return 0; in amdgpu_ras_query_error_count()
1691 struct ras_badpage **bps, unsigned int *count);
1710 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
1729 * .. code-block:: bash
1731 * 0x00000001 : 0x00001000 : R
1732 * 0x00000002 : 0x00001000 : P
1742 struct amdgpu_device *adev = con->adev; in amdgpu_ras_sysfs_badpages_read()
1744 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; in amdgpu_ras_sysfs_badpages_read()
1745 unsigned int start = div64_ul(ppos + element_size - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
1746 unsigned int end = div64_ul(ppos + count - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
1747 ssize_t s = 0; in amdgpu_ras_sysfs_badpages_read()
1748 struct ras_badpage *bps = NULL; in amdgpu_ras_sysfs_badpages_read() local
1749 unsigned int bps_count = 0; in amdgpu_ras_sysfs_badpages_read()
1751 memset(buf, 0, count); in amdgpu_ras_sysfs_badpages_read()
1753 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) in amdgpu_ras_sysfs_badpages_read()
1754 return 0; in amdgpu_ras_sysfs_badpages_read()
1758 "0x%08x : 0x%08x : %1s\n", in amdgpu_ras_sysfs_badpages_read()
1759 bps[start].bp, in amdgpu_ras_sysfs_badpages_read()
1760 bps[start].size, in amdgpu_ras_sysfs_badpages_read()
1761 amdgpu_ras_badpage_flags_str(bps[start].flags)); in amdgpu_ras_sysfs_badpages_read()
1763 kfree(bps); in amdgpu_ras_sysfs_badpages_read()
1774 return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); in amdgpu_ras_sysfs_features_read()
1782 return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version); in amdgpu_ras_sysfs_version_show()
1790 return sysfs_emit(buf, "schema: 0x%x\n", con->schema); in amdgpu_ras_sysfs_schema_show()
1807 struct ras_event_manager *event_mgr = con->event_mgr; in amdgpu_ras_sysfs_event_state_show()
1809 int i, size = 0; in amdgpu_ras_sysfs_event_state_show()
1812 return -EINVAL; in amdgpu_ras_sysfs_event_state_show()
1814 size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); in amdgpu_ras_sysfs_event_state_show()
1815 for (i = 0; i < ARRAY_SIZE(dump_event); i++) { in amdgpu_ras_sysfs_event_state_show()
1816 event_state = &event_mgr->event_state[dump_event[i].type]; in amdgpu_ras_sysfs_event_state_show()
1819 atomic64_read(&event_state->count), in amdgpu_ras_sysfs_event_state_show()
1820 event_state->last_seqno); in amdgpu_ras_sysfs_event_state_show()
1830 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_bad_page_node()
1831 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove_bad_page_node()
1832 &con->badpages_attr.attr, in amdgpu_ras_sysfs_remove_bad_page_node()
1840 &con->features_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1841 &con->version_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1842 &con->schema_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1843 &con->event_state_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1851 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_dev_attr_node()
1852 sysfs_remove_group(&adev->dev->kobj, &group); in amdgpu_ras_sysfs_remove_dev_attr_node()
1854 return 0; in amdgpu_ras_sysfs_remove_dev_attr_node()
1863 return 0; in amdgpu_ras_sysfs_create()
1865 if (!obj || obj->attr_inuse) in amdgpu_ras_sysfs_create()
1866 return -EINVAL; in amdgpu_ras_sysfs_create()
1868 if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block)) in amdgpu_ras_sysfs_create()
1869 return 0; in amdgpu_ras_sysfs_create()
1873 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), in amdgpu_ras_sysfs_create()
1874 "%s_err_count", head->name); in amdgpu_ras_sysfs_create()
1876 obj->sysfs_attr = (struct device_attribute){ in amdgpu_ras_sysfs_create()
1878 .name = obj->fs_data.sysfs_name, in amdgpu_ras_sysfs_create()
1883 sysfs_attr_init(&obj->sysfs_attr.attr); in amdgpu_ras_sysfs_create()
1885 if (sysfs_add_file_to_group(&adev->dev->kobj, in amdgpu_ras_sysfs_create()
1886 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_create()
1889 return -EINVAL; in amdgpu_ras_sysfs_create()
1892 obj->attr_inuse = 1; in amdgpu_ras_sysfs_create()
1894 return 0; in amdgpu_ras_sysfs_create()
1903 return 0; in amdgpu_ras_sysfs_remove()
1905 if (!obj || !obj->attr_inuse) in amdgpu_ras_sysfs_remove()
1906 return -EINVAL; in amdgpu_ras_sysfs_remove()
1908 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove()
1909 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove()
1910 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_remove()
1912 obj->attr_inuse = 0; in amdgpu_ras_sysfs_remove()
1915 return 0; in amdgpu_ras_sysfs_remove()
1923 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_sysfs_remove_all()
1924 amdgpu_ras_sysfs_remove(adev, &obj->head); in amdgpu_ras_sysfs_remove_all()
1927 if (amdgpu_bad_page_threshold != 0) in amdgpu_ras_sysfs_remove_all()
1932 return 0; in amdgpu_ras_sysfs_remove_all()
1945 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
1949 * .. code-block:: bash
1958 struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control; in amdgpu_ras_debugfs_create_ctrl_node()
1959 struct drm_minor *minor = adev_to_drm(adev)->primary; in amdgpu_ras_debugfs_create_ctrl_node()
1962 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); in amdgpu_ras_debugfs_create_ctrl_node()
1968 &con->bad_page_cnt_threshold); in amdgpu_ras_debugfs_create_ctrl_node()
1969 debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs); in amdgpu_ras_debugfs_create_ctrl_node()
1970 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1971 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1974 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", in amdgpu_ras_debugfs_create_ctrl_node()
1977 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); in amdgpu_ras_debugfs_create_ctrl_node()
1987 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); in amdgpu_ras_debugfs_create_ctrl_node()
1994 &con->disable_ras_err_cnt_harvest); in amdgpu_ras_debugfs_create_ctrl_node()
2002 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_debugfs_create()
2009 memcpy(obj->fs_data.debugfs_name, in amdgpu_ras_debugfs_create()
2010 head->debugfs_name, in amdgpu_ras_debugfs_create()
2011 sizeof(obj->fs_data.debugfs_name)); in amdgpu_ras_debugfs_create()
2013 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, in amdgpu_ras_debugfs_create()
2021 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { in amdgpu_ras_aca_is_supported()
2022 case IP_VERSION(13, 0, 6): in amdgpu_ras_aca_is_supported()
2023 case IP_VERSION(13, 0, 12): in amdgpu_ras_aca_is_supported()
2024 case IP_VERSION(13, 0, 14): in amdgpu_ras_aca_is_supported()
2051 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_debugfs_create_all()
2052 if (amdgpu_ras_is_supported(adev, obj->head.block) && in amdgpu_ras_debugfs_create_all()
2053 (obj->attr_inuse == 1)) { in amdgpu_ras_debugfs_create_all()
2055 get_ras_block_str(&obj->head)); in amdgpu_ras_debugfs_create_all()
2056 fs_info.head = obj->head; in amdgpu_ras_debugfs_create_all()
2073 amdgpu_ras_sysfs_badpages_read, NULL, 0);
2089 &con->features_attr.attr, in amdgpu_ras_fs_init()
2090 &con->version_attr.attr, in amdgpu_ras_fs_init()
2091 &con->schema_attr.attr, in amdgpu_ras_fs_init()
2092 &con->event_state_attr.attr, in amdgpu_ras_fs_init()
2104 con->features_attr = dev_attr_features; in amdgpu_ras_fs_init()
2105 sysfs_attr_init(attrs[0]); in amdgpu_ras_fs_init()
2108 con->version_attr = dev_attr_version; in amdgpu_ras_fs_init()
2112 con->schema_attr = dev_attr_schema; in amdgpu_ras_fs_init()
2116 con->event_state_attr = dev_attr_event_state; in amdgpu_ras_fs_init()
2119 if (amdgpu_bad_page_threshold != 0) { in amdgpu_ras_fs_init()
2121 con->badpages_attr = bin_attr_gpu_vram_bad_pages; in amdgpu_ras_fs_init()
2122 sysfs_bin_attr_init(&con->badpages_attr); in amdgpu_ras_fs_init()
2123 bin_attrs[0] = &con->badpages_attr; in amdgpu_ras_fs_init()
2127 r = sysfs_create_group(&adev->dev->kobj, &group); in amdgpu_ras_fs_init()
2129 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
2131 return 0; in amdgpu_ras_fs_init()
2140 list_for_each_entry_safe(con_obj, tmp, &con->head, node) { in amdgpu_ras_fs_fini()
2141 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); in amdgpu_ras_fs_fini()
2148 return 0; in amdgpu_ras_fs_fini()
2165 * If the current interrupt is caused by a non-fatal RAS error, skip in amdgpu_ras_interrupt_fatal_error_handler()
2175 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
2176 adev->nbio.ras->handle_ras_controller_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
2177 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
2179 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
2180 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
2181 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
2188 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_consumption_handler()
2190 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
2203 amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block); in amdgpu_ras_interrupt_poison_consumption_handler()
2208 if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) { in amdgpu_ras_interrupt_poison_consumption_handler()
2209 poison_stat = block_obj->hw_ops->query_poison_status(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
2212 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", in amdgpu_ras_interrupt_poison_consumption_handler()
2213 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
2219 amdgpu_umc_poison_handler(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
2221 if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) in amdgpu_ras_interrupt_poison_consumption_handler()
2222 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
2231 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
2242 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_creation_handler()
2254 if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { in amdgpu_ras_interrupt_poison_creation_handler()
2255 struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); in amdgpu_ras_interrupt_poison_creation_handler()
2257 atomic_inc(&con->page_retirement_req_cnt); in amdgpu_ras_interrupt_poison_creation_handler()
2258 atomic_inc(&con->poison_creation_count); in amdgpu_ras_interrupt_poison_creation_handler()
2260 wake_up(&con->page_retirement_wq); in amdgpu_ras_interrupt_poison_creation_handler()
2267 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_umc_handler()
2271 if (!data->cb) in amdgpu_ras_interrupt_umc_handler()
2281 amdgpu_ras_set_fed(obj->adev, true); in amdgpu_ras_interrupt_umc_handler()
2282 ret = data->cb(obj->adev, &err_data, entry); in amdgpu_ras_interrupt_umc_handler()
2289 /* these counts could be left as 0 if in amdgpu_ras_interrupt_umc_handler()
2292 obj->err_data.ue_count += err_data.ue_count; in amdgpu_ras_interrupt_umc_handler()
2293 obj->err_data.ce_count += err_data.ce_count; in amdgpu_ras_interrupt_umc_handler()
2294 obj->err_data.de_count += err_data.de_count; in amdgpu_ras_interrupt_umc_handler()
2302 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_handler()
2305 while (data->rptr != data->wptr) { in amdgpu_ras_interrupt_handler()
2307 memcpy(&entry, &data->ring[data->rptr], in amdgpu_ras_interrupt_handler()
2308 data->element_size); in amdgpu_ras_interrupt_handler()
2311 data->rptr = (data->aligned_element_size + in amdgpu_ras_interrupt_handler()
2312 data->rptr) % data->ring_size; in amdgpu_ras_interrupt_handler()
2314 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { in amdgpu_ras_interrupt_handler()
2315 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
2320 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
2323 dev_warn(obj->adev->dev, in amdgpu_ras_interrupt_handler()
2324 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); in amdgpu_ras_interrupt_handler()
2345 obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_dispatch()
2347 return -EINVAL; in amdgpu_ras_interrupt_dispatch()
2349 data = &obj->ih_data; in amdgpu_ras_interrupt_dispatch()
2351 if (data->inuse == 0) in amdgpu_ras_interrupt_dispatch()
2352 return 0; in amdgpu_ras_interrupt_dispatch()
2355 memcpy(&data->ring[data->wptr], info->entry, in amdgpu_ras_interrupt_dispatch()
2356 data->element_size); in amdgpu_ras_interrupt_dispatch()
2359 data->wptr = (data->aligned_element_size + in amdgpu_ras_interrupt_dispatch()
2360 data->wptr) % data->ring_size; in amdgpu_ras_interrupt_dispatch()
2362 schedule_work(&data->ih_work); in amdgpu_ras_interrupt_dispatch()
2364 return 0; in amdgpu_ras_interrupt_dispatch()
2374 return -EINVAL; in amdgpu_ras_interrupt_remove_handler()
2376 data = &obj->ih_data; in amdgpu_ras_interrupt_remove_handler()
2377 if (data->inuse == 0) in amdgpu_ras_interrupt_remove_handler()
2378 return 0; in amdgpu_ras_interrupt_remove_handler()
2380 cancel_work_sync(&data->ih_work); in amdgpu_ras_interrupt_remove_handler()
2382 kfree(data->ring); in amdgpu_ras_interrupt_remove_handler()
2383 memset(data, 0, sizeof(*data)); in amdgpu_ras_interrupt_remove_handler()
2386 return 0; in amdgpu_ras_interrupt_remove_handler()
2400 return -EINVAL; in amdgpu_ras_interrupt_add_handler()
2406 data = &obj->ih_data; in amdgpu_ras_interrupt_add_handler()
2409 .inuse = 0, in amdgpu_ras_interrupt_add_handler()
2410 .cb = ras_obj->ras_cb, in amdgpu_ras_interrupt_add_handler()
2412 .rptr = 0, in amdgpu_ras_interrupt_add_handler()
2413 .wptr = 0, in amdgpu_ras_interrupt_add_handler()
2416 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); in amdgpu_ras_interrupt_add_handler()
2418 data->aligned_element_size = ALIGN(data->element_size, 8); in amdgpu_ras_interrupt_add_handler()
2420 data->ring_size = 64 * data->aligned_element_size; in amdgpu_ras_interrupt_add_handler()
2421 data->ring = kmalloc(data->ring_size, GFP_KERNEL); in amdgpu_ras_interrupt_add_handler()
2422 if (!data->ring) { in amdgpu_ras_interrupt_add_handler()
2424 return -ENOMEM; in amdgpu_ras_interrupt_add_handler()
2428 data->inuse = 1; in amdgpu_ras_interrupt_add_handler()
2430 return 0; in amdgpu_ras_interrupt_add_handler()
2438 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_interrupt_remove_all()
2439 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); in amdgpu_ras_interrupt_remove_all()
2442 return 0; in amdgpu_ras_interrupt_remove_all()
2452 if (!adev->ras_enabled || !con) in amdgpu_ras_log_on_err_counter()
2455 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_log_on_err_counter()
2457 .head = obj->head, in amdgpu_ras_log_on_err_counter()
2476 (amdgpu_ip_version(adev, MP1_HWIP, 0) == in amdgpu_ras_log_on_err_counter()
2477 IP_VERSION(13, 0, 2))) in amdgpu_ras_log_on_err_counter()
2482 if (amdgpu_ip_version(adev, MP0_HWIP, 0) != in amdgpu_ras_log_on_err_counter()
2483 IP_VERSION(11, 0, 2) && in amdgpu_ras_log_on_err_counter()
2484 amdgpu_ip_version(adev, MP0_HWIP, 0) != in amdgpu_ras_log_on_err_counter()
2485 IP_VERSION(11, 0, 4) && in amdgpu_ras_log_on_err_counter()
2486 amdgpu_ip_version(adev, MP0_HWIP, 0) != in amdgpu_ras_log_on_err_counter()
2487 IP_VERSION(13, 0, 0)) { in amdgpu_ras_log_on_err_counter()
2489 dev_warn(adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_log_on_err_counter()
2500 * Only two block need to query read/write in amdgpu_ras_error_status_query()
2503 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && in amdgpu_ras_error_status_query()
2504 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) in amdgpu_ras_error_status_query()
2508 info->head.block, in amdgpu_ras_error_status_query()
2509 info->head.sub_block_index); in amdgpu_ras_error_status_query()
2511 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_status_query()
2512 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_status_query()
2513 get_ras_block_str(&info->head)); in amdgpu_ras_error_status_query()
2517 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_error_status_query()
2518 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
2527 if (!adev->ras_enabled || !con) in amdgpu_ras_query_err_status()
2530 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_err_status()
2532 .head = obj->head, in amdgpu_ras_query_err_status()
2541 /* return 0 on success.
2542 * caller need free bps.
2545 struct ras_badpage **bps, unsigned int *count) in amdgpu_ras_badpages_read() argument
2549 int i = 0; in amdgpu_ras_badpages_read()
2550 int ret = 0, status; in amdgpu_ras_badpages_read()
2552 if (!con || !con->eh_data || !bps || !count) in amdgpu_ras_badpages_read()
2553 return -EINVAL; in amdgpu_ras_badpages_read()
2555 mutex_lock(&con->recovery_lock); in amdgpu_ras_badpages_read()
2556 data = con->eh_data; in amdgpu_ras_badpages_read()
2557 if (!data || data->count == 0) { in amdgpu_ras_badpages_read()
2558 *bps = NULL; in amdgpu_ras_badpages_read()
2559 ret = -EINVAL; in amdgpu_ras_badpages_read()
2563 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); in amdgpu_ras_badpages_read()
2564 if (!*bps) { in amdgpu_ras_badpages_read()
2565 ret = -ENOMEM; in amdgpu_ras_badpages_read()
2569 for (; i < data->count; i++) { in amdgpu_ras_badpages_read()
2570 (*bps)[i] = (struct ras_badpage){ in amdgpu_ras_badpages_read()
2571 .bp = data->bps[i].retired_page, in amdgpu_ras_badpages_read()
2575 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, in amdgpu_ras_badpages_read()
2576 data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT); in amdgpu_ras_badpages_read()
2577 if (status == -EBUSY) in amdgpu_ras_badpages_read()
2578 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; in amdgpu_ras_badpages_read()
2579 else if (status == -ENOENT) in amdgpu_ras_badpages_read()
2580 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; in amdgpu_ras_badpages_read()
2583 *count = data->count; in amdgpu_ras_badpages_read()
2585 mutex_unlock(&con->recovery_lock); in amdgpu_ras_badpages_read()
2595 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) in amdgpu_ras_set_fed_all()
2606 int hive_ras_recovery = 0; in amdgpu_ras_in_recovery()
2609 hive_ras_recovery = atomic_read(&hive->ras_recovery); in amdgpu_ras_in_recovery()
2613 if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) in amdgpu_ras_in_recovery()
2632 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery()
2638 atomic_set(&hive->ras_recovery, 1); in amdgpu_ras_do_recovery()
2645 list_for_each_entry(remote_adev, &hive->device_list, in amdgpu_ras_do_recovery()
2652 if (!ras->disable_ras_err_cnt_harvest) { in amdgpu_ras_do_recovery()
2655 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_do_recovery()
2656 device_list_handle = &hive->device_list; in amdgpu_ras_do_recovery()
2659 list_add_tail(&adev->gmc.xgmi.head, &device_list); in amdgpu_ras_do_recovery()
2672 if (amdgpu_device_should_recover_gpu(ras->adev)) { in amdgpu_ras_do_recovery()
2674 memset(&reset_context, 0, sizeof(reset_context)); in amdgpu_ras_do_recovery()
2682 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) in amdgpu_ras_do_recovery()
2687 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { in amdgpu_ras_do_recovery()
2688 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; in amdgpu_ras_do_recovery()
2695 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { in amdgpu_ras_do_recovery()
2696 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_do_recovery()
2699 psp_fatal_error_recovery_quirk(&adev->psp); in amdgpu_ras_do_recovery()
2703 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); in amdgpu_ras_do_recovery()
2705 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_do_recovery()
2707 atomic_set(&hive->ras_recovery, 0); in amdgpu_ras_do_recovery()
2712 /* alloc/realloc bps array */
2716 unsigned int old_space = data->count + data->space_left; in amdgpu_ras_realloc_eh_data_space()
2719 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); in amdgpu_ras_realloc_eh_data_space() local
2721 if (!bps) { in amdgpu_ras_realloc_eh_data_space()
2722 return -ENOMEM; in amdgpu_ras_realloc_eh_data_space()
2725 if (data->bps) { in amdgpu_ras_realloc_eh_data_space()
2726 memcpy(bps, data->bps, in amdgpu_ras_realloc_eh_data_space()
2727 data->count * sizeof(*data->bps)); in amdgpu_ras_realloc_eh_data_space()
2728 kfree(data->bps); in amdgpu_ras_realloc_eh_data_space()
2731 data->bps = bps; in amdgpu_ras_realloc_eh_data_space()
2732 data->space_left += align_space - old_space; in amdgpu_ras_realloc_eh_data_space()
2733 return 0; in amdgpu_ras_realloc_eh_data_space()
2737 struct eeprom_table_record *bps, in amdgpu_ras_mca2pa_by_idx() argument
2741 uint32_t socket = 0; in amdgpu_ras_mca2pa_by_idx()
2742 int ret = 0; in amdgpu_ras_mca2pa_by_idx()
2744 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) in amdgpu_ras_mca2pa_by_idx()
2745 socket = adev->smuio.funcs->get_socket_id(adev); in amdgpu_ras_mca2pa_by_idx()
2748 err_data->err_addr_cnt = 0; in amdgpu_ras_mca2pa_by_idx()
2749 err_data->err_addr_len = adev->umc.retire_unit; in amdgpu_ras_mca2pa_by_idx()
2751 memset(&addr_in, 0, sizeof(addr_in)); in amdgpu_ras_mca2pa_by_idx()
2752 addr_in.ma.err_addr = bps->address; in amdgpu_ras_mca2pa_by_idx()
2754 addr_in.ma.ch_inst = bps->mem_channel; in amdgpu_ras_mca2pa_by_idx()
2758 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_mca2pa_by_idx()
2759 ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, in amdgpu_ras_mca2pa_by_idx()
2766 struct eeprom_table_record *bps, in amdgpu_ras_mca2pa() argument
2770 uint32_t die_id, socket = 0; in amdgpu_ras_mca2pa()
2772 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) in amdgpu_ras_mca2pa()
2773 socket = adev->smuio.funcs->get_socket_id(adev); in amdgpu_ras_mca2pa()
2778 if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa) in amdgpu_ras_mca2pa()
2779 die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address, in amdgpu_ras_mca2pa()
2780 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT); in amdgpu_ras_mca2pa()
2782 return -EINVAL; in amdgpu_ras_mca2pa()
2785 err_data->err_addr_cnt = 0; in amdgpu_ras_mca2pa()
2786 err_data->err_addr_len = adev->umc.retire_unit; in amdgpu_ras_mca2pa()
2788 memset(&addr_in, 0, sizeof(addr_in)); in amdgpu_ras_mca2pa()
2789 addr_in.ma.err_addr = bps->address; in amdgpu_ras_mca2pa()
2790 addr_in.ma.ch_inst = bps->mem_channel; in amdgpu_ras_mca2pa()
2791 addr_in.ma.umc_inst = bps->mcumc_id; in amdgpu_ras_mca2pa()
2795 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_mca2pa()
2796 return adev->umc.ras->convert_ras_err_addr(adev, err_data, in amdgpu_ras_mca2pa()
2799 return -EINVAL; in amdgpu_ras_mca2pa()
2803 struct eeprom_table_record *bps, int count) in __amdgpu_ras_restore_bad_pages() argument
2807 struct ras_err_handler_data *data = con->eh_data; in __amdgpu_ras_restore_bad_pages()
2809 for (j = 0; j < count; j++) { in __amdgpu_ras_restore_bad_pages()
2811 bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) in __amdgpu_ras_restore_bad_pages()
2814 if (!data->space_left && in __amdgpu_ras_restore_bad_pages()
2816 return -ENOMEM; in __amdgpu_ras_restore_bad_pages()
2819 amdgpu_ras_reserve_page(adev, bps[j].retired_page); in __amdgpu_ras_restore_bad_pages()
2821 memcpy(&data->bps[data->count], &(bps[j]), in __amdgpu_ras_restore_bad_pages()
2823 data->count++; in __amdgpu_ras_restore_bad_pages()
2824 data->space_left--; in __amdgpu_ras_restore_bad_pages()
2827 return 0; in __amdgpu_ras_restore_bad_pages()
2831 struct eeprom_table_record *bps, struct ras_err_data *err_data, in __amdgpu_ras_convert_rec_array_from_rom() argument
2834 int i = 0; in __amdgpu_ras_convert_rec_array_from_rom()
2837 save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; in __amdgpu_ras_convert_rec_array_from_rom()
2840 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { in __amdgpu_ras_convert_rec_array_from_rom()
2841 memcpy(err_data->err_addr, bps, in __amdgpu_ras_convert_rec_array_from_rom()
2842 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); in __amdgpu_ras_convert_rec_array_from_rom()
2846 for (i = 0; i < adev->umc.retire_unit; i++) in __amdgpu_ras_convert_rec_array_from_rom()
2847 bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); in __amdgpu_ras_convert_rec_array_from_rom()
2852 bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) in __amdgpu_ras_convert_rec_array_from_rom()
2853 return -EINVAL; in __amdgpu_ras_convert_rec_array_from_rom()
2855 if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) in __amdgpu_ras_convert_rec_array_from_rom()
2856 return -EINVAL; in __amdgpu_ras_convert_rec_array_from_rom()
2859 if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { in __amdgpu_ras_convert_rec_array_from_rom()
2861 memcpy(err_data->err_addr, bps, in __amdgpu_ras_convert_rec_array_from_rom()
2862 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); in __amdgpu_ras_convert_rec_array_from_rom()
2864 return -EOPNOTSUPP; in __amdgpu_ras_convert_rec_array_from_rom()
2869 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); in __amdgpu_ras_convert_rec_array_from_rom()
2873 struct eeprom_table_record *bps, struct ras_err_data *err_data, in __amdgpu_ras_convert_rec_from_rom() argument
2878 save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; in __amdgpu_ras_convert_rec_from_rom()
2879 bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); in __amdgpu_ras_convert_rec_from_rom()
2883 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT)) in __amdgpu_ras_convert_rec_from_rom()
2884 return -EINVAL; in __amdgpu_ras_convert_rec_from_rom()
2886 if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) in __amdgpu_ras_convert_rec_from_rom()
2887 return -EINVAL; in __amdgpu_ras_convert_rec_from_rom()
2889 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, in __amdgpu_ras_convert_rec_from_rom()
2890 adev->umc.retire_unit); in __amdgpu_ras_convert_rec_from_rom()
2895 struct eeprom_table_record *bps, int pages, bool from_rom) in amdgpu_ras_add_bad_pages() argument
2900 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_add_bad_pages()
2902 int ret = 0; in amdgpu_ras_add_bad_pages()
2905 if (!con || !con->eh_data || !bps || pages <= 0) in amdgpu_ras_add_bad_pages()
2906 return 0; in amdgpu_ras_add_bad_pages()
2910 kcalloc(adev->umc.retire_unit, in amdgpu_ras_add_bad_pages()
2913 dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n"); in amdgpu_ras_add_bad_pages()
2914 return -ENOMEM; in amdgpu_ras_add_bad_pages()
2917 if (adev->gmc.gmc_funcs->query_mem_partition_mode) in amdgpu_ras_add_bad_pages()
2918 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); in amdgpu_ras_add_bad_pages()
2921 mutex_lock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2924 for (i = 0; i < pages; i++) { in amdgpu_ras_add_bad_pages()
2925 if (control->ras_num_recs - i >= adev->umc.retire_unit) { in amdgpu_ras_add_bad_pages()
2926 if ((bps[i].address == bps[i + 1].address) && in amdgpu_ras_add_bad_pages()
2927 (bps[i].mem_channel == bps[i + 1].mem_channel)) { in amdgpu_ras_add_bad_pages()
2930 &bps[i], &err_data, nps); in amdgpu_ras_add_bad_pages()
2933 i += (adev->umc.retire_unit - 1); in amdgpu_ras_add_bad_pages()
2943 &bps[i], &err_data, nps); in amdgpu_ras_add_bad_pages()
2948 ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); in amdgpu_ras_add_bad_pages()
2954 mutex_unlock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2960 * write error record array to eeprom, the function should be
2972 if (!con || !con->eh_data) { in amdgpu_ras_save_bad_pages()
2974 *new_cnt = 0; in amdgpu_ras_save_bad_pages()
2976 return 0; in amdgpu_ras_save_bad_pages()
2979 mutex_lock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2980 control = &con->eeprom_control; in amdgpu_ras_save_bad_pages()
2981 data = con->eh_data; in amdgpu_ras_save_bad_pages()
2982 bad_page_num = control->ras_num_bad_pages; in amdgpu_ras_save_bad_pages()
2983 save_count = data->count - bad_page_num; in amdgpu_ras_save_bad_pages()
2984 mutex_unlock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2986 unit_num = save_count / adev->umc.retire_unit; in amdgpu_ras_save_bad_pages()
2991 if (save_count > 0) { in amdgpu_ras_save_bad_pages()
2993 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { in amdgpu_ras_save_bad_pages()
2995 &data->bps[bad_page_num], save_count)) { in amdgpu_ras_save_bad_pages()
2996 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
2997 return -EIO; in amdgpu_ras_save_bad_pages()
3000 for (i = 0; i < unit_num; i++) { in amdgpu_ras_save_bad_pages()
3002 &data->bps[bad_page_num + in amdgpu_ras_save_bad_pages()
3003 i * adev->umc.retire_unit], 1)) { in amdgpu_ras_save_bad_pages()
3004 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
3005 return -EIO; in amdgpu_ras_save_bad_pages()
3010 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); in amdgpu_ras_save_bad_pages()
3013 return 0; in amdgpu_ras_save_bad_pages()
3023 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
3024 struct eeprom_table_record *bps; in amdgpu_ras_load_bad_pages() local
3025 int ret, i = 0; in amdgpu_ras_load_bad_pages()
3028 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) in amdgpu_ras_load_bad_pages()
3029 return 0; in amdgpu_ras_load_bad_pages()
3031 bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL); in amdgpu_ras_load_bad_pages()
3032 if (!bps) in amdgpu_ras_load_bad_pages()
3033 return -ENOMEM; in amdgpu_ras_load_bad_pages()
3035 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
3037 dev_err(adev->dev, "Failed to load EEPROM table records!"); in amdgpu_ras_load_bad_pages()
3039 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { in amdgpu_ras_load_bad_pages()
3040 for (i = 0; i < control->ras_num_recs; i++) { in amdgpu_ras_load_bad_pages()
3041 if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { in amdgpu_ras_load_bad_pages()
3042 if ((bps[i].address == bps[i + 1].address) && in amdgpu_ras_load_bad_pages()
3043 (bps[i].mem_channel == bps[i + 1].mem_channel)) { in amdgpu_ras_load_bad_pages()
3044 control->ras_num_pa_recs += adev->umc.retire_unit; in amdgpu_ras_load_bad_pages()
3045 i += (adev->umc.retire_unit - 1); in amdgpu_ras_load_bad_pages()
3047 control->ras_num_mca_recs += in amdgpu_ras_load_bad_pages()
3048 (control->ras_num_recs - i); in amdgpu_ras_load_bad_pages()
3052 control->ras_num_mca_recs += (control->ras_num_recs - i); in amdgpu_ras_load_bad_pages()
3064 ret = -EHWPOISON; in amdgpu_ras_load_bad_pages()
3068 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); in amdgpu_ras_load_bad_pages()
3072 kfree(bps); in amdgpu_ras_load_bad_pages()
3079 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_check_bad_page_unlock()
3083 for (i = 0; i < data->count; i++) in amdgpu_ras_check_bad_page_unlock()
3084 if (addr == data->bps[i].retired_page) in amdgpu_ras_check_bad_page_unlock()
3101 if (!con || !con->eh_data) in amdgpu_ras_check_bad_page()
3104 mutex_lock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
3106 mutex_unlock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
3118 * -1: Threshold is set to default value in amdgpu_ras_validate_threshold()
3121 * 0: Disable bad page retirement in amdgpu_ras_validate_threshold()
3124 * -2: Threshold is determined by a formula in amdgpu_ras_validate_threshold()
3127 * 0 < threshold < max number of bad page records in EEPROM, in amdgpu_ras_validate_threshold()
3128 * A user-defined threshold is set in amdgpu_ras_validate_threshold()
3131 if (amdgpu_bad_page_threshold == -2) { in amdgpu_ras_validate_threshold()
3132 u64 val = adev->gmc.mc_vram_size; in amdgpu_ras_validate_threshold()
3135 con->bad_page_cnt_threshold = min(lower_32_bits(val), in amdgpu_ras_validate_threshold()
3137 } else if (amdgpu_bad_page_threshold == -1) { in amdgpu_ras_validate_threshold()
3138 con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4; in amdgpu_ras_validate_threshold()
3140 con->bad_page_cnt_threshold = min_t(int, max_count, in amdgpu_ras_validate_threshold()
3149 int ret = 0; in amdgpu_ras_put_poison_req()
3153 memset(&poison_msg, 0, sizeof(poison_msg)); in amdgpu_ras_put_poison_req()
3160 ret = kfifo_put(&con->poison_fifo, poison_msg); in amdgpu_ras_put_poison_req()
3162 dev_err(adev->dev, "Poison message fifo is full!\n"); in amdgpu_ras_put_poison_req()
3163 return -ENOSPC; in amdgpu_ras_put_poison_req()
3166 return 0; in amdgpu_ras_put_poison_req()
3174 return kfifo_get(&con->poison_fifo, poison_msg); in amdgpu_ras_get_poison_req()
3179 mutex_init(&ecc_log->lock); in amdgpu_ras_ecc_log_init()
3181 INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); in amdgpu_ras_ecc_log_init()
3182 ecc_log->de_queried_count = 0; in amdgpu_ras_ecc_log_init()
3183 ecc_log->prev_de_queried_count = 0; in amdgpu_ras_ecc_log_init()
3192 mutex_lock(&ecc_log->lock); in amdgpu_ras_ecc_log_fini()
3193 radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { in amdgpu_ras_ecc_log_fini()
3195 kfree(ecc_err->err_pages.pfn); in amdgpu_ras_ecc_log_fini()
3197 radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); in amdgpu_ras_ecc_log_fini()
3199 mutex_unlock(&ecc_log->lock); in amdgpu_ras_ecc_log_fini()
3201 mutex_destroy(&ecc_log->lock); in amdgpu_ras_ecc_log_fini()
3202 ecc_log->de_queried_count = 0; in amdgpu_ras_ecc_log_fini()
3203 ecc_log->prev_de_queried_count = 0; in amdgpu_ras_ecc_log_fini()
3211 mutex_lock(&con->umc_ecc_log.lock); in amdgpu_ras_schedule_retirement_dwork()
3212 ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, in amdgpu_ras_schedule_retirement_dwork()
3214 mutex_unlock(&con->umc_ecc_log.lock); in amdgpu_ras_schedule_retirement_dwork()
3217 schedule_delayed_work(&con->page_retirement_dwork, in amdgpu_ras_schedule_retirement_dwork()
3227 struct amdgpu_device *adev = con->adev; in amdgpu_ras_do_page_retirement()
3255 int ret = 0; in amdgpu_ras_poison_creation_handler()
3258 uint32_t timeout = 0; in amdgpu_ras_poison_creation_handler()
3266 memset(&info, 0, sizeof(info)); in amdgpu_ras_poison_creation_handler()
3269 ecc_log = &ras->umc_ecc_log; in amdgpu_ras_poison_creation_handler()
3270 total_detect_count = 0; in amdgpu_ras_poison_creation_handler()
3276 de_queried_count = ecc_log->de_queried_count; in amdgpu_ras_poison_creation_handler()
3277 if (de_queried_count > ecc_log->prev_de_queried_count) { in amdgpu_ras_poison_creation_handler()
3278 new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; in amdgpu_ras_poison_creation_handler()
3279 ecc_log->prev_de_queried_count = de_queried_count; in amdgpu_ras_poison_creation_handler()
3280 timeout = 0; in amdgpu_ras_poison_creation_handler()
3282 new_detect_count = 0; in amdgpu_ras_poison_creation_handler()
3292 if (!--timeout) { in amdgpu_ras_poison_creation_handler()
3302 dev_warn(adev->dev, "Can't find deferred error! count: %u\n", in amdgpu_ras_poison_creation_handler()
3303 (need_query_count - total_detect_count)); in amdgpu_ras_poison_creation_handler()
3304 return -ENOENT; in amdgpu_ras_poison_creation_handler()
3308 schedule_delayed_work(&ras->page_retirement_dwork, 0); in amdgpu_ras_poison_creation_handler()
3310 return 0; in amdgpu_ras_poison_creation_handler()
3320 ret = kfifo_get(&con->poison_fifo, &msg); in amdgpu_ras_clear_poison_fifo()
3328 uint32_t reset_flags = 0, reset = 0; in amdgpu_ras_poison_consumption_handler()
3332 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); in amdgpu_ras_poison_consumption_handler()
3334 for (i = 0; i < msg_count; i++) { in amdgpu_ras_poison_consumption_handler()
3354 flush_delayed_work(&con->page_retirement_dwork); in amdgpu_ras_poison_consumption_handler()
3356 con->gpu_reset_flags |= reset; in amdgpu_ras_poison_consumption_handler()
3362 flush_work(&con->recovery_work); in amdgpu_ras_poison_consumption_handler()
3365 return 0; in amdgpu_ras_poison_consumption_handler()
3378 wait_event_interruptible(con->page_retirement_wq, in amdgpu_ras_page_retirement_thread()
3380 atomic_read(&con->page_retirement_req_cnt)); in amdgpu_ras_page_retirement_thread()
3385 gpu_reset = 0; in amdgpu_ras_page_retirement_thread()
3388 poison_creation_count = atomic_read(&con->poison_creation_count); in amdgpu_ras_page_retirement_thread()
3390 if (ret == -EIO) in amdgpu_ras_page_retirement_thread()
3394 atomic_sub(poison_creation_count, &con->poison_creation_count); in amdgpu_ras_page_retirement_thread()
3395 atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); in amdgpu_ras_page_retirement_thread()
3397 } while (atomic_read(&con->poison_creation_count)); in amdgpu_ras_page_retirement_thread()
3399 if (ret != -EIO) { in amdgpu_ras_page_retirement_thread()
3400 msg_count = kfifo_len(&con->poison_fifo); in amdgpu_ras_page_retirement_thread()
3404 if ((ret != -EIO) && in amdgpu_ras_page_retirement_thread()
3406 atomic_sub(msg_count, &con->page_retirement_req_cnt); in amdgpu_ras_page_retirement_thread()
3410 if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { in amdgpu_ras_page_retirement_thread()
3411 /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ in amdgpu_ras_page_retirement_thread()
3413 atomic_set(&con->poison_creation_count, 0); in amdgpu_ras_page_retirement_thread()
3419 atomic_set(&con->page_retirement_req_cnt, 0); in amdgpu_ras_page_retirement_thread()
3421 if (ret == -EIO) { in amdgpu_ras_page_retirement_thread()
3422 /* Wait for mode-1 reset to complete */ in amdgpu_ras_page_retirement_thread()
3423 down_read(&adev->reset_domain->sem); in amdgpu_ras_page_retirement_thread()
3424 up_read(&adev->reset_domain->sem); in amdgpu_ras_page_retirement_thread()
3428 schedule_delayed_work(&con->page_retirement_dwork, 0); in amdgpu_ras_page_retirement_thread()
3430 /* gpu just completed mode-2 reset or other reset */ in amdgpu_ras_page_retirement_thread()
3432 msg_count = kfifo_len(&con->poison_fifo); in amdgpu_ras_page_retirement_thread()
3435 atomic_sub(msg_count, &con->page_retirement_req_cnt); in amdgpu_ras_page_retirement_thread()
3439 schedule_delayed_work(&con->page_retirement_dwork, 0); in amdgpu_ras_page_retirement_thread()
3443 return 0; in amdgpu_ras_page_retirement_thread()
3453 return 0; in amdgpu_ras_init_badpage_info()
3455 control = &con->eeprom_control; in amdgpu_ras_init_badpage_info()
3460 if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_init_badpage_info()
3461 control->ras_num_pa_recs = control->ras_num_recs; in amdgpu_ras_init_badpage_info()
3463 if (control->ras_num_recs) { in amdgpu_ras_init_badpage_info()
3469 adev, control->ras_num_bad_pages); in amdgpu_ras_init_badpage_info()
3471 if (con->update_channel_flag == true) { in amdgpu_ras_init_badpage_info()
3473 adev, control->bad_channel_bitmap); in amdgpu_ras_init_badpage_info()
3474 con->update_channel_flag = false; in amdgpu_ras_init_badpage_info()
3478 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && in amdgpu_ras_init_badpage_info()
3479 control->tbl_hdr.version < RAS_TABLE_VER_V3) in amdgpu_ras_init_badpage_info()
3482 dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); in amdgpu_ras_init_badpage_info()
3492 u32 max_eeprom_records_count = 0; in amdgpu_ras_recovery_init()
3496 return 0; in amdgpu_ras_recovery_init()
3500 * adev->ras_enabled is unset, i.e. when "ras_enable" in amdgpu_ras_recovery_init()
3501 * module parameter is set to 0. in amdgpu_ras_recovery_init()
3503 con->adev = adev; in amdgpu_ras_recovery_init()
3505 if (!adev->ras_enabled) in amdgpu_ras_recovery_init()
3506 return 0; in amdgpu_ras_recovery_init()
3508 data = &con->eh_data; in amdgpu_ras_recovery_init()
3511 ret = -ENOMEM; in amdgpu_ras_recovery_init()
3515 mutex_init(&con->recovery_lock); in amdgpu_ras_recovery_init()
3516 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); in amdgpu_ras_recovery_init()
3517 atomic_set(&con->in_recovery, 0); in amdgpu_ras_recovery_init()
3518 con->eeprom_control.bad_channel_bitmap = 0; in amdgpu_ras_recovery_init()
3520 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); in amdgpu_ras_recovery_init()
3529 mutex_init(&con->page_rsv_lock); in amdgpu_ras_recovery_init()
3530 INIT_KFIFO(con->poison_fifo); in amdgpu_ras_recovery_init()
3531 mutex_init(&con->page_retirement_lock); in amdgpu_ras_recovery_init()
3532 init_waitqueue_head(&con->page_retirement_wq); in amdgpu_ras_recovery_init()
3533 atomic_set(&con->page_retirement_req_cnt, 0); in amdgpu_ras_recovery_init()
3534 atomic_set(&con->poison_creation_count, 0); in amdgpu_ras_recovery_init()
3535 con->page_retirement_thread = in amdgpu_ras_recovery_init()
3537 if (IS_ERR(con->page_retirement_thread)) { in amdgpu_ras_recovery_init()
3538 con->page_retirement_thread = NULL; in amdgpu_ras_recovery_init()
3539 dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n"); in amdgpu_ras_recovery_init()
3542 INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement); in amdgpu_ras_recovery_init()
3543 amdgpu_ras_ecc_log_init(&con->umc_ecc_log); in amdgpu_ras_recovery_init()
3545 if ((adev->asic_type == CHIP_ALDEBARAN) && in amdgpu_ras_recovery_init()
3546 (adev->gmc.xgmi.connected_to_cpu)) in amdgpu_ras_recovery_init()
3549 return 0; in amdgpu_ras_recovery_init()
3552 kfree((*data)->bps); in amdgpu_ras_recovery_init()
3554 con->eh_data = NULL; in amdgpu_ras_recovery_init()
3556 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
3563 ret = 0; in amdgpu_ras_recovery_init()
3565 ret = -EINVAL; in amdgpu_ras_recovery_init()
3573 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_recovery_fini()
3579 return 0; in amdgpu_ras_recovery_fini()
3583 flush_delayed_work(&con->page_retirement_dwork); in amdgpu_ras_recovery_fini()
3584 ret = amdgpu_ras_schedule_retirement_dwork(con, 0); in amdgpu_ras_recovery_fini()
3585 } while (ret && max_flush_timeout--); in amdgpu_ras_recovery_fini()
3587 if (con->page_retirement_thread) in amdgpu_ras_recovery_fini()
3588 kthread_stop(con->page_retirement_thread); in amdgpu_ras_recovery_fini()
3590 atomic_set(&con->page_retirement_req_cnt, 0); in amdgpu_ras_recovery_fini()
3591 atomic_set(&con->poison_creation_count, 0); in amdgpu_ras_recovery_fini()
3593 mutex_destroy(&con->page_rsv_lock); in amdgpu_ras_recovery_fini()
3595 cancel_work_sync(&con->recovery_work); in amdgpu_ras_recovery_fini()
3597 cancel_delayed_work_sync(&con->page_retirement_dwork); in amdgpu_ras_recovery_fini()
3599 amdgpu_ras_ecc_log_fini(&con->umc_ecc_log); in amdgpu_ras_recovery_fini()
3601 mutex_lock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
3602 con->eh_data = NULL; in amdgpu_ras_recovery_fini()
3603 kfree(data->bps); in amdgpu_ras_recovery_fini()
3605 mutex_unlock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
3607 return 0; in amdgpu_ras_recovery_fini()
3614 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { in amdgpu_ras_asic_supported()
3615 case IP_VERSION(13, 0, 2): in amdgpu_ras_asic_supported()
3616 case IP_VERSION(13, 0, 6): in amdgpu_ras_asic_supported()
3617 case IP_VERSION(13, 0, 12): in amdgpu_ras_asic_supported()
3618 case IP_VERSION(13, 0, 14): in amdgpu_ras_asic_supported()
3625 if (adev->asic_type == CHIP_IP_DISCOVERY) { in amdgpu_ras_asic_supported()
3626 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { in amdgpu_ras_asic_supported()
3627 case IP_VERSION(13, 0, 0): in amdgpu_ras_asic_supported()
3628 case IP_VERSION(13, 0, 6): in amdgpu_ras_asic_supported()
3629 case IP_VERSION(13, 0, 10): in amdgpu_ras_asic_supported()
3630 case IP_VERSION(13, 0, 12): in amdgpu_ras_asic_supported()
3631 case IP_VERSION(13, 0, 14): in amdgpu_ras_asic_supported()
3632 case IP_VERSION(14, 0, 3): in amdgpu_ras_asic_supported()
3639 return adev->asic_type == CHIP_VEGA10 || in amdgpu_ras_asic_supported()
3640 adev->asic_type == CHIP_VEGA20 || in amdgpu_ras_asic_supported()
3641 adev->asic_type == CHIP_ARCTURUS || in amdgpu_ras_asic_supported()
3642 adev->asic_type == CHIP_ALDEBARAN || in amdgpu_ras_asic_supported()
3643 adev->asic_type == CHIP_SIENNA_CICHLID; in amdgpu_ras_asic_supported()
3649 * due to GC EDC can not write
3653 struct atom_context *ctx = adev->mode_info.atom_context; in amdgpu_ras_get_quirks()
3658 if (strnstr(ctx->vbios_pn, "D16406", in amdgpu_ras_get_quirks()
3659 sizeof(ctx->vbios_pn)) || in amdgpu_ras_get_quirks()
3660 strnstr(ctx->vbios_pn, "D36002", in amdgpu_ras_get_quirks()
3661 sizeof(ctx->vbios_pn))) in amdgpu_ras_get_quirks()
3662 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_get_quirks()
3670 dev_info(adev->dev, "MEM ECC is active.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
3671 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_query_ras_capablity_from_vbios()
3674 dev_info(adev->dev, "MEM ECC is not presented.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
3679 dev_info(adev->dev, "SRAM ECC is active.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
3681 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_query_ras_capablity_from_vbios()
3684 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | in amdgpu_ras_query_ras_capablity_from_vbios()
3692 if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || in amdgpu_ras_query_ras_capablity_from_vbios()
3693 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || in amdgpu_ras_query_ras_capablity_from_vbios()
3694 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) in amdgpu_ras_query_ras_capablity_from_vbios()
3695 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_query_ras_capablity_from_vbios()
3698 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_query_ras_capablity_from_vbios()
3705 if (!adev->gmc.xgmi.num_physical_nodes) in amdgpu_ras_query_ras_capablity_from_vbios()
3706 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); in amdgpu_ras_query_ras_capablity_from_vbios()
3708 dev_info(adev->dev, "SRAM ECC is not presented.\n"); in amdgpu_ras_query_ras_capablity_from_vbios()
3723 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_ras_query_poison_mode()
3724 adev->gmc.is_app_apu) { in amdgpu_ras_query_poison_mode()
3726 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
3727 } else if (adev->df.funcs && in amdgpu_ras_query_poison_mode()
3728 adev->df.funcs->query_ras_poison_mode && in amdgpu_ras_query_poison_mode()
3729 adev->umc.ras && in amdgpu_ras_query_poison_mode()
3730 adev->umc.ras->query_ras_poison_mode) { in amdgpu_ras_query_poison_mode()
3732 adev->df.funcs->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
3734 adev->umc.ras->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
3738 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
3740 dev_warn(adev->dev, in amdgpu_ras_query_poison_mode()
3757 adev->ras_hw_enabled = adev->ras_enabled = 0; in amdgpu_ras_check_supported()
3768 if (amdgpu_psp_get_ras_capability(&adev->psp)) in amdgpu_ras_check_supported()
3772 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { in amdgpu_ras_check_supported()
3777 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | in amdgpu_ras_check_supported()
3790 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; in amdgpu_ras_check_supported()
3792 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : in amdgpu_ras_check_supported()
3793 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
3796 adev->aca.is_enabled = in amdgpu_ras_check_supported()
3797 (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || in amdgpu_ras_check_supported()
3798 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || in amdgpu_ras_check_supported()
3799 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)); in amdgpu_ras_check_supported()
3802 if (adev->gmc.is_app_apu && in amdgpu_ras_check_supported()
3803 amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) in amdgpu_ras_check_supported()
3804 amdgpu_bad_page_threshold = 0; in amdgpu_ras_check_supported()
3811 struct amdgpu_device *adev = con->adev; in amdgpu_ras_counte_dw()
3816 res = pm_runtime_get_sync(dev->dev); in amdgpu_ras_counte_dw()
3817 if (res < 0) in amdgpu_ras_counte_dw()
3822 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) { in amdgpu_ras_counte_dw()
3823 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_counte_dw()
3824 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_counte_dw()
3827 pm_runtime_mark_last_busy(dev->dev); in amdgpu_ras_counte_dw()
3829 pm_runtime_put_autosuspend(dev->dev); in amdgpu_ras_counte_dw()
3834 return amdgpu_ras_is_poison_mode_supported(adev) ? AMDGPU_RAS_ERROR__POISON : 0 | in amdgpu_get_ras_schema()
3845 memset(mgr, 0, sizeof(*mgr)); in ras_event_mgr_init()
3846 atomic64_set(&mgr->seqno, 0); in ras_event_mgr_init()
3848 for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { in ras_event_mgr_init()
3849 event_state = &mgr->event_state[i]; in ras_event_mgr_init()
3850 event_state->last_seqno = RAS_EVENT_INVALID_ID; in ras_event_mgr_init()
3851 atomic64_set(&event_state->count, 0); in ras_event_mgr_init()
3864 ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr; in amdgpu_ras_event_mgr_init()
3866 /* init event manager with node 0 on xgmi system */ in amdgpu_ras_event_mgr_init()
3868 if (!hive || adev->gmc.xgmi.node_id == 0) in amdgpu_ras_event_mgr_init()
3869 ras_event_mgr_init(ras->event_mgr); in amdgpu_ras_event_mgr_init()
3880 if (!con || (adev->flags & AMD_IS_APU)) in amdgpu_ras_init_reserved_vram_size()
3883 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { in amdgpu_ras_init_reserved_vram_size()
3884 case IP_VERSION(13, 0, 2): in amdgpu_ras_init_reserved_vram_size()
3885 case IP_VERSION(13, 0, 6): in amdgpu_ras_init_reserved_vram_size()
3886 case IP_VERSION(13, 0, 12): in amdgpu_ras_init_reserved_vram_size()
3887 con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT; in amdgpu_ras_init_reserved_vram_size()
3889 case IP_VERSION(13, 0, 14): in amdgpu_ras_init_reserved_vram_size()
3890 con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1); in amdgpu_ras_init_reserved_vram_size()
3903 return 0; in amdgpu_ras_init()
3910 return -ENOMEM; in amdgpu_ras_init()
3912 con->adev = adev; in amdgpu_ras_init()
3913 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); in amdgpu_ras_init()
3914 atomic_set(&con->ras_ce_count, 0); in amdgpu_ras_init()
3915 atomic_set(&con->ras_ue_count, 0); in amdgpu_ras_init()
3917 con->objs = (struct ras_manager *)(con + 1); in amdgpu_ras_init()
3923 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { in amdgpu_ras_init()
3927 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { in amdgpu_ras_init()
3928 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_init()
3930 return 0; in amdgpu_ras_init()
3933 r = 0; in amdgpu_ras_init()
3937 con->update_channel_flag = false; in amdgpu_ras_init()
3938 con->features = 0; in amdgpu_ras_init()
3939 con->schema = 0; in amdgpu_ras_init()
3940 INIT_LIST_HEAD(&con->head); in amdgpu_ras_init()
3942 con->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_init()
3947 switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { in amdgpu_ras_init()
3948 case IP_VERSION(7, 4, 0): in amdgpu_ras_init()
3951 if (!adev->gmc.xgmi.connected_to_cpu) in amdgpu_ras_init()
3952 adev->nbio.ras = &nbio_v7_4_ras; in amdgpu_ras_init()
3954 case IP_VERSION(4, 3, 0): in amdgpu_ras_init()
3955 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) in amdgpu_ras_init()
3962 adev->nbio.ras = &nbio_v4_3_ras; in amdgpu_ras_init()
3965 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) in amdgpu_ras_init()
3973 adev->nbio.ras = &nbif_v6_3_1_ras; in amdgpu_ras_init()
3975 case IP_VERSION(7, 9, 0): in amdgpu_ras_init()
3977 if (!adev->gmc.is_app_apu) in amdgpu_ras_init()
3978 adev->nbio.ras = &nbio_v7_9_ras; in amdgpu_ras_init()
3991 if (adev->nbio.ras && in amdgpu_ras_init()
3992 adev->nbio.ras->init_ras_controller_interrupt) { in amdgpu_ras_init()
3993 r = adev->nbio.ras->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
3998 if (adev->nbio.ras && in amdgpu_ras_init()
3999 adev->nbio.ras->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
4000 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
4006 if (adev->smuio.funcs && in amdgpu_ras_init()
4007 adev->smuio.funcs->get_socket_id) in amdgpu_ras_init()
4008 con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << in amdgpu_ras_init()
4012 con->schema = amdgpu_get_ras_schema(adev); in amdgpu_ras_init()
4017 r = -EINVAL; in amdgpu_ras_init()
4030 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
4032 adev->ras_hw_enabled, adev->ras_enabled); in amdgpu_ras_init()
4034 return 0; in amdgpu_ras_init()
4044 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_persistent_edc_harvesting_supported()
4045 adev->gmc.is_app_apu) in amdgpu_persistent_edc_harvesting_supported()
4047 return 0; in amdgpu_persistent_edc_harvesting_supported()
4058 return 0; in amdgpu_persistent_edc_harvesting()
4060 if (amdgpu_ras_query_error_status(adev, &info) != 0) in amdgpu_persistent_edc_harvesting()
4063 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) in amdgpu_persistent_edc_harvesting()
4066 return 0; in amdgpu_persistent_edc_harvesting()
4076 return con->poison_supported; in amdgpu_ras_is_poison_mode_supported()
4090 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_ras_block_late_init()
4091 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); in amdgpu_ras_block_late_init()
4092 return 0; in amdgpu_ras_block_late_init()
4097 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) { in amdgpu_ras_block_late_init()
4109 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) in amdgpu_ras_block_late_init()
4110 return 0; in amdgpu_ras_block_late_init()
4113 if (ras_obj->ras_cb || (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
4114 (ras_obj->hw_ops->query_poison_status || in amdgpu_ras_block_late_init()
4115 ras_obj->hw_ops->handle_poison_consumption))) { in amdgpu_ras_block_late_init()
4121 if (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
4122 (ras_obj->hw_ops->query_ras_error_count || in amdgpu_ras_block_late_init()
4123 ras_obj->hw_ops->query_ras_error_status)) { in amdgpu_ras_block_late_init()
4132 return -ENOMEM; in amdgpu_ras_block_late_init()
4133 memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); in amdgpu_ras_block_late_init()
4135 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { in amdgpu_ras_block_late_init()
4136 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_block_late_init()
4137 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_block_late_init()
4143 return 0; in amdgpu_ras_block_late_init()
4146 if (ras_obj->ras_cb) in amdgpu_ras_block_late_init()
4149 amdgpu_ras_feature_enable(adev, ras_block, 0); in amdgpu_ras_block_late_init()
4170 if (ras_obj->ras_cb) in amdgpu_ras_block_late_fini()
4188 if (!adev->ras_enabled || !con) { in amdgpu_ras_resume()
4195 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_resume()
4207 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_resume()
4208 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { in amdgpu_ras_resume()
4209 amdgpu_ras_feature_enable(adev, &obj->head, 0); in amdgpu_ras_resume()
4221 if (!adev->ras_enabled || !con) in amdgpu_ras_suspend()
4224 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_suspend()
4226 if (AMDGPU_RAS_GET_FEATURES(con->features)) in amdgpu_ras_suspend()
4258 return 0; in amdgpu_ras_late_init()
4260 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_late_init()
4261 obj = node->ras_obj; in amdgpu_ras_late_init()
4263 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_late_init()
4267 if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) in amdgpu_ras_late_init()
4270 if (obj->ras_late_init) { in amdgpu_ras_late_init()
4271 r = obj->ras_late_init(adev, &obj->ras_comm); in amdgpu_ras_late_init()
4273 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", in amdgpu_ras_late_init()
4274 obj->ras_comm.name, r); in amdgpu_ras_late_init()
4278 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); in amdgpu_ras_late_init()
4281 return 0; in amdgpu_ras_late_init()
4289 if (!adev->ras_enabled || !con) in amdgpu_ras_pre_fini()
4290 return 0; in amdgpu_ras_pre_fini()
4294 if (AMDGPU_RAS_GET_FEATURES(con->features)) in amdgpu_ras_pre_fini()
4295 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_pre_fini()
4297 return 0; in amdgpu_ras_pre_fini()
4306 if (!adev->ras_enabled || !con) in amdgpu_ras_fini()
4307 return 0; in amdgpu_ras_fini()
4309 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { in amdgpu_ras_fini()
4310 if (ras_node->ras_obj) { in amdgpu_ras_fini()
4311 obj = ras_node->ras_obj; in amdgpu_ras_fini()
4312 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && in amdgpu_ras_fini()
4313 obj->ras_fini) in amdgpu_ras_fini()
4314 obj->ras_fini(adev, &obj->ras_comm); in amdgpu_ras_fini()
4316 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); in amdgpu_ras_fini()
4320 list_del(&ras_node->node); in amdgpu_ras_fini()
4334 WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); in amdgpu_ras_fini()
4336 if (AMDGPU_RAS_GET_FEATURES(con->features)) in amdgpu_ras_fini()
4337 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_fini()
4339 cancel_delayed_work_sync(&con->ras_counte_delay_work); in amdgpu_ras_fini()
4344 return 0; in amdgpu_ras_fini()
4355 return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_get_fed_status()
4365 set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_set_fed()
4367 clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_set_fed()
4377 ras->ras_err_state = 0; in amdgpu_ras_clear_err_state()
4387 set_bit(block, &ras->ras_err_state); in amdgpu_ras_set_err_poison()
4397 return (ras->ras_err_state != 0); in amdgpu_ras_is_err_state()
4399 return test_bit(block, &ras->ras_err_state) || in amdgpu_ras_is_err_state()
4401 &ras->ras_err_state); in amdgpu_ras_is_err_state()
4415 return ras->event_mgr; in __get_ras_event_mgr()
4423 int ret = 0; in amdgpu_ras_mark_ras_event_caller()
4426 ret = -EINVAL; in amdgpu_ras_mark_ras_event_caller()
4432 ret = -EINVAL; in amdgpu_ras_mark_ras_event_caller()
4436 event_state = &event_mgr->event_state[type]; in amdgpu_ras_mark_ras_event_caller()
4437 event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); in amdgpu_ras_mark_ras_event_caller()
4438 atomic64_inc(&event_state->count); in amdgpu_ras_mark_ras_event_caller()
4442 dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n", in amdgpu_ras_mark_ras_event_caller()
4464 id = event_mgr->event_state[type].last_seqno; in amdgpu_ras_acquire_event_id()
4477 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { in amdgpu_ras_global_ras_isr()
4491 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_global_ras_isr()
4498 if (adev->asic_type == CHIP_VEGA20 && in amdgpu_ras_need_emergency_restart()
4499 adev->pm.fw_version <= 0x283400) { in amdgpu_ras_need_emergency_restart()
4514 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { in amdgpu_release_ras_context()
4515 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_release_ras_context()
4527 for (i = 0; i < mce_adev_list.num_gpu; i++) { in find_adev()
4530 if (adev && adev->gmc.xgmi.connected_to_cpu && in find_adev()
4531 adev->gmc.xgmi.physical_node_id == node_id) in find_adev()
4539 #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF)
4540 #define GET_UMC_INST(m) (((m) >> 21) & 0x7)
4541 #define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
4549 uint32_t gpu_id = 0; in amdgpu_bad_page_notifier()
4550 uint32_t umc_inst = 0, ch_inst = 0; in amdgpu_bad_page_notifier()
4554 * and error occurred in DramECC (Extended error code = 0) then only in amdgpu_bad_page_notifier()
4557 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && in amdgpu_bad_page_notifier()
4558 (XEC(m->status, 0x3f) == 0x0))) in amdgpu_bad_page_notifier()
4570 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; in amdgpu_bad_page_notifier()
4583 umc_inst = GET_UMC_INST(m->ipid); in amdgpu_bad_page_notifier()
4584 ch_inst = GET_CHAN_INDEX(m->ipid); in amdgpu_bad_page_notifier()
4586 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", in amdgpu_bad_page_notifier()
4589 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) in amdgpu_bad_page_notifier()
4628 return adev->psp.ras_context.ras; in amdgpu_ras_get_context()
4634 return -EINVAL; in amdgpu_ras_set_context()
4636 adev->psp.ras_context.ras = ras_con; in amdgpu_ras_set_context()
4637 return 0; in amdgpu_ras_set_context()
4644 int ret = 0; in amdgpu_ras_is_supported()
4648 return 0; in amdgpu_ras_is_supported()
4650 ret = ras && (adev->ras_enabled & (1 << block)); in amdgpu_ras_is_supported()
4665 amdgpu_ras_get_ras_block(adev, block, 0)) in amdgpu_ras_is_supported()
4677 ras->gpu_reset_flags = 0; in amdgpu_ras_reset_gpu()
4678 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_reset_gpu()
4681 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { in amdgpu_ras_reset_gpu()
4683 int hive_ras_recovery = 0; in amdgpu_ras_reset_gpu()
4686 hive_ras_recovery = atomic_read(&hive->ras_recovery); in amdgpu_ras_reset_gpu()
4694 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
4696 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_reset_gpu()
4698 flush_work(&ras->recovery_work); in amdgpu_ras_reset_gpu()
4699 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
4702 return 0; in amdgpu_ras_reset_gpu()
4708 int ret = 0; in amdgpu_ras_set_mca_debug_mode()
4713 con->is_aca_debug_mode = enable; in amdgpu_ras_set_mca_debug_mode()
4722 int ret = 0; in amdgpu_ras_set_aca_debug_mode()
4730 con->is_aca_debug_mode = enable; in amdgpu_ras_set_aca_debug_mode()
4739 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; in amdgpu_ras_get_aca_debug_mode()
4740 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_get_aca_debug_mode()
4745 if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) || in amdgpu_ras_get_aca_debug_mode()
4746 (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode)) in amdgpu_ras_get_aca_debug_mode()
4747 return con->is_aca_debug_mode; in amdgpu_ras_get_aca_debug_mode()
4756 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_get_error_query_mode()
4757 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; in amdgpu_ras_get_error_query_mode()
4766 …} else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode… in amdgpu_ras_get_error_query_mode()
4768 (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; in amdgpu_ras_get_error_query_mode()
4782 return -EINVAL; in amdgpu_ras_register_ras_block()
4786 return -ENOMEM; in amdgpu_ras_register_ras_block()
4788 INIT_LIST_HEAD(&ras_node->node); in amdgpu_ras_register_ras_block()
4789 ras_node->ras_obj = ras_block_obj; in amdgpu_ras_register_ras_block()
4790 list_add_tail(&ras_node->node, &adev->ras_list); in amdgpu_ras_register_ras_block()
4792 return 0; in amdgpu_ras_register_ras_block()
4824 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_memory_id_field()
4825 reg_entry->seg_lo, reg_entry->reg_lo); in amdgpu_ras_inst_get_memory_id_field()
4828 if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && in amdgpu_ras_inst_get_memory_id_field()
4848 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_err_cnt_field()
4849 reg_entry->seg_hi, reg_entry->reg_hi); in amdgpu_ras_inst_get_err_cnt_field()
4852 if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && in amdgpu_ras_inst_get_err_cnt_field()
4855 dev_dbg(adev->dev, "Invalid err_info field\n"); in amdgpu_ras_inst_get_err_cnt_field()
4877 for (i = 0; i < reg_list_size; i++) { in amdgpu_ras_inst_query_ras_error_count()
4895 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
4901 for (j = 0; j < mem_list_size; j++) { in amdgpu_ras_inst_query_ras_error_count()
4903 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
4923 for (i = 0; i < reg_list_size; i++) { in amdgpu_ras_inst_reset_ras_error_count()
4930 WREG32(err_status_lo_offset, 0); in amdgpu_ras_inst_reset_ras_error_count()
4931 WREG32(err_status_hi_offset, 0); in amdgpu_ras_inst_reset_ras_error_count()
4937 memset(err_data, 0, sizeof(*err_data)); in amdgpu_ras_error_data_init()
4939 INIT_LIST_HEAD(&err_data->err_node_list); in amdgpu_ras_error_data_init()
4941 return 0; in amdgpu_ras_error_data_init()
4949 list_del(&err_node->node); in amdgpu_ras_error_node_release()
4957 list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) in amdgpu_ras_error_data_fini()
4971 ref_id = &err_node->err_info.mcm_info; in amdgpu_ras_error_find_node_by_id()
4973 if (mcm_info->socket_id == ref_id->socket_id && in amdgpu_ras_error_find_node_by_id()
4974 mcm_info->die_id == ref_id->die_id) in amdgpu_ras_error_find_node_by_id()
4989 INIT_LIST_HEAD(&err_node->node); in amdgpu_ras_error_node_new()
4998 struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info; in ras_err_info_cmp()
4999 struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info; in ras_err_info_cmp()
5001 if (unlikely(infoa->socket_id != infob->socket_id)) in ras_err_info_cmp()
5002 return infoa->socket_id - infob->socket_id; in ras_err_info_cmp()
5004 return infoa->die_id - infob->die_id; in ras_err_info_cmp()
5006 return 0; in ras_err_info_cmp()
5016 return &err_node->err_info; in amdgpu_ras_error_get_info()
5022 memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); in amdgpu_ras_error_get_info()
5024 err_data->err_list_count++; in amdgpu_ras_error_get_info()
5025 list_add_tail(&err_node->node, &err_data->err_node_list); in amdgpu_ras_error_get_info()
5026 list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); in amdgpu_ras_error_get_info()
5028 return &err_node->err_info; in amdgpu_ras_error_get_info()
5038 return -EINVAL; in amdgpu_ras_error_statistic_ue_count()
5041 return 0; in amdgpu_ras_error_statistic_ue_count()
5045 return -EINVAL; in amdgpu_ras_error_statistic_ue_count()
5047 err_info->ue_count += count; in amdgpu_ras_error_statistic_ue_count()
5048 err_data->ue_count += count; in amdgpu_ras_error_statistic_ue_count()
5050 return 0; in amdgpu_ras_error_statistic_ue_count()
5060 return -EINVAL; in amdgpu_ras_error_statistic_ce_count()
5063 return 0; in amdgpu_ras_error_statistic_ce_count()
5067 return -EINVAL; in amdgpu_ras_error_statistic_ce_count()
5069 err_info->ce_count += count; in amdgpu_ras_error_statistic_ce_count()
5070 err_data->ce_count += count; in amdgpu_ras_error_statistic_ce_count()
5072 return 0; in amdgpu_ras_error_statistic_ce_count()
5082 return -EINVAL; in amdgpu_ras_error_statistic_de_count()
5085 return 0; in amdgpu_ras_error_statistic_de_count()
5089 return -EINVAL; in amdgpu_ras_error_statistic_de_count()
5091 err_info->de_count += count; in amdgpu_ras_error_statistic_de_count()
5092 err_data->de_count += count; in amdgpu_ras_error_statistic_de_count()
5094 return 0; in amdgpu_ras_error_statistic_de_count()
5097 #define mmMP0_SMN_C2PMSG_92 0x1609C
5098 #define mmMP0_SMN_C2PMSG_126 0x160BE
5121 hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 0 : 1); in amdgpu_ras_boot_time_error_reporting()
5124 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5125 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n", in amdgpu_ras_boot_time_error_reporting()
5129 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5130 "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n", in amdgpu_ras_boot_time_error_reporting()
5134 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5135 "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n", in amdgpu_ras_boot_time_error_reporting()
5139 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5140 "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n", in amdgpu_ras_boot_time_error_reporting()
5144 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5145 "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n", in amdgpu_ras_boot_time_error_reporting()
5149 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5150 "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n", in amdgpu_ras_boot_time_error_reporting()
5154 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5155 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n", in amdgpu_ras_boot_time_error_reporting()
5159 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5160 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", in amdgpu_ras_boot_time_error_reporting()
5164 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5165 "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n", in amdgpu_ras_boot_time_error_reporting()
5169 dev_info(adev->dev, in amdgpu_ras_boot_time_error_reporting()
5170 "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n", in amdgpu_ras_boot_time_error_reporting()
5184 for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { in amdgpu_ras_boot_error_detected()
5199 for (i = 0; i < num_instances; i++) { in amdgpu_ras_query_boot_status()
5208 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; in amdgpu_ras_reserve_page()
5210 int ret = 0; in amdgpu_ras_reserve_page()
5212 mutex_lock(&con->page_rsv_lock); in amdgpu_ras_reserve_page()
5214 if (ret == -ENOENT) in amdgpu_ras_reserve_page()
5216 mutex_unlock(&con->page_rsv_lock); in amdgpu_ras_reserve_page()
5232 dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); in amdgpu_ras_event_log_print()
5234 dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); in amdgpu_ras_event_log_print()
5246 return con->is_rma; in amdgpu_ras_is_rma()