Lines Matching +full:gfx +full:- +full:mem

61 	"gfx",
97 if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT) in get_ras_block_str()
100 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) in get_ras_block_str()
101 return ras_mca_block_string[ras_block->sub_block_index]; in get_ras_block_str()
103 return ras_block_string[ras_block->block]; in get_ras_block_str()
143 amdgpu_ras_get_context(adev)->error_query_ready = ready; in amdgpu_ras_set_error_query_ready()
149 return amdgpu_ras_get_context(adev)->error_query_ready; in amdgpu_ras_get_error_query_ready()
160 if ((address >= adev->gmc.mc_vram_size) || in amdgpu_reserve_page_direct()
162 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
165 return -EINVAL; in amdgpu_reserve_page_direct()
169 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
191 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
192 dev_warn(adev->dev, "Clear EEPROM:\n"); in amdgpu_reserve_page_direct()
193 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
201 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; in amdgpu_ras_debugfs_read()
203 .head = obj->head, in amdgpu_ras_debugfs_read()
208 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_debugfs_read()
209 return -EINVAL; in amdgpu_ras_debugfs_read()
212 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && in amdgpu_ras_debugfs_read()
213 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { in amdgpu_ras_debugfs_read()
214 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_debugfs_read()
215 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_debugfs_read()
224 s -= *pos; in amdgpu_ras_debugfs_read()
229 return -EINVAL; in amdgpu_ras_debugfs_read()
252 return -EINVAL; in amdgpu_ras_find_block_id_by_name()
263 int op = -1; in amdgpu_ras_debugfs_ctrl_parse_data()
271 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
278 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
290 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
292 if (op != -1) { in amdgpu_ras_debugfs_ctrl_parse_data()
296 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
298 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
299 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
305 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
307 data->head.block = block_id; in amdgpu_ras_debugfs_ctrl_parse_data()
310 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
312 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
314 data->head.type = AMDGPU_RAS_ERROR__POISON; in amdgpu_ras_debugfs_ctrl_parse_data()
316 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
318 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
329 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
330 data->head.sub_block_index = sub_block; in amdgpu_ras_debugfs_ctrl_parse_data()
331 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
332 data->inject.value = value; in amdgpu_ras_debugfs_ctrl_parse_data()
333 data->inject.instance_mask = instance_mask; in amdgpu_ras_debugfs_ctrl_parse_data()
337 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
340 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
349 int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; in amdgpu_ras_instance_mask_check()
350 uint32_t mask, inst_mask = data->inject.instance_mask; in amdgpu_ras_instance_mask_check()
354 data->inject.instance_mask = 0; in amdgpu_ras_instance_mask_check()
355 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
362 switch (data->head.block) { in amdgpu_ras_instance_mask_check()
364 mask = GENMASK(num_xcc - 1, 0); in amdgpu_ras_instance_mask_check()
367 mask = GENMASK(adev->sdma.num_instances - 1, 0); in amdgpu_ras_instance_mask_check()
371 mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); in amdgpu_ras_instance_mask_check()
379 data->inject.instance_mask &= mask; in amdgpu_ras_instance_mask_check()
380 if (inst_mask != data->inject.instance_mask) in amdgpu_ras_instance_mask_check()
381 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
383 inst_mask, data->inject.instance_mask); in amdgpu_ras_instance_mask_check()
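The three GENMASK() branches above clamp a user-supplied instance mask to the instances that actually exist for the target block (XCCs for GFX, SDMA instances, VCN instances). A minimal userspace sketch of that clamping, assuming a hypothetical part with four GFX instances (the counts, the simplified 32-bit GENMASK(), and main() are illustrative, not kernel code):

#include <stdio.h>

#define GENMASK(h, l) (((~0u) << (l)) & (~0u >> (31 - (h))))

int main(void)
{
	unsigned int num_xcc = 4;                    /* hypothetical: 4 bits set in adev->gfx.xcc_mask */
	unsigned int inst_mask = 0xff;               /* user asked for instances 0..7 */
	unsigned int mask = GENMASK(num_xcc - 1, 0); /* only instances 0..3 exist */

	if (inst_mask != (inst_mask & mask))
		printf("instance_mask adjusted: 0x%x -> 0x%x\n", inst_mask, inst_mask & mask);
	return 0;
}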
398 * sub_block_index: some IPs have subcomponents, e.g., GFX, SDMA.
408 * - 0: disable RAS on the block. Take ::head as its data.
409 * - 1: enable RAS on the block. Take ::head as its data.
410 * - 2: inject errors on the block. Take ::inject as its data.
421 * .. code-block:: bash
425 …* echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/…
433 * The block is one of: umc, sdma, gfx, etc.
437 * ue is multi-uncorrectable
438 * ce is single-correctable
441 * The sub-block is the sub-block index; pass 0 if there is no sub-block.
447 * .. code-block:: bash
459 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
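Tying the documented control interface together: a small userspace sketch that issues one inject command, equivalent to the echo example above. The node name ras_ctrl and card index 0 are assumptions (the debugfs path is truncated in the example above; the ras/ directory is inferred from the ras_eeprom_reset path shown earlier); the command string follows the documented "inject <block> <error> <sub-block> <address> <value> <mask>" syntax:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* one uncorrectable error into UMC, sub-block 0, instance mask 0x1 */
	const char *cmd = "inject umc ue 0 0 0 0x1\n";
	/* assumed debugfs node name, by analogy with ras_eeprom_reset above */
	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);

	if (fd < 0 || write(fd, cmd, strlen(cmd)) < 0) {
		perror("ras_ctrl");
		return 1;
	}
	close(fd);
	return 0;
}

After the injection, the per-block error counters can be read back from the <block>_err_count sysfs node listed above.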
471 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_ctrl_write()
476 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
494 return -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
504 if ((data.inject.address >= adev->gmc.mc_vram_size && in amdgpu_ras_debugfs_ctrl_write()
505 adev->gmc.mc_vram_size) || in amdgpu_ras_debugfs_ctrl_write()
507 dev_warn(adev->dev, "RAS WARN: input address " in amdgpu_ras_debugfs_ctrl_write()
510 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
517 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " in amdgpu_ras_debugfs_ctrl_write()
529 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
548 * .. code-block:: bash
560 (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_eeprom_write()
564 &(amdgpu_ras_get_context(adev)->eeprom_control)); in amdgpu_ras_debugfs_eeprom_write()
569 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_debugfs_eeprom_write()
594 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
605 * .. code-block:: bash
616 .head = obj->head, in amdgpu_ras_sysfs_read()
619 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_sysfs_read()
622 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_sysfs_read()
623 return -EINVAL; in amdgpu_ras_sysfs_read()
625 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && in amdgpu_ras_sysfs_read()
626 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { in amdgpu_ras_sysfs_read()
627 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_sysfs_read()
628 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_sysfs_read()
637 #define get_obj(obj) do { (obj)->use++; } while (0)
638 #define alive_obj(obj) ((obj)->use)
642 if (obj && (--obj->use == 0)) { in put_obj()
643 list_del(&obj->node); in put_obj()
644 amdgpu_ras_error_data_fini(&obj->err_data); in put_obj()
647 if (obj && (obj->use < 0)) in put_obj()
648 DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", get_ras_block_str(&obj->head)); in put_obj()
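get_obj()/put_obj() above implement a bare reference count on ras_manager objects; dropping below zero is the "Unbalanced obj use" error path. A toy restatement of the discipline in plain C (the return codes stand in for the list_del()/err_data teardown and DRM_ERROR paths):

struct obj { int use; };

static void get_obj(struct obj *o) { o->use++; }

static int put_obj(struct obj *o)
{
	if (o && --o->use == 0)
		return 1;        /* last reference: unlink node, free err_data */
	if (o && o->use < 0)
		return -1;       /* unbalanced put: the DRM_ERROR case above */
	return 0;
}

int main(void)
{
	struct obj o = { 0 };

	get_obj(&o);                      /* use = 1 */
	return put_obj(&o) == 1 ? 0 : 1;  /* balanced pair releases the object */
}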
658 if (!adev->ras_enabled || !con) in amdgpu_ras_create_obj()
661 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_create_obj()
664 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_create_obj()
665 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_create_obj()
668 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_create_obj()
670 obj = &con->objs[head->block]; in amdgpu_ras_create_obj()
676 if (amdgpu_ras_error_data_init(&obj->err_data)) in amdgpu_ras_create_obj()
679 obj->head = *head; in amdgpu_ras_create_obj()
680 obj->adev = adev; in amdgpu_ras_create_obj()
681 list_add(&obj->node, &con->head); in amdgpu_ras_create_obj()
695 if (!adev->ras_enabled || !con) in amdgpu_ras_find_obj()
699 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_find_obj()
702 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_find_obj()
703 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_find_obj()
706 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_find_obj()
708 obj = &con->objs[head->block]; in amdgpu_ras_find_obj()
714 obj = &con->objs[i]; in amdgpu_ras_find_obj()
728 return adev->ras_hw_enabled & BIT(head->block); in amdgpu_ras_is_feature_allowed()
736 return con->features & BIT(head->block); in amdgpu_ras_is_feature_enabled()
751 * RAS framework checks con->hw_supported to see if it needs to do in __amdgpu_ras_feature_enable()
753 * IP checks con->support to see if it needs to disable ras. in __amdgpu_ras_feature_enable()
762 return -EINVAL; in __amdgpu_ras_feature_enable()
767 con->features |= BIT(head->block); in __amdgpu_ras_feature_enable()
770 con->features &= ~BIT(head->block); in __amdgpu_ras_feature_enable()
787 return -EINVAL; in amdgpu_ras_feature_enable()
789 /* For non-gfx ip, do not enable ras feature if it is not allowed */ in amdgpu_ras_feature_enable()
790 /* For gfx ip, regardless of feature support status, */ in amdgpu_ras_feature_enable()
792 if (head->block != AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
796 /* Only enable gfx ras feature from host side */ in amdgpu_ras_feature_enable()
797 if (head->block == AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
802 return -ENOMEM; in amdgpu_ras_feature_enable()
805 info->disable_features = (struct ta_ras_disable_features_input) { in amdgpu_ras_feature_enable()
806 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
807 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
810 info->enable_features = (struct ta_ras_enable_features_input) { in amdgpu_ras_feature_enable()
811 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
812 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
816 ret = psp_ras_enable_features(&adev->psp, info, enable); in amdgpu_ras_feature_enable()
818 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", in amdgpu_ras_feature_enable()
843 return -EINVAL; in amdgpu_ras_feature_enable_on_boot()
845 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_feature_enable_on_boot()
851 * with error code -EAGAIN. in amdgpu_ras_feature_enable_on_boot()
858 if (ret == -EINVAL) { in amdgpu_ras_feature_enable_on_boot()
861 dev_info(adev->dev, in amdgpu_ras_feature_enable_on_boot()
871 /* gfx block ras disable cmd must be sent to ras-ta */ in amdgpu_ras_feature_enable_on_boot()
872 if (head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
873 con->features |= BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
877 /* clean gfx block ras features flag */ in amdgpu_ras_feature_enable_on_boot()
878 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
879 con->features &= ~BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
893 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_disable_all_features()
898 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
901 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
906 return con->features; in amdgpu_ras_disable_all_features()
959 return con->features; in amdgpu_ras_enable_all_features()
967 return -EINVAL; in amdgpu_ras_block_match_default()
969 if (block_obj->ras_comm.block == block) in amdgpu_ras_block_match_default()
972 return -EINVAL; in amdgpu_ras_block_match_default()
984 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_get_ras_block()
985 if (!node->ras_obj) { in amdgpu_ras_get_ras_block()
986 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_get_ras_block()
990 obj = node->ras_obj; in amdgpu_ras_get_ras_block()
991 if (obj->ras_block_match) { in amdgpu_ras_get_ras_block()
992 if (obj->ras_block_match(obj, block, sub_block_index) == 0) in amdgpu_ras_get_ras_block()
1012 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); in amdgpu_ras_get_ecc_info()
1013 if (ret == -EOPNOTSUPP) { in amdgpu_ras_get_ecc_info()
1014 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1015 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_ras_get_ecc_info()
1016 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1021 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1022 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) in amdgpu_ras_get_ecc_info()
1023 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1025 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1026 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_ras_get_ecc_info()
1027 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1029 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1030 adev->umc.ras->ecc_info_query_ras_error_address) in amdgpu_ras_get_ecc_info()
1031 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1047 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1048 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1049 if (err_info->ue_count) { in amdgpu_ras_error_print_error_data()
1050 dev_info(adev->dev, "socket: %d, die: %d, " in amdgpu_ras_error_print_error_data()
1052 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1053 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1054 err_info->ue_count, in amdgpu_ras_error_print_error_data()
1059 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1060 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1061 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1062 dev_info(adev->dev, "socket: %d, die: %d, " in amdgpu_ras_error_print_error_data()
1064 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name); in amdgpu_ras_error_print_error_data()
1069 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1070 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1071 if (err_info->ce_count) { in amdgpu_ras_error_print_error_data()
1072 dev_info(adev->dev, "socket: %d, die: %d, " in amdgpu_ras_error_print_error_data()
1074 mcm_info->socket_id, in amdgpu_ras_error_print_error_data()
1075 mcm_info->die_id, in amdgpu_ras_error_print_error_data()
1076 err_info->ce_count, in amdgpu_ras_error_print_error_data()
1081 for_each_ras_error(err_node, &ras_mgr->err_data) { in amdgpu_ras_error_print_error_data()
1082 err_info = &err_node->err_info; in amdgpu_ras_error_print_error_data()
1083 mcm_info = &err_info->mcm_info; in amdgpu_ras_error_print_error_data()
1084 dev_info(adev->dev, "socket: %d, die: %d, " in amdgpu_ras_error_print_error_data()
1086 mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name); in amdgpu_ras_error_print_error_data()
1093 return !list_empty(&data->err_node_list); in err_data_has_source_info()
1100 struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); in amdgpu_ras_error_generate_report()
1101 const char *blk_name = get_ras_block_str(&query_if->head); in amdgpu_ras_error_generate_report()
1103 if (err_data->ce_count) { in amdgpu_ras_error_generate_report()
1106 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1107 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1108 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1109 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1110 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_error_generate_report()
1113 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1114 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1115 ras_mgr->err_data.ce_count, in amdgpu_ras_error_generate_report()
1118 dev_info(adev->dev, "%ld correctable hardware errors " in amdgpu_ras_error_generate_report()
1120 ras_mgr->err_data.ce_count, in amdgpu_ras_error_generate_report()
1125 if (err_data->ue_count) { in amdgpu_ras_error_generate_report()
1128 } else if (!adev->aid_mask && in amdgpu_ras_error_generate_report()
1129 adev->smuio.funcs && in amdgpu_ras_error_generate_report()
1130 adev->smuio.funcs->get_socket_id && in amdgpu_ras_error_generate_report()
1131 adev->smuio.funcs->get_die_id) { in amdgpu_ras_error_generate_report()
1132 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_error_generate_report()
1135 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_error_generate_report()
1136 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_error_generate_report()
1137 ras_mgr->err_data.ue_count, in amdgpu_ras_error_generate_report()
1140 dev_info(adev->dev, "%ld uncorrectable hardware errors " in amdgpu_ras_error_generate_report()
1142 ras_mgr->err_data.ue_count, in amdgpu_ras_error_generate_report()
1156 err_info = &err_node->err_info; in amdgpu_rasmgr_error_data_statistic_update()
1158 amdgpu_ras_error_statistic_ce_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1159 &err_info->mcm_info, NULL, err_info->ce_count); in amdgpu_rasmgr_error_data_statistic_update()
1160 amdgpu_ras_error_statistic_ue_count(&obj->err_data, in amdgpu_rasmgr_error_data_statistic_update()
1161 &err_info->mcm_info, NULL, err_info->ue_count); in amdgpu_rasmgr_error_data_statistic_update()
1165 obj->err_data.ue_count += err_data->ue_count; in amdgpu_rasmgr_error_data_statistic_update()
1166 obj->err_data.ce_count += err_data->ce_count; in amdgpu_rasmgr_error_data_statistic_update()
1175 enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT; in amdgpu_ras_query_error_status_helper()
1179 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1182 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1185 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { in amdgpu_ras_query_error_status_helper()
1188 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); in amdgpu_ras_query_error_status_helper()
1189 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_query_error_status_helper()
1190 dev_dbg_once(adev->dev, "%s doesn't configure RAS function\n", in amdgpu_ras_query_error_status_helper()
1191 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status_helper()
1192 return -EINVAL; in amdgpu_ras_query_error_status_helper()
1195 if (block_obj->hw_ops->query_ras_error_count) in amdgpu_ras_query_error_status_helper()
1196 block_obj->hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_query_error_status_helper()
1198 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || in amdgpu_ras_query_error_status_helper()
1199 (info->head.block == AMDGPU_RAS_BLOCK__GFX) || in amdgpu_ras_query_error_status_helper()
1200 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { in amdgpu_ras_query_error_status_helper()
1201 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_query_error_status_helper()
1202 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_query_error_status_helper()
1217 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_query_error_status()
1223 return -EINVAL; in amdgpu_ras_query_error_status()
1230 return -EINVAL; in amdgpu_ras_query_error_status()
1240 info->ue_count = obj->err_data.ue_count; in amdgpu_ras_query_error_status()
1241 info->ce_count = obj->err_data.ce_count; in amdgpu_ras_query_error_status()
1256 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_reset_error_count()
1260 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_reset_error_count()
1261 dev_dbg_once(adev->dev, "%s doesn't configure RAS function\n", in amdgpu_ras_reset_error_count()
1263 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1268 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1272 hive_ras_recovery = atomic_read(&hive->ras_recovery); in amdgpu_ras_reset_error_count()
1277 if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || in amdgpu_ras_reset_error_count()
1279 mca_funcs && mca_funcs->mca_set_debug_mode) in amdgpu_ras_reset_error_count()
1280 return -EOPNOTSUPP; in amdgpu_ras_reset_error_count()
1282 if (block_obj->hw_ops->reset_ras_error_count) in amdgpu_ras_reset_error_count()
1283 block_obj->hw_ops->reset_ras_error_count(adev); in amdgpu_ras_reset_error_count()
1293 if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) in amdgpu_ras_reset_error_status()
1298 if (block_obj->hw_ops->reset_ras_error_status) in amdgpu_ras_reset_error_status()
1299 block_obj->hw_ops->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
1309 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_error_inject()
1311 .block_id = amdgpu_ras_block_to_ta(info->head.block), in amdgpu_ras_error_inject()
1312 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), in amdgpu_ras_error_inject()
1313 .sub_block_index = info->head.sub_block_index, in amdgpu_ras_error_inject()
1314 .address = info->address, in amdgpu_ras_error_inject()
1315 .value = info->value, in amdgpu_ras_error_inject()
1317 int ret = -EINVAL; in amdgpu_ras_error_inject()
1319 info->head.block, in amdgpu_ras_error_inject()
1320 info->head.sub_block_index); in amdgpu_ras_error_inject()
1327 return -EINVAL; in amdgpu_ras_error_inject()
1329 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_inject()
1330 dev_dbg_once(adev->dev, "%s doesn't configure RAS function\n", in amdgpu_ras_error_inject()
1331 get_ras_block_str(&info->head)); in amdgpu_ras_error_inject()
1332 return -EINVAL; in amdgpu_ras_error_inject()
1336 if (adev->gmc.xgmi.num_physical_nodes > 1 && in amdgpu_ras_error_inject()
1337 info->head.block != AMDGPU_RAS_BLOCK__GFX) { in amdgpu_ras_error_inject()
1343 if (block_obj->hw_ops->ras_error_inject) { in amdgpu_ras_error_inject()
1344 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_error_inject()
1345 ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); in amdgpu_ras_error_inject()
1347 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, in amdgpu_ras_error_inject()
1348 info->instance_mask); in amdgpu_ras_error_inject()
1351 ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); in amdgpu_ras_error_inject()
1355 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1356 get_ras_block_str(&info->head), ret); in amdgpu_ras_error_inject()
1362 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
1386 *ce_count += query_info->ce_count; in amdgpu_ras_query_error_count_helper()
1387 *ue_count += query_info->ue_count; in amdgpu_ras_query_error_count_helper()
1393 if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) in amdgpu_ras_query_error_count_helper()
1394 dev_warn(adev->dev, in amdgpu_ras_query_error_count_helper()
1402 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1413 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
1425 if (!adev->ras_enabled || !con) in amdgpu_ras_query_error_count()
1426 return -EOPNOTSUPP; in amdgpu_ras_query_error_count()
1437 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_error_count()
1439 .head = obj->head, in amdgpu_ras_query_error_count()
1504 * .. code-block:: bash
1517 struct amdgpu_device *adev = con->adev; in amdgpu_ras_sysfs_badpages_read()
1519 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; in amdgpu_ras_sysfs_badpages_read()
1520 unsigned int start = div64_ul(ppos + element_size - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
1521 unsigned int end = div64_ul(ppos + count - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
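element_size above is the length of one formatted bad-page line (sizeof the string literal minus its trailing NUL, i.e. 28 bytes), and start/end translate the requested byte window [ppos, ppos + count) into record indices. A quick standalone check of that arithmetic, with div64_ul() replaced by plain division and arbitrary example values:

#include <stdio.h>

int main(void)
{
	const unsigned int element_size = sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; /* 28 */
	unsigned long long ppos = 30, count = 64;  /* arbitrary read window */
	unsigned int start = (ppos + element_size - 1) / element_size; /* first whole record */
	unsigned int end = (ppos + count - 1) / element_size;          /* last record touched */

	printf("element_size=%u start=%u end=%u\n", element_size, start, end);
	return 0;
}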
1549 return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); in amdgpu_ras_sysfs_features_read()
1557 return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version); in amdgpu_ras_sysfs_version_show()
1565 return sysfs_emit(buf, "schema: 0x%x\n", con->schema); in amdgpu_ras_sysfs_schema_show()
1572 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_bad_page_node()
1573 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove_bad_page_node()
1574 &con->badpages_attr.attr, in amdgpu_ras_sysfs_remove_bad_page_node()
1582 &con->features_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1583 &con->version_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1584 &con->schema_attr.attr, in amdgpu_ras_sysfs_remove_dev_attr_node()
1592 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_dev_attr_node()
1593 sysfs_remove_group(&adev->dev->kobj, &group); in amdgpu_ras_sysfs_remove_dev_attr_node()
1603 if (!obj || obj->attr_inuse) in amdgpu_ras_sysfs_create()
1604 return -EINVAL; in amdgpu_ras_sysfs_create()
1608 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), in amdgpu_ras_sysfs_create()
1609 "%s_err_count", head->name); in amdgpu_ras_sysfs_create()
1611 obj->sysfs_attr = (struct device_attribute){ in amdgpu_ras_sysfs_create()
1613 .name = obj->fs_data.sysfs_name, in amdgpu_ras_sysfs_create()
1618 sysfs_attr_init(&obj->sysfs_attr.attr); in amdgpu_ras_sysfs_create()
1620 if (sysfs_add_file_to_group(&adev->dev->kobj, in amdgpu_ras_sysfs_create()
1621 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_create()
1624 return -EINVAL; in amdgpu_ras_sysfs_create()
1627 obj->attr_inuse = 1; in amdgpu_ras_sysfs_create()
1637 if (!obj || !obj->attr_inuse) in amdgpu_ras_sysfs_remove()
1638 return -EINVAL; in amdgpu_ras_sysfs_remove()
1640 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove()
1641 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove()
1642 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_remove()
1644 obj->attr_inuse = 0; in amdgpu_ras_sysfs_remove()
1655 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_sysfs_remove_all()
1656 amdgpu_ras_sysfs_remove(adev, &obj->head); in amdgpu_ras_sysfs_remove_all()
1681 * .. code-block:: bash
1690 struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control; in amdgpu_ras_debugfs_create_ctrl_node()
1691 struct drm_minor *minor = adev_to_drm(adev)->primary; in amdgpu_ras_debugfs_create_ctrl_node()
1694 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); in amdgpu_ras_debugfs_create_ctrl_node()
1700 &con->bad_page_cnt_threshold); in amdgpu_ras_debugfs_create_ctrl_node()
1701 debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs); in amdgpu_ras_debugfs_create_ctrl_node()
1702 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1703 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1706 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", in amdgpu_ras_debugfs_create_ctrl_node()
1709 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); in amdgpu_ras_debugfs_create_ctrl_node()
1719 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); in amdgpu_ras_debugfs_create_ctrl_node()
1726 &con->disable_ras_err_cnt_harvest); in amdgpu_ras_debugfs_create_ctrl_node()
1734 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_debugfs_create()
1741 memcpy(obj->fs_data.debugfs_name, in amdgpu_ras_debugfs_create()
1742 head->debugfs_name, in amdgpu_ras_debugfs_create()
1743 sizeof(obj->fs_data.debugfs_name)); in amdgpu_ras_debugfs_create()
1745 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, in amdgpu_ras_debugfs_create()
1765 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_debugfs_create_all()
1766 if (amdgpu_ras_is_supported(adev, obj->head.block) && in amdgpu_ras_debugfs_create_all()
1767 (obj->attr_inuse == 1)) { in amdgpu_ras_debugfs_create_all()
1769 get_ras_block_str(&obj->head)); in amdgpu_ras_debugfs_create_all()
1770 fs_info.head = obj->head; in amdgpu_ras_debugfs_create_all()
1796 &con->features_attr.attr, in amdgpu_ras_fs_init()
1797 &con->version_attr.attr, in amdgpu_ras_fs_init()
1798 &con->schema_attr.attr, in amdgpu_ras_fs_init()
1810 con->features_attr = dev_attr_features; in amdgpu_ras_fs_init()
1814 con->version_attr = dev_attr_version; in amdgpu_ras_fs_init()
1818 con->schema_attr = dev_attr_schema; in amdgpu_ras_fs_init()
1824 con->badpages_attr = bin_attr_gpu_vram_bad_pages; in amdgpu_ras_fs_init()
1825 bin_attrs[0] = &con->badpages_attr; in amdgpu_ras_fs_init()
1830 r = sysfs_create_group(&adev->dev->kobj, &group); in amdgpu_ras_fs_init()
1832 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
1843 list_for_each_entry_safe(con_obj, tmp, &con->head, node) { in amdgpu_ras_fs_fini()
1844 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); in amdgpu_ras_fs_fini()
1868 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1869 adev->nbio.ras->handle_ras_controller_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1870 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1872 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1873 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1874 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1881 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_consumption_handler()
1883 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
1892 if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) { in amdgpu_ras_interrupt_poison_consumption_handler()
1893 poison_stat = block_obj->hw_ops->query_poison_status(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1896 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1897 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
1905 if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) in amdgpu_ras_interrupt_poison_consumption_handler()
1906 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1910 dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1911 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
1921 dev_info(obj->adev->dev, in amdgpu_ras_interrupt_poison_creation_handler()
1928 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_umc_handler()
1932 if (!data->cb) in amdgpu_ras_interrupt_umc_handler()
1942 ret = data->cb(obj->adev, &err_data, entry); in amdgpu_ras_interrupt_umc_handler()
1952 obj->err_data.ue_count += err_data.ue_count; in amdgpu_ras_interrupt_umc_handler()
1953 obj->err_data.ce_count += err_data.ce_count; in amdgpu_ras_interrupt_umc_handler()
1961 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_handler()
1964 while (data->rptr != data->wptr) { in amdgpu_ras_interrupt_handler()
1966 memcpy(&entry, &data->ring[data->rptr], in amdgpu_ras_interrupt_handler()
1967 data->element_size); in amdgpu_ras_interrupt_handler()
1970 data->rptr = (data->aligned_element_size + in amdgpu_ras_interrupt_handler()
1971 data->rptr) % data->ring_size; in amdgpu_ras_interrupt_handler()
1973 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { in amdgpu_ras_interrupt_handler()
1974 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
1979 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
1982 dev_warn(obj->adev->dev, in amdgpu_ras_interrupt_handler()
1983 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); in amdgpu_ras_interrupt_handler()
2001 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_dispatch()
2002 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_dispatch()
2005 return -EINVAL; in amdgpu_ras_interrupt_dispatch()
2007 if (data->inuse == 0) in amdgpu_ras_interrupt_dispatch()
2011 memcpy(&data->ring[data->wptr], info->entry, in amdgpu_ras_interrupt_dispatch()
2012 data->element_size); in amdgpu_ras_interrupt_dispatch()
2015 data->wptr = (data->aligned_element_size + in amdgpu_ras_interrupt_dispatch()
2016 data->wptr) % data->ring_size; in amdgpu_ras_interrupt_dispatch()
2018 schedule_work(&data->ih_work); in amdgpu_ras_interrupt_dispatch()
2030 return -EINVAL; in amdgpu_ras_interrupt_remove_handler()
2032 data = &obj->ih_data; in amdgpu_ras_interrupt_remove_handler()
2033 if (data->inuse == 0) in amdgpu_ras_interrupt_remove_handler()
2036 cancel_work_sync(&data->ih_work); in amdgpu_ras_interrupt_remove_handler()
2038 kfree(data->ring); in amdgpu_ras_interrupt_remove_handler()
2056 return -EINVAL; in amdgpu_ras_interrupt_add_handler()
2062 data = &obj->ih_data; in amdgpu_ras_interrupt_add_handler()
2066 .cb = ras_obj->ras_cb, in amdgpu_ras_interrupt_add_handler()
2072 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); in amdgpu_ras_interrupt_add_handler()
2074 data->aligned_element_size = ALIGN(data->element_size, 8); in amdgpu_ras_interrupt_add_handler()
2076 data->ring_size = 64 * data->aligned_element_size; in amdgpu_ras_interrupt_add_handler()
2077 data->ring = kmalloc(data->ring_size, GFP_KERNEL); in amdgpu_ras_interrupt_add_handler()
2078 if (!data->ring) { in amdgpu_ras_interrupt_add_handler()
2080 return -ENOMEM; in amdgpu_ras_interrupt_add_handler()
2084 data->inuse = 1; in amdgpu_ras_interrupt_add_handler()
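The interrupt path above is a simple single-producer ring: amdgpu_ras_interrupt_dispatch() copies an entry at wptr and advances it, the worker drains entries until rptr catches up, and both indices advance by the 8-byte-aligned element size modulo ring_size (64 elements, per the kmalloc above). A compact sketch of that index arithmetic, with a hypothetical 20-byte entry size:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int element_size = 20;                /* hypothetical IV entry size */
	unsigned int aligned = ALIGN(element_size, 8); /* -> 24 */
	unsigned int ring_size = 64 * aligned;
	unsigned int wptr = 0, rptr = 0;

	wptr = (aligned + wptr) % ring_size;           /* producer: dispatch */
	while (rptr != wptr)                           /* consumer: handler drains */
		rptr = (aligned + rptr) % ring_size;

	printf("aligned=%u ring_size=%u\n", aligned, ring_size);
	return 0;
}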
2094 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_interrupt_remove_all()
2095 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); in amdgpu_ras_interrupt_remove_all()
2108 if (!adev->ras_enabled || !con) in amdgpu_ras_log_on_err_counter()
2111 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_log_on_err_counter()
2113 .head = obj->head, in amdgpu_ras_log_on_err_counter()
2145 dev_warn(adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_log_on_err_counter()
2159 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && in amdgpu_ras_error_status_query()
2160 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) in amdgpu_ras_error_status_query()
2164 info->head.block, in amdgpu_ras_error_status_query()
2165 info->head.sub_block_index); in amdgpu_ras_error_status_query()
2167 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_status_query()
2168 dev_dbg_once(adev->dev, "%s doesn't configure RAS function\n", in amdgpu_ras_error_status_query()
2169 get_ras_block_str(&info->head)); in amdgpu_ras_error_status_query()
2173 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_error_status_query()
2174 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
2183 if (!adev->ras_enabled || !con) in amdgpu_ras_query_err_status()
2186 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_err_status()
2188 .head = obj->head, in amdgpu_ras_query_err_status()
2208 if (!con || !con->eh_data || !bps || !count) in amdgpu_ras_badpages_read()
2209 return -EINVAL; in amdgpu_ras_badpages_read()
2211 mutex_lock(&con->recovery_lock); in amdgpu_ras_badpages_read()
2212 data = con->eh_data; in amdgpu_ras_badpages_read()
2213 if (!data || data->count == 0) { in amdgpu_ras_badpages_read()
2215 ret = -EINVAL; in amdgpu_ras_badpages_read()
2219 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); in amdgpu_ras_badpages_read()
2221 ret = -ENOMEM; in amdgpu_ras_badpages_read()
2225 for (; i < data->count; i++) { in amdgpu_ras_badpages_read()
2227 .bp = data->bps[i].retired_page, in amdgpu_ras_badpages_read()
2231 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, in amdgpu_ras_badpages_read()
2232 data->bps[i].retired_page); in amdgpu_ras_badpages_read()
2233 if (status == -EBUSY) in amdgpu_ras_badpages_read()
2235 else if (status == -ENOENT) in amdgpu_ras_badpages_read()
2239 *count = data->count; in amdgpu_ras_badpages_read()
2241 mutex_unlock(&con->recovery_lock); in amdgpu_ras_badpages_read()
2250 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery()
2255 atomic_set(&hive->ras_recovery, 1); in amdgpu_ras_do_recovery()
2256 if (!ras->disable_ras_err_cnt_harvest) { in amdgpu_ras_do_recovery()
2259 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_do_recovery()
2260 device_list_handle = &hive->device_list; in amdgpu_ras_do_recovery()
2263 list_add_tail(&adev->gmc.xgmi.head, &device_list); in amdgpu_ras_do_recovery()
2275 if (amdgpu_device_should_recover_gpu(ras->adev)) { in amdgpu_ras_do_recovery()
2283 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) in amdgpu_ras_do_recovery()
2288 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { in amdgpu_ras_do_recovery()
2289 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; in amdgpu_ras_do_recovery()
2296 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { in amdgpu_ras_do_recovery()
2297 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_do_recovery()
2300 psp_fatal_error_recovery_quirk(&adev->psp); in amdgpu_ras_do_recovery()
2304 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); in amdgpu_ras_do_recovery()
2306 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_do_recovery()
2308 atomic_set(&hive->ras_recovery, 0); in amdgpu_ras_do_recovery()
2317 unsigned int old_space = data->count + data->space_left; in amdgpu_ras_realloc_eh_data_space()
2320 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); in amdgpu_ras_realloc_eh_data_space()
2323 return -ENOMEM; in amdgpu_ras_realloc_eh_data_space()
2326 if (data->bps) { in amdgpu_ras_realloc_eh_data_space()
2327 memcpy(bps, data->bps, in amdgpu_ras_realloc_eh_data_space()
2328 data->count * sizeof(*data->bps)); in amdgpu_ras_realloc_eh_data_space()
2329 kfree(data->bps); in amdgpu_ras_realloc_eh_data_space()
2332 data->bps = bps; in amdgpu_ras_realloc_eh_data_space()
2333 data->space_left += align_space - old_space; in amdgpu_ras_realloc_eh_data_space()
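amdgpu_ras_realloc_eh_data_space() grows the bad-page array by allocating a larger buffer, copying the old records, and crediting the extra capacity to space_left. Only old_space, the copy, and the space_left adjustment are visible above; the sketch below assumes the new capacity is rounded up in 1024-record chunks (that rounding rule is an assumption, not shown in the matched lines):

#include <stdlib.h>
#include <string.h>

struct bp { unsigned long long retired_page; };

static int grow(struct bp **bps, unsigned int *count,
		unsigned int *space_left, unsigned int pages)
{
	unsigned int old_space = *count + *space_left;
	unsigned int align_space = ((old_space + pages + 1023) / 1024) * 1024; /* assumed */
	struct bp *n = malloc(align_space * sizeof(*n));

	if (!n)
		return -1;
	if (*bps) {
		memcpy(n, *bps, *count * sizeof(*n)); /* keep existing records */
		free(*bps);
	}
	*bps = n;
	*space_left += align_space - old_space;
	return 0;
}

int main(void)
{
	struct bp *bps = NULL;
	unsigned int count = 0, space_left = 0;

	return grow(&bps, &count, &space_left, 1);
}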
2346 if (!con || !con->eh_data || !bps || pages <= 0) in amdgpu_ras_add_bad_pages()
2349 mutex_lock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2350 data = con->eh_data; in amdgpu_ras_add_bad_pages()
2359 if (!data->space_left && in amdgpu_ras_add_bad_pages()
2361 ret = -ENOMEM; in amdgpu_ras_add_bad_pages()
2365 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, in amdgpu_ras_add_bad_pages()
2369 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps)); in amdgpu_ras_add_bad_pages()
2370 data->count++; in amdgpu_ras_add_bad_pages()
2371 data->space_left--; in amdgpu_ras_add_bad_pages()
2374 mutex_unlock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2392 if (!con || !con->eh_data) { in amdgpu_ras_save_bad_pages()
2399 mutex_lock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2400 control = &con->eeprom_control; in amdgpu_ras_save_bad_pages()
2401 data = con->eh_data; in amdgpu_ras_save_bad_pages()
2402 save_count = data->count - control->ras_num_recs; in amdgpu_ras_save_bad_pages()
2403 mutex_unlock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2406 *new_cnt = save_count / adev->umc.retire_unit; in amdgpu_ras_save_bad_pages()
2411 &data->bps[control->ras_num_recs], in amdgpu_ras_save_bad_pages()
2413 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
2414 return -EIO; in amdgpu_ras_save_bad_pages()
2417 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); in amdgpu_ras_save_bad_pages()
2430 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
2435 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) in amdgpu_ras_load_bad_pages()
2438 bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL); in amdgpu_ras_load_bad_pages()
2440 return -ENOMEM; in amdgpu_ras_load_bad_pages()
2442 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
2444 dev_err(adev->dev, "Failed to load EEPROM table records!"); in amdgpu_ras_load_bad_pages()
2446 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
2455 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_check_bad_page_unlock()
2459 for (i = 0; i < data->count; i++) in amdgpu_ras_check_bad_page_unlock()
2460 if (addr == data->bps[i].retired_page) in amdgpu_ras_check_bad_page_unlock()
2477 if (!con || !con->eh_data) in amdgpu_ras_check_bad_page()
2480 mutex_lock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
2482 mutex_unlock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
2495 * in eeprom or amdgpu_bad_page_threshold == -2, introduce two in amdgpu_ras_validate_threshold()
2499 * - If amdgpu_bad_page_threshold = -2, in amdgpu_ras_validate_threshold()
2502 * - When the value from user is 0 < amdgpu_bad_page_threshold < in amdgpu_ras_validate_threshold()
2506 * - If amdgpu_bad_page_threshold = 0, bad page retirement in amdgpu_ras_validate_threshold()
2512 u64 val = adev->gmc.mc_vram_size; in amdgpu_ras_validate_threshold()
2515 con->bad_page_cnt_threshold = min(lower_32_bits(val), in amdgpu_ras_validate_threshold()
2518 con->bad_page_cnt_threshold = min_t(int, max_count, in amdgpu_ras_validate_threshold()
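The comment block above describes three threshold cases: amdgpu_bad_page_threshold = -2 selects a typical value derived from VRAM size, 0 disables bad-page retirement, and a positive user value is clamped to the EEPROM record capacity. A hedged sketch of that selection; the VRAM scaling (one record per 2 MiB) is assumed, since the matched lines show only the min() clamps against max_count:

#include <stdint.h>

static uint32_t pick_threshold(int user_thresh, uint64_t vram_size,
			       uint32_t max_count)
{
	if (user_thresh == -2) {
		uint64_t val = vram_size >> 21; /* assumed: one record per 2 MiB */

		return val < max_count ? (uint32_t)val : max_count;
	}
	if (user_thresh == 0)
		return 0; /* bad page retirement disabled */
	return (uint32_t)user_thresh < max_count ? (uint32_t)user_thresh : max_count;
}

int main(void)
{
	/* 16 GiB of VRAM with a 4096-record EEPROM: -2 clamps to 4096 */
	return pick_threshold(-2, 16ULL << 30, 4096) == 4096 ? 0 : 1;
}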
2536 * adev->ras_enabled is unset, i.e. when "ras_enable" in amdgpu_ras_recovery_init()
2539 con->adev = adev; in amdgpu_ras_recovery_init()
2541 if (!adev->ras_enabled) in amdgpu_ras_recovery_init()
2544 data = &con->eh_data; in amdgpu_ras_recovery_init()
2547 ret = -ENOMEM; in amdgpu_ras_recovery_init()
2551 mutex_init(&con->recovery_lock); in amdgpu_ras_recovery_init()
2552 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); in amdgpu_ras_recovery_init()
2553 atomic_set(&con->in_recovery, 0); in amdgpu_ras_recovery_init()
2554 con->eeprom_control.bad_channel_bitmap = 0; in amdgpu_ras_recovery_init()
2556 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); in amdgpu_ras_recovery_init()
2563 if (adev->gmc.xgmi.pending_reset) in amdgpu_ras_recovery_init()
2565 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); in amdgpu_ras_recovery_init()
2573 if (con->eeprom_control.ras_num_recs) { in amdgpu_ras_recovery_init()
2578 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); in amdgpu_ras_recovery_init()
2580 if (con->update_channel_flag == true) { in amdgpu_ras_recovery_init()
2581 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); in amdgpu_ras_recovery_init()
2582 con->update_channel_flag = false; in amdgpu_ras_recovery_init()
2587 if ((adev->asic_type == CHIP_ALDEBARAN) && in amdgpu_ras_recovery_init()
2588 (adev->gmc.xgmi.connected_to_cpu)) in amdgpu_ras_recovery_init()
2594 kfree((*data)->bps); in amdgpu_ras_recovery_init()
2596 con->eh_data = NULL; in amdgpu_ras_recovery_init()
2598 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
2607 ret = -EINVAL; in amdgpu_ras_recovery_init()
2615 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_recovery_fini()
2621 cancel_work_sync(&con->recovery_work); in amdgpu_ras_recovery_fini()
2623 mutex_lock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
2624 con->eh_data = NULL; in amdgpu_ras_recovery_fini()
2625 kfree(data->bps); in amdgpu_ras_recovery_fini()
2627 mutex_unlock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
2645 if (adev->asic_type == CHIP_IP_DISCOVERY) { in amdgpu_ras_asic_supported()
2656 return adev->asic_type == CHIP_VEGA10 || in amdgpu_ras_asic_supported()
2657 adev->asic_type == CHIP_VEGA20 || in amdgpu_ras_asic_supported()
2658 adev->asic_type == CHIP_ARCTURUS || in amdgpu_ras_asic_supported()
2659 adev->asic_type == CHIP_ALDEBARAN || in amdgpu_ras_asic_supported()
2660 adev->asic_type == CHIP_SIENNA_CICHLID; in amdgpu_ras_asic_supported()
2665 * force enable gfx ras, ignore vbios gfx ras flag
2670 struct atom_context *ctx = adev->mode_info.atom_context; in amdgpu_ras_get_quirks()
2675 if (strnstr(ctx->vbios_pn, "D16406", in amdgpu_ras_get_quirks()
2676 sizeof(ctx->vbios_pn)) || in amdgpu_ras_get_quirks()
2677 strnstr(ctx->vbios_pn, "D36002", in amdgpu_ras_get_quirks()
2678 sizeof(ctx->vbios_pn))) in amdgpu_ras_get_quirks()
2679 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_get_quirks()
2693 adev->ras_hw_enabled = adev->ras_enabled = 0; in amdgpu_ras_check_supported()
2698 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { in amdgpu_ras_check_supported()
2700 dev_info(adev->dev, "MEM ECC is active.\n"); in amdgpu_ras_check_supported()
2701 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2704 dev_info(adev->dev, "MEM ECC is not present.\n"); in amdgpu_ras_check_supported()
2708 dev_info(adev->dev, "SRAM ECC is active.\n"); in amdgpu_ras_check_supported()
2710 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2713 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | in amdgpu_ras_check_supported()
2726 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2729 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2736 if (!adev->gmc.xgmi.num_physical_nodes) in amdgpu_ras_check_supported()
2737 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); in amdgpu_ras_check_supported()
2739 dev_info(adev->dev, "SRAM ECC is not present.\n"); in amdgpu_ras_check_supported()
2744 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | in amdgpu_ras_check_supported()
2752 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; in amdgpu_ras_check_supported()
2754 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : in amdgpu_ras_check_supported()
2755 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
2762 struct amdgpu_device *adev = con->adev; in amdgpu_ras_counte_dw()
2767 res = pm_runtime_get_sync(dev->dev); in amdgpu_ras_counte_dw()
2774 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_counte_dw()
2775 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_counte_dw()
2778 pm_runtime_mark_last_busy(dev->dev); in amdgpu_ras_counte_dw()
2780 pm_runtime_put_autosuspend(dev->dev); in amdgpu_ras_counte_dw()
2793 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_ras_query_poison_mode()
2794 adev->gmc.is_app_apu) { in amdgpu_ras_query_poison_mode()
2796 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
2797 } else if (adev->df.funcs && in amdgpu_ras_query_poison_mode()
2798 adev->df.funcs->query_ras_poison_mode && in amdgpu_ras_query_poison_mode()
2799 adev->umc.ras && in amdgpu_ras_query_poison_mode()
2800 adev->umc.ras->query_ras_poison_mode) { in amdgpu_ras_query_poison_mode()
2802 adev->df.funcs->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
2804 adev->umc.ras->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
2808 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
2810 dev_warn(adev->dev, in amdgpu_ras_query_poison_mode()
2837 return -ENOMEM; in amdgpu_ras_init()
2839 con->adev = adev; in amdgpu_ras_init()
2840 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); in amdgpu_ras_init()
2841 atomic_set(&con->ras_ce_count, 0); in amdgpu_ras_init()
2842 atomic_set(&con->ras_ue_count, 0); in amdgpu_ras_init()
2844 con->objs = (struct ras_manager *)(con + 1); in amdgpu_ras_init()
2850 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { in amdgpu_ras_init()
2851 /* set gfx block ras context feature for VEGA20 Gaming in amdgpu_ras_init()
2854 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { in amdgpu_ras_init()
2855 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_init()
2864 con->update_channel_flag = false; in amdgpu_ras_init()
2865 con->features = 0; in amdgpu_ras_init()
2866 con->schema = 0; in amdgpu_ras_init()
2867 INIT_LIST_HEAD(&con->head); in amdgpu_ras_init()
2869 con->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_init()
2878 if (!adev->gmc.xgmi.connected_to_cpu) in amdgpu_ras_init()
2879 adev->nbio.ras = &nbio_v7_4_ras; in amdgpu_ras_init()
2882 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) in amdgpu_ras_init()
2889 adev->nbio.ras = &nbio_v4_3_ras; in amdgpu_ras_init()
2892 if (!adev->gmc.is_app_apu) in amdgpu_ras_init()
2893 adev->nbio.ras = &nbio_v7_9_ras; in amdgpu_ras_init()
2906 if (adev->nbio.ras && in amdgpu_ras_init()
2907 adev->nbio.ras->init_ras_controller_interrupt) { in amdgpu_ras_init()
2908 r = adev->nbio.ras->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
2913 if (adev->nbio.ras && in amdgpu_ras_init()
2914 adev->nbio.ras->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
2915 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
2923 if (adev->smuio.funcs && in amdgpu_ras_init()
2924 adev->smuio.funcs->get_socket_id) in amdgpu_ras_init()
2925 con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 29); in amdgpu_ras_init()
2928 con->schema = amdgpu_get_ras_schema(adev); in amdgpu_ras_init()
2931 r = -EINVAL; in amdgpu_ras_init()
2935 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
2937 adev->ras_hw_enabled, adev->ras_enabled); in amdgpu_ras_init()
2949 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_persistent_edc_harvesting_supported()
2950 adev->gmc.is_app_apu) in amdgpu_persistent_edc_harvesting_supported()
2968 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) in amdgpu_persistent_edc_harvesting()
2981 return con->poison_supported; in amdgpu_ras_is_poison_mode_supported()
2995 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_ras_block_late_init()
3002 if (adev->in_suspend || amdgpu_in_reset(adev)) { in amdgpu_ras_block_late_init()
3014 if (adev->in_suspend || amdgpu_in_reset(adev)) in amdgpu_ras_block_late_init()
3018 if (ras_obj->ras_cb || (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
3019 (ras_obj->hw_ops->query_poison_status || in amdgpu_ras_block_late_init()
3020 ras_obj->hw_ops->handle_poison_consumption))) { in amdgpu_ras_block_late_init()
3026 if (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
3027 (ras_obj->hw_ops->query_ras_error_count || in amdgpu_ras_block_late_init()
3028 ras_obj->hw_ops->query_ras_error_status)) { in amdgpu_ras_block_late_init()
3037 return -ENOMEM; in amdgpu_ras_block_late_init()
3038 memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); in amdgpu_ras_block_late_init()
3041 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_block_late_init()
3042 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_block_late_init()
3051 if (ras_obj->ras_cb) in amdgpu_ras_block_late_init()
3075 if (ras_obj->ras_cb) in amdgpu_ras_block_late_fini()
3093 if (!adev->ras_enabled || !con) { in amdgpu_ras_resume()
3100 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_resume()
3112 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_resume()
3113 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { in amdgpu_ras_resume()
3114 amdgpu_ras_feature_enable(adev, &obj->head, 0); in amdgpu_ras_resume()
3126 if (!adev->ras_enabled || !con) in amdgpu_ras_suspend()
3131 if (con->features) in amdgpu_ras_suspend()
3147 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_late_init()
3148 if (!node->ras_obj) { in amdgpu_ras_late_init()
3149 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_late_init()
3153 obj = node->ras_obj; in amdgpu_ras_late_init()
3154 if (obj->ras_late_init) { in amdgpu_ras_late_init()
3155 r = obj->ras_late_init(adev, &obj->ras_comm); in amdgpu_ras_late_init()
3157 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", in amdgpu_ras_late_init()
3158 obj->ras_comm.name, r); in amdgpu_ras_late_init()
3162 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); in amdgpu_ras_late_init()
3173 if (!adev->ras_enabled || !con) in amdgpu_ras_pre_fini()
3178 if (con->features) in amdgpu_ras_pre_fini()
3190 if (!adev->ras_enabled || !con) in amdgpu_ras_fini()
3193 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { in amdgpu_ras_fini()
3194 if (ras_node->ras_obj) { in amdgpu_ras_fini()
3195 obj = ras_node->ras_obj; in amdgpu_ras_fini()
3196 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && in amdgpu_ras_fini()
3197 obj->ras_fini) in amdgpu_ras_fini()
3198 obj->ras_fini(adev, &obj->ras_comm); in amdgpu_ras_fini()
3200 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); in amdgpu_ras_fini()
3204 list_del(&ras_node->node); in amdgpu_ras_fini()
3211 WARN(con->features, "Feature mask is not cleared"); in amdgpu_ras_fini()
3213 if (con->features) in amdgpu_ras_fini()
3216 cancel_delayed_work_sync(&con->ras_counte_delay_work); in amdgpu_ras_fini()
3229 dev_info(adev->dev, "uncorrectable hardware error" in amdgpu_ras_global_ras_isr()
3232 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_global_ras_isr()
3239 if (adev->asic_type == CHIP_VEGA20 && in amdgpu_ras_need_emergency_restart()
3240 adev->pm.fw_version <= 0x283400) { in amdgpu_ras_need_emergency_restart()
3255 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { in amdgpu_release_ras_context()
3256 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_release_ras_context()
3271 if (adev && adev->gmc.xgmi.connected_to_cpu && in find_adev()
3272 adev->gmc.xgmi.physical_node_id == node_id) in find_adev()
3298 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && in amdgpu_bad_page_notifier()
3299 (XEC(m->status, 0x3f) == 0x0))) in amdgpu_bad_page_notifier()
3311 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; in amdgpu_bad_page_notifier()
3324 umc_inst = GET_UMC_INST(m->ipid); in amdgpu_bad_page_notifier()
3325 ch_inst = GET_CHAN_INDEX(m->ipid); in amdgpu_bad_page_notifier()
3327 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", in amdgpu_bad_page_notifier()
3330 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) in amdgpu_bad_page_notifier()
3369 return adev->psp.ras_context.ras; in amdgpu_ras_get_context()
3375 return -EINVAL; in amdgpu_ras_set_context()
3377 adev->psp.ras_context.ras = ras_con; in amdgpu_ras_set_context()
3381 /* check if ras is supported on block, say, sdma, gfx */
3391 ret = ras && (adev->ras_enabled & (1 << block)); in amdgpu_ras_is_supported()
3393 /* For the special asic with mem ecc enabled but sram ecc in amdgpu_ras_is_supported()
3415 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) in amdgpu_ras_reset_gpu()
3416 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
3428 con->is_mca_debug_mode = enable; in amdgpu_ras_set_mca_debug_mode()
3437 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_get_mca_debug_mode()
3442 if (mca_funcs && mca_funcs->mca_set_debug_mode) in amdgpu_ras_get_mca_debug_mode()
3443 return con->is_mca_debug_mode; in amdgpu_ras_get_mca_debug_mode()
3452 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; in amdgpu_ras_get_error_query_mode()
3459 if (mca_funcs && mca_funcs->mca_set_debug_mode) in amdgpu_ras_get_error_query_mode()
3461 (con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; in amdgpu_ras_get_error_query_mode()
3474 return -EINVAL; in amdgpu_ras_register_ras_block()
3478 return -ENOMEM; in amdgpu_ras_register_ras_block()
3480 INIT_LIST_HEAD(&ras_node->node); in amdgpu_ras_register_ras_block()
3481 ras_node->ras_obj = ras_block_obj; in amdgpu_ras_register_ras_block()
3482 list_add_tail(&ras_node->node, &adev->ras_list); in amdgpu_ras_register_ras_block()
3516 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_memory_id_field()
3517 reg_entry->seg_lo, reg_entry->reg_lo); in amdgpu_ras_inst_get_memory_id_field()
3520 if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && in amdgpu_ras_inst_get_memory_id_field()
3540 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_err_cnt_field()
3541 reg_entry->seg_hi, reg_entry->reg_hi); in amdgpu_ras_inst_get_err_cnt_field()
3544 if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && in amdgpu_ras_inst_get_err_cnt_field()
3547 dev_dbg(adev->dev, "Invalid err_info field\n"); in amdgpu_ras_inst_get_err_cnt_field()
3587 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
3595 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
3631 INIT_LIST_HEAD(&err_data->err_node_list); in amdgpu_ras_error_data_init()
3641 list_del(&err_node->node); in amdgpu_ras_error_node_release()
3649 list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) in amdgpu_ras_error_data_fini()
3663 ref_id = &err_node->err_info.mcm_info; in amdgpu_ras_error_find_node_by_id()
3665 if (mcm_info->socket_id == ref_id->socket_id && in amdgpu_ras_error_find_node_by_id()
3666 mcm_info->die_id == ref_id->die_id) in amdgpu_ras_error_find_node_by_id()
3681 INIT_LIST_HEAD(&err_node->node); in amdgpu_ras_error_node_new()
3690 struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info; in ras_err_info_cmp()
3691 struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info; in ras_err_info_cmp()
3693 if (unlikely(infoa->socket_id != infob->socket_id)) in ras_err_info_cmp()
3694 return infoa->socket_id - infob->socket_id; in ras_err_info_cmp()
3696 return infoa->die_id - infob->die_id; in ras_err_info_cmp()
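ras_err_info_cmp() orders error nodes by socket_id first, then die_id; list_sort() applies it in amdgpu_ras_error_get_info() below to keep err_node_list sorted. The same ordering demonstrated with qsort() on a plain array (illustrative only):

#include <stdio.h>
#include <stdlib.h>

struct mcm { int socket_id, die_id; };

static int cmp(const void *a, const void *b)
{
	const struct mcm *x = a, *y = b;

	if (x->socket_id != y->socket_id)
		return x->socket_id - y->socket_id;
	return x->die_id - y->die_id;
}

int main(void)
{
	struct mcm v[] = { { 1, 0 }, { 0, 1 }, { 0, 0 } };

	qsort(v, 3, sizeof(v[0]), cmp);
	for (int i = 0; i < 3; i++)
		printf("(%d,%d)\n", v[i].socket_id, v[i].die_id);
	return 0;
}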
3709 return &err_node->err_info; in amdgpu_ras_error_get_info()
3715 memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); in amdgpu_ras_error_get_info()
3718 memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr)); in amdgpu_ras_error_get_info()
3720 err_data->err_list_count++; in amdgpu_ras_error_get_info()
3721 list_add_tail(&err_node->node, &err_data->err_node_list); in amdgpu_ras_error_get_info()
3722 list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); in amdgpu_ras_error_get_info()
3724 return &err_node->err_info; in amdgpu_ras_error_get_info()
3734 return -EINVAL; in amdgpu_ras_error_statistic_ue_count()
3741 return -EINVAL; in amdgpu_ras_error_statistic_ue_count()
3743 err_info->ue_count += count; in amdgpu_ras_error_statistic_ue_count()
3744 err_data->ue_count += count; in amdgpu_ras_error_statistic_ue_count()
3756 return -EINVAL; in amdgpu_ras_error_statistic_ce_count()
3763 return -EINVAL; in amdgpu_ras_error_statistic_ce_count()
3765 err_info->ce_count += count; in amdgpu_ras_error_statistic_ce_count()
3766 err_data->ce_count += count; in amdgpu_ras_error_statistic_ce_count()