#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
+/* inject address is 52 bits */
+#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
uint64_t offset, uint64_t size,
struct amdgpu_bo **bo_ptr);
return -EINVAL;
data->head.block = block_id;
- data->head.type = memcmp("ue", err, 2) == 0 ?
- AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
- AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+ /* only ue and ce errors are supported */
+ if (!memcmp("ue", err, 2))
+ data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+ else if (!memcmp("ce", err, 2))
+ data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+ else
+ return -EINVAL;
+
data->op = op;
if (op == 2) {
{
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
struct ras_debug_if data;
- struct amdgpu_bo *bo;
int ret = 0;
ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
break;
case 2:
- ret = amdgpu_ras_reserve_vram(adev,
- data.inject.address, PAGE_SIZE, &bo);
- if (ret) {
- /* address was offset, now it is absolute.*/
- data.inject.address += adev->gmc.vram_start;
- if (data.inject.address > adev->gmc.vram_end)
- break;
- } else
- data.inject.address = amdgpu_bo_gpu_offset(bo);
+ if ((data.inject.address >= adev->gmc.mc_vram_size) ||
+ (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* data.inject.address is offset instead of absolute gpu address */
ret = amdgpu_ras_error_inject(adev, &data.inject);
- amdgpu_ras_release_vram(adev, &bo);
break;
default:
ret = -EINVAL;
struct ras_query_if *info)
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
- struct ras_err_data err_data = {0, 0};
+ struct ras_err_data err_data = {0, 0, 0, NULL};
if (!obj)
return -EINVAL;
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
- if (adev->umc_funcs->query_ras_error_count)
- adev->umc_funcs->query_ras_error_count(adev, &err_data);
+ if (adev->umc.funcs->query_ras_error_count)
+ adev->umc.funcs->query_ras_error_count(adev, &err_data);
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ if (adev->umc.funcs->query_ras_error_address)
+ adev->umc.funcs->query_ras_error_address(adev, &err_data);
+ break;
+ case AMDGPU_RAS_BLOCK__GFX:
+ if (adev->gfx.funcs->query_ras_error_count)
+ adev->gfx.funcs->query_ras_error_count(adev, &err_data);
break;
default:
break;
if (!obj)
return -EINVAL;
- if (block_info.block_id != TA_RAS_BLOCK__UMC) {
+ switch (info->head.block) {
+ case AMDGPU_RAS_BLOCK__GFX:
+ if (adev->gfx.funcs->ras_error_inject)
+ ret = adev->gfx.funcs->ras_error_inject(adev, info);
+ else
+ ret = -EINVAL;
+ break;
+ case AMDGPU_RAS_BLOCK__UMC:
+ ret = psp_ras_trigger_error(&adev->psp, &block_info);
+ break;
+ default:
DRM_INFO("%s error injection is not supported yet\n",
ras_block_str(info->head.block));
- return -EINVAL;
+ ret = -EINVAL;
}
- ret = psp_ras_trigger_error(&adev->psp, &block_info);
if (ret)
DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
ras_block_str(info->head.block),
struct amdgpu_device *adev = ddev->dev_private;
struct ras_common_if head;
int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
- int i;
+ int i, enabled;
ssize_t s;
- struct ras_manager *obj;
s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
for (i = 0; i < ras_block_count; i++) {
head.block = i;
+ enabled = amdgpu_ras_is_feature_enabled(adev, &head);
- if (amdgpu_ras_is_feature_enabled(adev, &head)) {
- obj = amdgpu_ras_find_obj(adev, &head);
- s += scnprintf(&buf[s], PAGE_SIZE - s,
- "%s: %s\n",
- ras_block_str(i),
- ras_err_str(obj->head.type));
- } else
- s += scnprintf(&buf[s], PAGE_SIZE - s,
- "%s: disabled\n",
- ras_block_str(i));
+ s += scnprintf(&buf[s], PAGE_SIZE - s,
+ "%s ras feature mask: %s\n",
+ ras_block_str(i), enabled?"on":"off");
}
return s;
struct ras_ih_data *data = &obj->ih_data;
struct amdgpu_iv_entry entry;
int ret;
- struct ras_err_data err_data = {0, 0};
+ struct ras_err_data err_data = {0, 0, 0, NULL};
while (data->rptr != data->wptr) {
rmb();
* from the callback to udpate the error type/count, etc
*/
if (data->cb) {
- ret = data->cb(obj->adev, &entry);
+ ret = data->cb(obj->adev, &err_data, &entry);
/* ue will trigger an interrupt, and in that case
* we need do a reset to recovery the whole system.
* But leave IP do that recovery, here we just dispatch
* the error.
*/
- if (ret == AMDGPU_RAS_UE) {
- obj->err_data.ue_count++;
+ if (ret == AMDGPU_RAS_SUCCESS) {
+ /* these counts could be left as 0 if
+ * some blocks do not count error number
+ */
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
}
- /* Might need get ce count by register, but not all IP
- * saves ce count, some IP just use one bit or two bits
- * to indicate ce happened.
- */
}
}
}
if (amdgpu_ras_fs_init(adev))
goto fs_out;
+ /* ras init for each ras block */
+ if (adev->umc.funcs->ras_init)
+ adev->umc.funcs->ras_init(adev);
+
DRM_INFO("RAS INFO: ras initialized successfully, "
"hardware ability[%x] ras_mask[%x]\n",
con->hw_supported, con->supported);