Merge tag 'v5.3-rc3' into drm-next-5.4
[sfrench/cifs-2.6.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_ras.c
index a6134280b9417f8988beda522cde643b412f2acf..523f43732deeb52e4d299d44d3867925f02fc051 100644 (file)
@@ -62,6 +62,9 @@ const char *ras_block_string[] = {
 #define AMDGPU_RAS_FLAG_INIT_NEED_RESET                2
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
 
+/* inject address is 52 bits */
+#define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
+
 static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
                uint64_t offset, uint64_t size,
                struct amdgpu_bo **bo_ptr);
@@ -155,9 +158,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                        return -EINVAL;
 
                data->head.block = block_id;
-               data->head.type = memcmp("ue", err, 2) == 0 ?
-                       AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
-                       AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+               /* only ue and ce errors are supported */
+               if (!memcmp("ue", err, 2))
+                       data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+               else if (!memcmp("ce", err, 2))
+                       data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+               else
+                       return -EINVAL;
+
                data->op = op;
 
                if (op == 2) {
@@ -242,7 +250,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
        struct ras_debug_if data;
-       struct amdgpu_bo *bo;
        int ret = 0;
 
        ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
@@ -260,17 +267,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
                ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
                break;
        case 2:
-               ret = amdgpu_ras_reserve_vram(adev,
-                               data.inject.address, PAGE_SIZE, &bo);
-               if (ret) {
-                       /* address was offset, now it is absolute.*/
-                       data.inject.address += adev->gmc.vram_start;
-                       if (data.inject.address > adev->gmc.vram_end)
-                               break;
-               } else
-                       data.inject.address = amdgpu_bo_gpu_offset(bo);
+               if ((data.inject.address >= adev->gmc.mc_vram_size) ||
+                   (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+                       ret = -EINVAL;
+                       break;
+               }
+
+               /* data.inject.address is an offset, not an absolute gpu address */
                ret = amdgpu_ras_error_inject(adev, &data.inject);
-               amdgpu_ras_release_vram(adev, &bo);
                break;
        default:
                ret = -EINVAL;
@@ -588,15 +592,24 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
                struct ras_query_if *info)
 {
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
-       struct ras_err_data err_data = {0, 0};
+       struct ras_err_data err_data = {0, 0, 0, NULL};
 
        if (!obj)
                return -EINVAL;
 
        switch (info->head.block) {
        case AMDGPU_RAS_BLOCK__UMC:
-               if (adev->umc_funcs->query_ras_error_count)
-                       adev->umc_funcs->query_ras_error_count(adev, &err_data);
+               if (adev->umc.funcs->query_ras_error_count)
+                       adev->umc.funcs->query_ras_error_count(adev, &err_data);
+               /* umc query_ras_error_address is also responsible for clearing
+                * error status
+                */
+               if (adev->umc.funcs->query_ras_error_address)
+                       adev->umc.funcs->query_ras_error_address(adev, &err_data);
+               break;
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->query_ras_error_count)
+                       adev->gfx.funcs->query_ras_error_count(adev, &err_data);
                break;
        default:
                break;
@@ -635,13 +648,22 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
        if (!obj)
                return -EINVAL;
 
-       if (block_info.block_id != TA_RAS_BLOCK__UMC) {
+       switch (info->head.block) {
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->ras_error_inject)
+                       ret = adev->gfx.funcs->ras_error_inject(adev, info);
+               else
+                       ret = -EINVAL;
+               break;
+       case AMDGPU_RAS_BLOCK__UMC:
+               ret = psp_ras_trigger_error(&adev->psp, &block_info);
+               break;
+       default:
                DRM_INFO("%s error injection is not supported yet\n",
                         ras_block_str(info->head.block));
-               return -EINVAL;
+               ret = -EINVAL;
        }
 
-       ret = psp_ras_trigger_error(&adev->psp, &block_info);
        if (ret)
                DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
                                ras_block_str(info->head.block),
@@ -767,25 +789,18 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
        struct amdgpu_device *adev = ddev->dev_private;
        struct ras_common_if head;
        int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
-       int i;
+       int i, enabled;
        ssize_t s;
-       struct ras_manager *obj;
 
        s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 
        for (i = 0; i < ras_block_count; i++) {
                head.block = i;
+               enabled = amdgpu_ras_is_feature_enabled(adev, &head);
 
-               if (amdgpu_ras_is_feature_enabled(adev, &head)) {
-                       obj = amdgpu_ras_find_obj(adev, &head);
-                       s += scnprintf(&buf[s], PAGE_SIZE - s,
-                                       "%s: %s\n",
-                                       ras_block_str(i),
-                                       ras_err_str(obj->head.type));
-               } else
-                       s += scnprintf(&buf[s], PAGE_SIZE - s,
-                                       "%s: disabled\n",
-                                       ras_block_str(i));
+               s += scnprintf(&buf[s], PAGE_SIZE - s,
+                               "%s ras feature mask: %s\n",
+                               ras_block_str(i), enabled?"on":"off");
        }
 
        return s;
@@ -1005,7 +1020,7 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
        struct ras_ih_data *data = &obj->ih_data;
        struct amdgpu_iv_entry entry;
        int ret;
-       struct ras_err_data err_data = {0, 0};
+       struct ras_err_data err_data = {0, 0, 0, NULL};
 
        while (data->rptr != data->wptr) {
                rmb();
@@ -1020,19 +1035,19 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                 * from the callback to udpate the error type/count, etc
                 */
                if (data->cb) {
-                       ret = data->cb(obj->adev, &entry);
+                       ret = data->cb(obj->adev, &err_data, &entry);
                        /* ue will trigger an interrupt, and in that case
                         * we need do a reset to recovery the whole system.
                         * But leave IP do that recovery, here we just dispatch
                         * the error.
                         */
-                       if (ret == AMDGPU_RAS_UE) {
-                               obj->err_data.ue_count++;
+                       if (ret == AMDGPU_RAS_SUCCESS) {
+                               /* these counts may be left as 0 for
+                                * blocks that do not count errors
+                                */
+                               obj->err_data.ue_count += err_data.ue_count;
+                               obj->err_data.ce_count += err_data.ce_count;
                        }
-                       /* Might need get ce count by register, but not all IP
-                        * saves ce count, some IP just use one bit or two bits
-                        * to indicate ce happened.
-                        */
                }
        }
 }
@@ -1529,6 +1544,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
        if (amdgpu_ras_fs_init(adev))
                goto fs_out;
 
+       /* ras init for each ras block */
+       if (adev->umc.funcs->ras_init)
+               adev->umc.funcs->ras_init(adev);
+
        DRM_INFO("RAS INFO: ras initialized successfully, "
                        "hardware ability[%x] ras_mask[%x]\n",
                        con->hw_supported, con->supported);