Merge tag 'v5.3-rc3' into drm-next-5.4
[sfrench/cifs-2.6.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_ras.c
index fac7aa2c244fa9f80776165e193313d543fb9082..523f43732deeb52e4d299d44d3867925f02fc051 100644 (file)
 #include "amdgpu_ras.h"
 #include "amdgpu_atomfirmware.h"
 
-struct ras_ih_data {
-       /* interrupt bottom half */
-       struct work_struct ih_work;
-       int inuse;
-       /* IP callback */
-       ras_ih_cb cb;
-       /* full of entries */
-       unsigned char *ring;
-       unsigned int ring_size;
-       unsigned int element_size;
-       unsigned int aligned_element_size;
-       unsigned int rptr;
-       unsigned int wptr;
-};
-
-struct ras_fs_data {
-       char sysfs_name[32];
-       char debugfs_name[32];
-};
-
-struct ras_err_data {
-       unsigned long ue_count;
-       unsigned long ce_count;
-};
-
-struct ras_err_handler_data {
-       /* point to bad pages array */
-       struct {
-               unsigned long bp;
-               struct amdgpu_bo *bo;
-       } *bps;
-       /* the count of entries */
-       int count;
-       /* the space can place new entries */
-       int space_left;
-       /* last reserved entry's index + 1 */
-       int last_reserved;
-};
-
-struct ras_manager {
-       struct ras_common_if head;
-       /* reference count */
-       int use;
-       /* ras block link */
-       struct list_head node;
-       /* the device */
-       struct amdgpu_device *adev;
-       /* debugfs */
-       struct dentry *ent;
-       /* sysfs */
-       struct device_attribute sysfs_attr;
-       int attr_inuse;
-
-       /* fs node name */
-       struct ras_fs_data fs_data;
-
-       /* IH data */
-       struct ras_ih_data ih_data;
-
-       struct ras_err_data err_data;
-};
-
-struct ras_badpage {
-       unsigned int bp;
-       unsigned int size;
-       unsigned int flags;
-};
-
 const char *ras_error_string[] = {
        "none",
        "parity",
@@ -130,6 +62,9 @@ const char *ras_block_string[] = {
 #define AMDGPU_RAS_FLAG_INIT_NEED_RESET                2
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
 
+/* inject address is 52 bits */
+#define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
+
 static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
                uint64_t offset, uint64_t size,
                struct amdgpu_bo **bo_ptr);
@@ -223,9 +158,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                        return -EINVAL;
 
                data->head.block = block_id;
-               data->head.type = memcmp("ue", err, 2) == 0 ?
-                       AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
-                       AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+               /* only ue and ce errors are supported */
+               if (!memcmp("ue", err, 2))
+                       data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+               else if (!memcmp("ce", err, 2))
+                       data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+               else
+                       return -EINVAL;
+
                data->op = op;
 
                if (op == 2) {
@@ -310,7 +250,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
        struct ras_debug_if data;
-       struct amdgpu_bo *bo;
        int ret = 0;
 
        ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
@@ -328,17 +267,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
                ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
                break;
        case 2:
-               ret = amdgpu_ras_reserve_vram(adev,
-                               data.inject.address, PAGE_SIZE, &bo);
-               if (ret) {
-                       /* address was offset, now it is absolute.*/
-                       data.inject.address += adev->gmc.vram_start;
-                       if (data.inject.address > adev->gmc.vram_end)
-                               break;
-               } else
-                       data.inject.address = amdgpu_bo_gpu_offset(bo);
+               if ((data.inject.address >= adev->gmc.mc_vram_size) ||
+                   (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+                       ret = -EINVAL;
+                       break;
+               }
+
+               /* data.inject.address is offset instead of absolute gpu address */
                ret = amdgpu_ras_error_inject(adev, &data.inject);
-               amdgpu_ras_release_vram(adev, &bo);
                break;
        default:
                ret = -EINVAL;
@@ -656,14 +592,42 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
                struct ras_query_if *info)
 {
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+       struct ras_err_data err_data = {0, 0, 0, NULL};
 
        if (!obj)
                return -EINVAL;
-       /* TODO might read the register to read the count */
+
+       switch (info->head.block) {
+       case AMDGPU_RAS_BLOCK__UMC:
+               if (adev->umc.funcs->query_ras_error_count)
+                       adev->umc.funcs->query_ras_error_count(adev, &err_data);
+               /* umc query_ras_error_address is also responsible for clearing
+                * error status
+                */
+               if (adev->umc.funcs->query_ras_error_address)
+                       adev->umc.funcs->query_ras_error_address(adev, &err_data);
+               break;
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->query_ras_error_count)
+                       adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+               break;
+       default:
+               break;
+       }
+
+       obj->err_data.ue_count += err_data.ue_count;
+       obj->err_data.ce_count += err_data.ce_count;
 
        info->ue_count = obj->err_data.ue_count;
        info->ce_count = obj->err_data.ce_count;
 
+       if (err_data.ce_count)
+               dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
+                        obj->err_data.ce_count, ras_block_str(info->head.block));
+       if (err_data.ue_count)
+               dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
+                        obj->err_data.ue_count, ras_block_str(info->head.block));
+
        return 0;
 }
 
@@ -684,13 +648,22 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
        if (!obj)
                return -EINVAL;
 
-       if (block_info.block_id != TA_RAS_BLOCK__UMC) {
+       switch (info->head.block) {
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->ras_error_inject)
+                       ret = adev->gfx.funcs->ras_error_inject(adev, info);
+               else
+                       ret = -EINVAL;
+               break;
+       case AMDGPU_RAS_BLOCK__UMC:
+               ret = psp_ras_trigger_error(&adev->psp, &block_info);
+               break;
+       default:
                DRM_INFO("%s error injection is not supported yet\n",
                         ras_block_str(info->head.block));
-               return -EINVAL;
+               ret = -EINVAL;
        }
 
-       ret = psp_ras_trigger_error(&adev->psp, &block_info);
        if (ret)
                DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
                                ras_block_str(info->head.block),
@@ -816,25 +789,18 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
        struct amdgpu_device *adev = ddev->dev_private;
        struct ras_common_if head;
        int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
-       int i;
+       int i, enabled;
        ssize_t s;
-       struct ras_manager *obj;
 
        s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 
        for (i = 0; i < ras_block_count; i++) {
                head.block = i;
+               enabled = amdgpu_ras_is_feature_enabled(adev, &head);
 
-               if (amdgpu_ras_is_feature_enabled(adev, &head)) {
-                       obj = amdgpu_ras_find_obj(adev, &head);
-                       s += scnprintf(&buf[s], PAGE_SIZE - s,
-                                       "%s: %s\n",
-                                       ras_block_str(i),
-                                       ras_err_str(obj->head.type));
-               } else
-                       s += scnprintf(&buf[s], PAGE_SIZE - s,
-                                       "%s: disabled\n",
-                                       ras_block_str(i));
+               s += scnprintf(&buf[s], PAGE_SIZE - s,
+                               "%s ras feature mask: %s\n",
+                               ras_block_str(i), enabled?"on":"off");
        }
 
        return s;
@@ -1054,6 +1020,7 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
        struct ras_ih_data *data = &obj->ih_data;
        struct amdgpu_iv_entry entry;
        int ret;
+       struct ras_err_data err_data = {0, 0, 0, NULL};
 
        while (data->rptr != data->wptr) {
                rmb();
@@ -1068,19 +1035,19 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                 * from the callback to udpate the error type/count, etc
                 */
                if (data->cb) {
-                       ret = data->cb(obj->adev, &entry);
+                       ret = data->cb(obj->adev, &err_data, &entry);
                        /* ue will trigger an interrupt, and in that case
                         * we need do a reset to recovery the whole system.
                         * But leave IP do that recovery, here we just dispatch
                         * the error.
                         */
-                       if (ret == AMDGPU_RAS_UE) {
-                               obj->err_data.ue_count++;
+                       if (ret == AMDGPU_RAS_SUCCESS) {
+                               /* these counts could be left as 0 if
+                                * some blocks do not count error number
+                                */
+                               obj->err_data.ue_count += err_data.ue_count;
+                               obj->err_data.ce_count += err_data.ce_count;
                        }
-                       /* Might need get ce count by register, but not all IP
-                        * saves ce count, some IP just use one bit or two bits
-                        * to indicate ce happened.
-                        */
                }
        }
 }
@@ -1577,6 +1544,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
        if (amdgpu_ras_fs_init(adev))
                goto fs_out;
 
+       /* ras init for each ras block */
+       if (adev->umc.funcs->ras_init)
+               adev->umc.funcs->ras_init(adev);
+
        DRM_INFO("RAS INFO: ras initialized successfully, "
                        "hardware ability[%x] ras_mask[%x]\n",
                        con->hw_supported, con->supported);