drm/amdgpu: schedule ras recovery when reaching bad page threshold(v2)

author Guchun Chen <guchun.chen@amd.com>

Thu, 23 Jul 2020 08:05:00 +0000 (16:05 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 4 Aug 2020 21:26:46 +0000 (17:26 -0400)
author Guchun Chen <guchun.chen@amd.com>
Thu, 23 Jul 2020 08:05:00 +0000 (16:05 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 4 Aug 2020 21:26:46 +0000 (17:26 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

index da8b35a5b41ca1b926606a248089b34e9c56dd23..461dfd22bc1c7ab08295ca15fee17b5efc02230c 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
         int i, ret = 0;
         struct i2c_msg *msgs, *msg;
         unsigned char *buffs, *buff;
+       bool sched_ras_recovery = false;
         struct eeprom_table_record *record;
         struct amdgpu_device *adev = to_amdgpu_device(control);
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
  
         if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
                 return 0;
@@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
                 goto free_buff;
         }
  
+       /*
+        * If the number of saved bad pages reaches the bad page threshold
+        * for the whole VRAM, update the table header to mark the BAD GPU
+        * tag and schedule one ras recovery after the eeprom write is done,
+        * so that the latest records are not lost.
+        *
+        * This new header will be picked up and checked during bootup
+        * by ras recovery, which may break the bootup process to notify
+        * the user that this GPU is in a bad state and should be retired
+        * for further checks.
+        */
+       if (write && (amdgpu_bad_page_threshold != 0) &&
+               ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
+               dev_warn(adev->dev,
+                       "Saved bad pages(%d) reaches threshold value(%d).\n",
+                       control->num_recs + num, ras->bad_page_cnt_threshold);
+               control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
+               sched_ras_recovery = true;
+       }
+
         /* In case of overflow just start from beginning to not lose newest records */
         if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
                 control->next_addr = EEPROM_RECORD_START;
  
-
         /*
          * TODO Currently makes EEPROM writes for each record, this creates
          * internal fragmentation. Optimized the code to do full page write of
@@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
                 __update_tbl_checksum(control, records, num, old_hdr_byte_sum);
  
                 __update_table_header(control, buffs);
+
+               if (sched_ras_recovery) {
+                       /*
+                        * Before scheduling ras recovery, set the related
+                        * flag first, so that the common bad page reservation
+                        * in amdgpu_ras_reset_gpu is bypassed.
+                        */
+                       amdgpu_ras_get_context(adev)->flags |=
+                               AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
+
+                       dev_warn(adev->dev, "Conduct ras recovery due to bad "
+                               "page threshold reached.\n");
+                       amdgpu_ras_reset_gpu(adev);
+               }
         } else if (!__validate_tbl_checksum(control, records, num)) {
                 DRM_WARN("EEPROM Table checksum mismatch!");
                 /* TODO Uncomment when EEPROM read/write is relliable */
author	Guchun Chen <guchun.chen@amd.com>
	Thu, 23 Jul 2020 08:05:00 +0000 (16:05 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 4 Aug 2020 21:26:46 +0000 (17:26 -0400)