/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>
#include <linux/stop_machine.h>

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_reset.h"

#include "intel_guc.h"

#define RESET_MAX_RETRIES 3

/* XXX How to handle concurrent GGTT updates using tiling registers? */
#define RESET_UNDER_STOP_MACHINE 0

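/*
 * A request from the hung context has been identified as the guilty one;
 * mark the remaining requests it submitted to this engine, and everything
 * still queued on its own timeline, with -EIO so that execution resumes
 * with the innocent requests that follow.
 */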
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct i915_gem_context *hung_ctx = rq->gem_context;
	struct i915_timeline *timeline = rq->timeline;

	lockdep_assert_held(&engine->timeline.lock);
	GEM_BUG_ON(timeline == &engine->timeline);

	spin_lock(&timeline->lock);

	if (i915_request_is_active(rq)) {
		list_for_each_entry_continue(rq,
					     &engine->timeline.requests, link)
			if (rq->gem_context == hung_ctx)
				i915_request_skip(rq, -EIO);
	}

	list_for_each_entry(rq, &timeline->requests, link)
		i915_request_skip(rq, -EIO);

	spin_unlock(&timeline->lock);
}

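/*
 * Propagate the context's misbehaviour to the owning client: a banned
 * context, or hangs arriving within I915_CLIENT_FAST_HANG_JIFFIES of each
 * other, add to the file's ban_score.
 */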
static void client_mark_guilty(struct drm_i915_file_private *file_priv,
			       const struct i915_gem_context *ctx)
{
	unsigned int score;
	unsigned long prev_hang;

	if (i915_gem_context_is_banned(ctx))
		score = I915_CLIENT_SCORE_CONTEXT_BAN;
	else
		score = 0;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
				 ctx->name, score,
				 atomic_read(&file_priv->ban_score));
	}
}

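/*
 * Record a hang against the context: each hang adds CONTEXT_SCORE_GUILTY to
 * its ban_score, and once that crosses CONTEXT_SCORE_BAN_THRESHOLD a
 * bannable context is banned. The owning client is charged as well.
 * Returns true if the context is now banned.
 */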
static bool context_mark_guilty(struct i915_gem_context *ctx)
{
	unsigned int score;
	bool banned, bannable;

	atomic_inc(&ctx->guilty_count);

	bannable = i915_gem_context_is_bannable(ctx);
	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;

	/* Cool contexts don't accumulate client ban score */
	if (!bannable)
		return false;

	if (banned) {
		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
				 ctx->name, atomic_read(&ctx->guilty_count),
				 score);
		i915_gem_context_set_banned(ctx);
	}

	if (!IS_ERR_OR_NULL(ctx->file_priv))
		client_mark_guilty(ctx->file_priv, ctx);

	return banned;
}

static void context_mark_innocent(struct i915_gem_context *ctx)
{
	atomic_inc(&ctx->active_count);
}

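/*
 * Called under the engine timeline lock while resetting: a guilty request
 * is skipped with -EIO (and, if its context is now banned, so is the rest
 * of that context's queue), whereas an innocent request merely has -EAGAIN
 * recorded on its fence and its context marked innocent.
 */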
void i915_reset_request(struct i915_request *rq, bool guilty)
{
	lockdep_assert_held(&rq->engine->timeline.lock);
	GEM_BUG_ON(i915_request_completed(rq));

	if (guilty) {
		i915_request_skip(rq, -EIO);
		if (context_mark_guilty(rq->gem_context))
			engine_skip_context(rq);
	} else {
		dma_fence_set_error(&rq->fence, -EAGAIN);
		context_mark_innocent(rq->gem_context);
	}
}

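/*
 * Park the ring before reset: stop the command streamer, make the ring
 * appear empty by setting HEAD to TAIL, then clear HEAD/TAIL and disable
 * the ring via RING_CTL.
 */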
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	const u32 base = engine->mmio_base;

	if (intel_engine_stop_cs(engine))
		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);

	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */

	I915_WRITE_FW(RING_HEAD(base), 0);
	I915_WRITE_FW(RING_TAIL(base), 0);
	POSTING_READ_FW(RING_TAIL(base));

	/* The ring must be empty before it is disabled */
	I915_WRITE_FW(RING_CTL(base), 0);

	/* Check acts as a post */
	if (I915_READ_FW(RING_HEAD(base)) != 0)
		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
				 engine->name);
}

static void i915_stop_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	if (INTEL_GEN(i915) < 3)
		return;

	for_each_engine_masked(engine, i915, engine_mask, id)
		gen3_stop_engine(engine);
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

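/*
 * Full-chip reset through the GDRST byte in PCI config space, used on older
 * (gen3/4) platforms not handled by the G4x/G33 paths: assert the reset
 * request, wait for the status bit, then clear the request and wait for it
 * to deassert.
 */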
static int i915_do_reset(struct drm_i915_private *i915,
			 unsigned int engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct drm_i915_private *i915,
			unsigned int engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

static int g4x_do_reset(struct drm_i915_private *dev_priv,
			unsigned int engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = dev_priv->drm.pdev;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	I915_WRITE_FW(VDECCLK_GATE_D,
		      I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ_FW(VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	I915_WRITE_FW(VDECCLK_GATE_D,
		      I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ_FW(VDECCLK_GATE_D);

	return ret;
}

static int ironlake_do_reset(struct drm_i915_private *dev_priv,
			     unsigned int engine_mask,
			     unsigned int retry)
{
	int ret;

	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

out:
	I915_WRITE_FW(ILK_GDSR, 0);
	POSTING_READ_FW(ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
				u32 hw_domain_mask)
{
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(dev_priv,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
				 hw_domain_mask);

	return err;
}

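/*
 * Translate the engine mask into the matching GEN6_GRDOM_* reset domains
 * (or GEN6_GRDOM_FULL for ALL_ENGINES) and hand the result to
 * gen6_hw_domain_reset(); used directly on gen6/7 and by the gen8+ path
 * below.
 */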
static int gen6_reset_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
		[RCS] = GEN6_GRDOM_RENDER,
		[BCS] = GEN6_GRDOM_BLT,
		[VCS] = GEN6_GRDOM_MEDIA,
		[VCS2] = GEN8_GRDOM_MEDIA2,
		[VECS] = GEN6_GRDOM_VECS,
	};
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		unsigned int tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp)
			hw_mask |= hw_engine_mask[engine->id];
	}

	return gen6_hw_domain_reset(i915, hw_mask);
}

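/*
 * On gen11 the SFC (Scaler and Format Converter) units are shared between
 * video decode and video enhancement engines. Before resetting an engine
 * that may be using an SFC, force-lock the SFC and, if the engine is indeed
 * using it, add the SFC to the reset domain so it is reset along with the
 * engine. The lock is dropped again in gen11_unlock_sfc().
 */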
static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
			  struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit = 0;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * Tell the engine that a software reset is going to happen. The engine
	 * will then try to force lock the SFC (if currently locked, it will
	 * remain so until we tell the engine it is safe to unlock; if currently
	 * unlocked, it will ignore this and all new lock requests). If SFC
	 * ends up being locked to the engine we want to reset, we have to reset
	 * it as well (we will unlock it once the reset sequence is completed).
	 */
	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);

	if (__intel_wait_for_register_fw(dev_priv,
					 sfc_forced_lock_ack,
					 sfc_forced_lock_ack_bit,
					 sfc_forced_lock_ack_bit,
					 1000, 0, NULL)) {
		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
		return 0;
	}

	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
		return sfc_reset_bit;

	return 0;
}

static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
			     struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
}

static int gen11_reset_engines(struct drm_i915_private *i915,
			       unsigned int engine_mask,
			       unsigned int retry)
{
	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
		[RCS] = GEN11_GRDOM_RENDER,
		[BCS] = GEN11_GRDOM_BLT,
		[VCS] = GEN11_GRDOM_MEDIA,
		[VCS2] = GEN11_GRDOM_MEDIA2,
		[VCS3] = GEN11_GRDOM_MEDIA3,
		[VCS4] = GEN11_GRDOM_MEDIA4,
		[VECS] = GEN11_GRDOM_VECS,
		[VECS2] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	unsigned int tmp;
	u32 hw_mask;
	int ret;

	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			hw_mask |= hw_engine_mask[engine->id];
			hw_mask |= gen11_lock_sfc(i915, engine);
		}
	}

	ret = gen6_hw_domain_reset(i915, hw_mask);

	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, i915, engine_mask, tmp)
			gen11_unlock_sfc(i915, engine);

	return ret;
}

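/*
 * gen8+ adds a per-engine handshake on top of the domain reset: request a
 * reset via RING_RESET_CTL and wait for the engine to signal
 * READY_TO_RESET before the reset is issued, then clear the request again
 * afterwards.
 */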
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));

	ret = __intel_wait_for_register_fw(dev_priv,
					   RING_RESET_CTL(engine->mmio_base),
					   RESET_CTL_READY_TO_RESET,
					   RESET_CTL_READY_TO_RESET,
					   700, 0,
					   NULL);
	if (ret)
		DRM_ERROR("%s: reset request timeout\n", engine->name);

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	unsigned int tmp;
	int ret;

	for_each_engine_masked(engine, i915, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We would rather take context corruption than a
		 * failed reset with a wedged driver/gpu. And the
		 * active bb execution case should be covered by
		 * the i915_stop_engines we have before the reset.
		 */
	}

	if (INTEL_GEN(i915) >= 11)
		ret = gen11_reset_engines(i915, engine_mask, retry);
	else
		ret = gen6_reset_engines(i915, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, i915, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}

typedef int (*reset_func)(struct drm_i915_private *,
			  unsigned int engine_mask,
			  unsigned int retry);

static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
{
	if (!i915_modparams.reset)
		return NULL;

	if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ironlake_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

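/*
 * Attempt a GPU reset using the method selected for this platform. A full
 * (ALL_ENGINES) reset is retried up to RESET_MAX_RETRIES times while the
 * hardware fails to acknowledge the reset in time.
 */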
int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(i915);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		/*
		 * We stop engines, otherwise we might get failed reset and a
		 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
		 * from a system hang if a batchbuffer is progressing when
		 * the reset is issued, regardless of the READY_TO_RESET ack.
		 * Thus assume it is best to stop engines on all gens
		 * where we have a gpu reset.
		 *
		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
		 *
		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
		 *
		 * FIXME: Wa for more modern gens needs to be validated
		 */
		i915_stop_engines(i915, engine_mask);

		GEM_TRACE("engine_mask=%x\n", engine_mask);
		preempt_disable();
		ret = reset(i915, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(struct drm_i915_private *i915)
{
	if (USES_GUC(i915))
		return false;

	return intel_get_gpu_reset(i915);
}

bool intel_has_reset_engine(struct drm_i915_private *i915)
{
	return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
}

int intel_reset_guc(struct drm_i915_private *i915)
{
	u32 guc_domain =
		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GUC(i915));

	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(i915, guc_domain);
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}

/*
 * Ensure the irq handler finishes, and is not run again.
 */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
	engine->reset.prepare(engine);
}

static void reset_prepare(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		reset_prepare_engine(engine);

	intel_uc_sanitize(i915);
}

static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(i915);
	if (err)
		return err;

	for_each_engine(engine, i915, id)
		intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));

	i915_gem_restore_fences(i915);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
}

struct i915_gpu_restart {
	struct work_struct work;
	struct drm_i915_private *i915;
};

static void restart_work(struct work_struct *work)
{
	struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
	struct drm_i915_private *i915 = arg->i915;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;

	wakeref = intel_runtime_pm_get(i915);
	mutex_lock(&i915->drm.struct_mutex);
	WRITE_ONCE(i915->gpu_error.restart, NULL);

	for_each_engine(engine, i915, id) {
		struct i915_request *rq;

		/*
		 * Ostensibly, we always want a context loaded for powersaving,
		 * so if the engine is idle after the reset, send a request
		 * to load our scratch kernel_context.
		 */
		if (!intel_engine_is_idle(engine))
			continue;

		rq = i915_request_alloc(engine, i915->kernel_context);
		if (!IS_ERR(rq))
			i915_request_add(rq);
	}

	mutex_unlock(&i915->drm.struct_mutex);
	intel_runtime_pm_put(i915, wakeref);

	kfree(arg);
}

static void reset_finish(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		reset_finish_engine(engine);
}

static void reset_restart(struct drm_i915_private *i915)
{
	struct i915_gpu_restart *arg;

	/*
	 * Following the reset, ensure that we always reload context for
	 * powersaving, and to correct engine->last_retired_context. Since
	 * this requires us to submit a request, queue a worker to do that
	 * task for us to evade any locking here.
	 */
	if (READ_ONCE(i915->gpu_error.restart))
		return;

	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
	if (arg) {
		arg->i915 = i915;
		INIT_WORK(&arg->work, restart_work);

		WRITE_ONCE(i915->gpu_error.restart, arg);
		queue_work(i915->wq, &arg->work);
	}
}

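/*
 * Once the device has been declared wedged, submission is replaced by this
 * nop: each request is marked as failed with -EIO and completed immediately,
 * and its breadcrumbs are signalled so that waiters are released rather than
 * left blocking on a dead GPU.
 */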
static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
		  engine->name, request->fence.context, request->fence.seqno);
	dma_fence_set_error(&request->fence, -EIO);

	spin_lock_irqsave(&engine->timeline.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	intel_engine_write_global_seqno(engine, request->global_seqno);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);

	intel_engine_queue_breadcrumbs(engine);
}

void i915_gem_set_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	mutex_lock(&error->wedge_mutex);
	if (test_bit(I915_WEDGED, &error->flags)) {
		mutex_unlock(&error->wedge_mutex);
		return;
	}

	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
		struct drm_printer p = drm_debug_printer(__func__);

		for_each_engine(engine, i915, id)
			intel_engine_dump(engine, &p, "%s\n", engine->name);
	}

	GEM_TRACE("start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	for_each_engine(engine, i915, id)
		reset_prepare_engine(engine);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (INTEL_GEN(i915) >= 5)
		intel_gpu_reset(i915, ALL_ENGINES);

	for_each_engine(engine, i915, id) {
		engine->submit_request = nop_submit_request;
		engine->schedule = NULL;
	}
	i915->caps.scheduler = 0;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu();

	/* Mark all executing requests as skipped */
	for_each_engine(engine, i915, id)
		engine->cancel_requests(engine);

	for_each_engine(engine, i915, id) {
		reset_finish_engine(engine);
		intel_engine_signal_breadcrumbs(engine);
	}

	smp_mb__before_atomic();
	set_bit(I915_WEDGED, &error->flags);

	GEM_TRACE("end\n");
	mutex_unlock(&error->wedge_mutex);

	wake_up_all(&error->reset_queue);
}

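/*
 * Attempt to recover from a wedged state: wait for all outstanding requests
 * to be flushed with -EIO, sanitize the engines and restore the default
 * submission backend before clearing I915_WEDGED. Returns false if recovery
 * is not possible (the GPU was never fully initialised, or the wait was
 * interrupted).
 */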
bool i915_gem_unset_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct i915_timeline *tl;
	bool ret = false;

	if (!test_bit(I915_WEDGED, &error->flags))
		return true;

	if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
		return false;

	mutex_lock(&error->wedge_mutex);

	GEM_TRACE("start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	mutex_lock(&i915->gt.timelines.mutex);
	list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
		struct i915_request *rq;
		long timeout;

		rq = i915_active_request_get_unlocked(&tl->last_request);
		if (!rq)
			continue;

		/*
		 * We can't use our normal waiter as we want to
		 * avoid recursively trying to handle the current
		 * reset. The basic dma_fence_default_wait() installs
		 * a callback for dma_fence_signal(), which is
		 * triggered by our nop handler (indirectly, the
		 * callback enables the signaler thread which is
		 * woken by the nop_submit_request() advancing the seqno
		 * and when the seqno passes the fence, the signaler
		 * then signals the fence waking us up).
		 */
		timeout = dma_fence_default_wait(&rq->fence, true,
						 MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);
		if (timeout < 0) {
			mutex_unlock(&i915->gt.timelines.mutex);
			goto unlock;
		}
	}
	mutex_unlock(&i915->gt.timelines.mutex);

	intel_engines_sanitize(i915, false);

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(i915);

	GEM_TRACE("end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
	ret = true;
unlock:
	mutex_unlock(&i915->gpu_error.wedge_mutex);

	return ret;
}

struct __i915_reset {
	struct drm_i915_private *i915;
	unsigned int stalled_mask;
};

static int __i915_reset__BKL(void *data)
{
	struct __i915_reset *arg = data;
	int err;

	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
	if (err)
		return err;

	return gt_reset(arg->i915, arg->stalled_mask);
}

#if RESET_UNDER_STOP_MACHINE
/*
 * XXX An alternative to using stop_machine would be to park only the
 * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
 * we should be able to prevent their memory accesses via the lost fence
 * registers over the course of the reset without the potential recursive
 * locking of mutexes between the pagefault handler and reset.
 *
 * See igt/gem_mmap_gtt/hang
 */
#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
#else
#define __do_reset(fn, arg) fn(arg)
#endif

static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
	struct __i915_reset arg = { i915, stalled_mask };
	int err, i;

	err = __do_reset(__i915_reset__BKL, &arg);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(100);
		err = __do_reset(__i915_reset__BKL, &arg);
	}

	return err;
}

/**
 * i915_reset - reset chip after a hang
 * @i915: #drm_i915_private to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Caller must hold the struct_mutex.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void i915_reset(struct drm_i915_private *i915,
		unsigned int stalled_mask,
		const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int ret;

	GEM_TRACE("flags=%lx\n", error->flags);

	might_sleep();
	assert_rpm_wakelock_held(i915);
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!i915_gem_unset_wedged(i915))
		return;

	if (reason)
		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
	error->reset_count++;

	reset_prepare(i915);

	if (!intel_has_gpu_reset(i915)) {
		if (i915_modparams.reset)
			dev_err(i915->drm.dev, "GPU reset not supported\n");
		else
			DRM_DEBUG_DRIVER("GPU reset disabled\n");
		goto error;
	}

	if (do_reset(i915, stalled_mask)) {
		dev_err(i915->drm.dev, "Failed to reset chip\n");
		goto taint;
	}

	intel_overlay_reset(i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = i915_gem_init_hw(i915);
	if (ret) {
		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
			  ret);
		goto error;
	}

	i915_queue_hangcheck(i915);

finish:
	reset_finish(i915);
	if (!i915_terminally_wedged(error))
		reset_restart(i915);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
	i915_gem_set_wedged(i915);
	goto finish;
}

static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
					struct intel_engine_cs *engine)
{
	return intel_gpu_reset(i915, intel_engine_flag(engine));
}

/**
 * i915_reset_engine - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no dev_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identify the request that caused the hang and drop it
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
	struct i915_gpu_error *error = &engine->i915->gpu_error;
	int ret;

	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));

	reset_prepare_engine(engine);

	if (msg)
		dev_notice(engine->i915->drm.dev,
			   "Resetting %s for %s\n", engine->name, msg);
	error->reset_engine_count[engine->id]++;

	if (!engine->i915->guc.execbuf_client)
		ret = intel_gt_reset_engine(engine->i915, engine);
	else
		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
				 engine->i915->guc.execbuf_client ? "GuC " : "",
				 engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = engine->init_hw(engine);
	if (ret)
		goto out;

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	return ret;
}

static void i915_reset_device(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct i915_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	DRM_DEBUG_DRIVER("resetting chip\n");
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
		intel_prepare_reset(i915);

		i915_reset(i915, engine_mask, reason);

		intel_finish_reset(i915);
	}

	if (!test_bit(I915_WEDGED, &error->flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

void i915_clear_error_registers(struct drm_i915_private *dev_priv)
{
	u32 eir;

	if (!IS_GEN(dev_priv, 2))
		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));

	if (INTEL_GEN(dev_priv) < 4)
		I915_WRITE(IPEIR, I915_READ(IPEIR));
	else
		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));

	I915_WRITE(EIR, I915_READ(EIR));
	eir = I915_READ(EIR);
	if (eir) {
		/*
		 * Some errors might have become stuck,
		 * mask them.
		 */
		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
		I915_WRITE(EMR, I915_READ(EMR) | eir);
		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
	}

	if (INTEL_GEN(dev_priv) >= 8) {
		I915_WRITE(GEN8_RING_FAULT_REG,
			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
		POSTING_READ(GEN8_RING_FAULT_REG);
	} else if (INTEL_GEN(dev_priv) >= 6) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, dev_priv, id) {
			I915_WRITE(RING_FAULT_REG(engine),
				   I915_READ(RING_FAULT_REG(engine)) &
				   ~RING_FAULT_VALID);
		}
		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
	}
}

/**
 * i915_handle_error - handle a gpu error
 * @i915: i915 device private
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void i915_handle_error(struct drm_i915_private *i915,
		       u32 engine_mask,
		       unsigned long flags,
		       const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	unsigned int tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(i915);

	engine_mask &= INTEL_INFO(i915)->ring_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(i915, engine_mask, msg);
		i915_clear_error_registers(i915);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(i915) &&
	    !i915_terminally_wedged(&i915->gpu_error)) {
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &i915->gpu_error.flags))
				continue;

			if (i915_reset_engine(engine, msg) == 0)
				engine_mask &= ~intel_engine_flag(engine);

			clear_bit(I915_RESET_ENGINE + engine->id,
				  &i915->gpu_error.flags);
			wake_up_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id);
		}
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));
		goto out;
	}

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, i915, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	i915_reset_device(i915, engine_mask, msg);

	for_each_engine(engine, i915, tmp) {
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
	}

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);

out:
	intel_runtime_pm_put(i915, wakeref);
}

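/*
 * Flush any pending reset work: cancel hangcheck, drain the driver
 * workqueue (including the restart worker) and wait for the GPU to idle.
 * Returns true if the device is idle again.
 */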
bool i915_reset_flush(struct drm_i915_private *i915)
{
	int err;

	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);

	flush_workqueue(i915->wq);
	GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_wait_for_idle(i915,
				     I915_WAIT_LOCKED |
				     I915_WAIT_FOR_IDLE_BOOST,
				     MAX_SCHEDULE_TIMEOUT);
	mutex_unlock(&i915->drm.struct_mutex);

	return !err;
}

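/*
 * Watchdog used by i915_wedge_on_timeout(): if the protected section does
 * not complete within the given timeout, this worker fires and wedges the
 * device so that we do not wait forever on a stuck reset.
 */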
static void i915_wedge_me(struct work_struct *work)
{
	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);

	dev_err(w->i915->drm.dev,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	i915_gem_set_wedged(w->i915);
}

void __i915_init_wedge(struct i915_wedge_me *w,
		       struct drm_i915_private *i915,
		       long timeout,
		       const char *name)
{
	w->i915 = i915;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

void __i915_fini_wedge(struct i915_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}