drm/i915/gt: Be defensive in the face of false CS events

author Chris Wilson <chris@chris-wilson.co.uk>

Fri, 10 Jul 2020 13:31:25 +0000 (14:31 +0100)

committer Chris Wilson <chris@chris-wilson.co.uk>

Fri, 10 Jul 2020 14:24:17 +0000 (15:24 +0100)
author Chris Wilson <chris@chris-wilson.co.uk>
Fri, 10 Jul 2020 13:31:25 +0000 (14:31 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Fri, 10 Jul 2020 14:24:17 +0000 (15:24 +0100)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h

index 490af81bd6f370e3204739afc4fac1b622ee96b5..8de92fd7d392f85be0dc027c10824aed2cc0e2b6 100644 (file)
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -177,8 +177,12 @@ struct intel_engine_execlists {
          * the first error interrupt, record the EIR and schedule the tasklet.
          * In the tasklet, we process the pending CS events to ensure we have
          * the guilty request, and then reset the engine.
+        *
+        * Low 16b are used by HW, with the upper 16b used as the enabling mask.
+        * Reserve the upper 16b for tracking internal errors.
          */
         u32 error_interrupt;
+#define ERROR_CSB BIT(31)
  
         /**
          * @reset_ccid: Active CCID [EXECLISTS_STATUS_HI] at the time of reset
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c

index e1964cf40fd62ca888212c431526caf709556977..b05da68e52f4eb32320f273012493219bdcf6b3c 100644 (file)
--- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
@@ -27,7 +27,8 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
         if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
                 u32 eir;
  
-               eir = ENGINE_READ(engine, RING_EIR);
+               /* Upper 16b are the enabling mask, rsvd for internal errors */
+               eir = ENGINE_READ(engine, RING_EIR) & GENMASK(15, 0);
                 ENGINE_TRACE(engine, "CS error: %x\n", eir);
  
                 /* Disable the error interrupt until after the reset */
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c

index fbcfeaed6441a084b90a0bd375c049570e3bf4b6..cd4262cc96e208572727744b7726bb88fbbeb07c 100644 (file)
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2568,6 +2568,25 @@ static void process_csb(struct intel_engine_cs *engine)
         if (unlikely(head == tail))
                 return;
  
+       /*
+        * We will consume all events from HW, or at least pretend to.
+        *
+        * The sequence of events from the HW is deterministic, and derived
+        * from our writes to the ELSP, with a smidgen of variability for
+        * the arrival of the asynchronous requests wrt to the inflight
+        * execution. If the HW sends an event that does not correspond with
+        * the one we are expecting, we have to abandon all hope as we lose
+        * all tracking of what the engine is actually executing. We will
+        * only detect we are out of sequence with the HW when we get an
+        * 'impossible' event because we have already drained our own
+        * preemption/promotion queue. If this occurs, we know that we likely
+        * lost track of execution earlier and must unwind and restart, the
+        * simplest way is by stop processing the event queue and force the
+        * engine to reset.
+        */
+       execlists->csb_head = tail;
+       ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
+
         /*
          * Hopefully paired with a wmb() in HW!
          *
@@ -2577,8 +2596,6 @@ static void process_csb(struct intel_engine_cs *engine)
          * we perform the READ_ONCE(*csb_write).
          */
         rmb();
-
-       ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
         do {
                 bool promote;
  
@@ -2613,6 +2630,11 @@ static void process_csb(struct intel_engine_cs *engine)
                 if (promote) {
                         struct i915_request * const *old = execlists->active;
  
+                       if (GEM_WARN_ON(!*execlists->pending)) {
+                               execlists->error_interrupt |= ERROR_CSB;
+                               break;
+                       }
+
                         ring_set_paused(engine, 0);
  
                         /* Point active to the new ELSP; prevent overwriting */
@@ -2635,7 +2657,10 @@ static void process_csb(struct intel_engine_cs *engine)
  
                         WRITE_ONCE(execlists->pending[0], NULL);
                 } else {
-                       GEM_BUG_ON(!*execlists->active);
+                       if (GEM_WARN_ON(!*execlists->active)) {
+                               execlists->error_interrupt |= ERROR_CSB;
+                               break;
+                       }
  
                         /* port0 completed, advanced to port1 */
                         trace_ports(execlists, "completed", execlists->active);
@@ -2686,7 +2711,6 @@ static void process_csb(struct intel_engine_cs *engine)
                 }
         } while (head != tail);
  
-       execlists->csb_head = head;
         set_timeslice(engine);
  
         /*
@@ -3117,9 +3141,18 @@ static void execlists_submission_tasklet(unsigned long data)
         process_csb(engine);
  
         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
+               const char *msg;
+
+               /* Generate the error message in priority wrt to the user! */
+               if (engine->execlists.error_interrupt & GENMASK(15, 0))
+                       msg = "CS error"; /* thrown by a user payload */
+               else if (engine->execlists.error_interrupt & ERROR_CSB)
+                       msg = "invalid CSB event";
+               else
+                       msg = "internal error";
+
                 engine->execlists.error_interrupt = 0;
-               if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
-                       execlists_reset(engine, "CS error");
+               execlists_reset(engine, msg);
         }
  
         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
author	Chris Wilson <chris@chris-wilson.co.uk>
	Fri, 10 Jul 2020 13:31:25 +0000 (14:31 +0100)
committer	Chris Wilson <chris@chris-wilson.co.uk>
	Fri, 10 Jul 2020 14:24:17 +0000 (15:24 +0100)
drivers/gpu/drm/i915/gt/intel_engine_types.h		patch \| blob \| history
drivers/gpu/drm/i915/gt/intel_gt_irq.c		patch \| blob \| history
drivers/gpu/drm/i915/gt/intel_lrc.c		patch \| blob \| history