drm/i915: Use time based guilty context banning
authorChris Wilson <chris@chris-wilson.co.uk>
Tue, 19 Feb 2019 12:21:52 +0000 (12:21 +0000)
committerChris Wilson <chris@chris-wilson.co.uk>
Tue, 19 Feb 2019 14:46:21 +0000 (14:46 +0000)
Currently, we accumulate each time a context hangs the GPU, offset
against the number of requests it submits, and if that score exceeds a
certain threshold, we ban that context from submitting any more requests
(cancelling any work in flight). In contrast, we use a simple timer on
the file, that if we see more than a 9 hangs faster than 60s apart in
total across all of its contexts, we will ban the client from creating
any more contexts. This leads to a confusing situation where the file
may be banned before the context, so lets use a simple timer scheme for
each.

If the context submits 3 hanging requests within a 120s period, declare
it forbidden to ever send more requests.

This has the advantage of not being easy to repair by simply sending
empty requests, but has the disadvantage that if the context is idle
then it is forgiven. However, if the context is idle, it is not
disrupting the system, but a hog can evade the request counting and
cause much more severe disruption to the system.

Updating ban_score from request retirement is dubious as the retirement
is purposely not in sync with request submission (i.e. we try and batch
retirement to reduce overhead and avoid latency on submission), which
leads to surprising situations where we can forgive a hang immediately
due to a backlog of requests from before the hang being retired
afterwards.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190219122215.8941-2-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_gem_context.c
drivers/gpu/drm/i915/i915_gem_context.h
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_gpu_error.h
drivers/gpu/drm/i915/i915_request.c
drivers/gpu/drm/i915/i915_reset.c

index da21c843fed8ab7da2ab1d880966cca1dbb62e39..7541c6f961b33a65088b7a5165176f0c0ead962f 100644 (file)
@@ -355,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
        struct i915_gem_context *ctx;
        unsigned int n;
        int ret;
+       int i;
 
        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (ctx == NULL)
@@ -407,6 +408,9 @@ __create_hw_context(struct drm_i915_private *dev_priv,
        ctx->desc_template =
                default_desc_template(dev_priv, dev_priv->mm.aliasing_ppgtt);
 
+       for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
+               ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
+
        return ctx;
 
 err_pid:
index 071108d34ae007d43481274cb3a44d17cbff5463..dc6c58f38cfa8dd2219abd0cbe84048d14595f16 100644 (file)
@@ -209,10 +209,11 @@ struct i915_gem_context {
         */
        atomic_t active_count;
 
-#define CONTEXT_SCORE_GUILTY           10
-#define CONTEXT_SCORE_BAN_THRESHOLD    40
-       /** ban_score: Accumulated score of all hangs caused by this context. */
-       atomic_t ban_score;
+       /**
+        * @hang_timestamp: The last time(s) this context caused a GPU hang
+        */
+       unsigned long hang_timestamp[2];
+#define CONTEXT_FAST_HANG_JIFFIES (120 * HZ) /* 3 hangs within 120s? Banned! */
 
        /** remap_slice: Bitmask of cache lines that need remapping */
        u8 remap_slice;
index 9a65341fec097e500ace05a410b62f6f19390d21..3f6eddb6f6de7ee8bd21aa05296adcadb40874e3 100644 (file)
@@ -434,11 +434,6 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
                           ee->instdone.row[slice][subslice]);
 }
 
-static const char *bannable(const struct drm_i915_error_context *ctx)
-{
-       return ctx->bannable ? "" : " (unbannable)";
-}
-
 static void error_print_request(struct drm_i915_error_state_buf *m,
                                const char *prefix,
                                const struct drm_i915_error_request *erq,
@@ -447,9 +442,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
        if (!erq->seqno)
                return;
 
-       err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
-                  prefix, erq->pid, erq->ban_score,
-                  erq->context, erq->seqno,
+       err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
+                  prefix, erq->pid, erq->context, erq->seqno,
                   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
                            &erq->flags) ? "!" : "",
                   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
@@ -463,10 +457,9 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct drm_i915_error_context *ctx)
 {
-       err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
+       err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, guilty %d active %d\n",
                   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
-                  ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
-                  ctx->guilty, ctx->active);
+                  ctx->sched_attr.priority, ctx->guilty, ctx->active);
 }
 
 static void error_print_engine(struct drm_i915_error_state_buf *m,
@@ -688,12 +681,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                if (!error->engine[i].context.pid)
                        continue;
 
-               err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
+               err_printf(m, "Active process (on ring %s): %s [%d]\n",
                           engine_name(m->i915, i),
                           error->engine[i].context.comm,
-                          error->engine[i].context.pid,
-                          error->engine[i].context.ban_score,
-                          bannable(&error->engine[i].context));
+                          error->engine[i].context.pid);
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
        err_printf(m, "Suspend count: %u\n", error->suspend_count);
@@ -779,13 +770,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                if (obj) {
                        err_puts(m, m->i915->engine[i]->name);
                        if (ee->context.pid)
-                               err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
+                               err_printf(m, " (submitted by %s [%d], ctx %d [%d])",
                                           ee->context.comm,
                                           ee->context.pid,
                                           ee->context.handle,
-                                          ee->context.hw_id,
-                                          ee->context.ban_score,
-                                          bannable(&ee->context));
+                                          ee->context.hw_id);
                        err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                   upper_32_bits(obj->gtt_offset),
                                   lower_32_bits(obj->gtt_offset));
@@ -1301,8 +1290,6 @@ static void record_request(struct i915_request *request,
        erq->flags = request->fence.flags;
        erq->context = ctx->hw_id;
        erq->sched_attr = request->sched.attr;
-       erq->ban_score = atomic_read(&ctx->ban_score);
-       erq->seqno = request->global_seqno;
        erq->jiffies = request->emitted_jiffies;
        erq->start = i915_ggtt_offset(request->ring->vma);
        erq->head = request->head;
@@ -1396,8 +1383,6 @@ static void record_context(struct drm_i915_error_context *e,
        e->handle = ctx->user_handle;
        e->hw_id = ctx->hw_id;
        e->sched_attr = ctx->sched;
-       e->ban_score = atomic_read(&ctx->ban_score);
-       e->bannable = i915_gem_context_is_bannable(ctx);
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
 }
index afa3adb28f02db7fdc1bed464198fc4c50e164da..94eaf8ab905132c75d0c0ce4c6757cec623a57ac 100644 (file)
@@ -122,10 +122,8 @@ struct i915_gpu_state {
                        pid_t pid;
                        u32 handle;
                        u32 hw_id;
-                       int ban_score;
                        int active;
                        int guilty;
-                       bool bannable;
                        struct i915_sched_attr sched_attr;
                } context;
 
@@ -149,7 +147,6 @@ struct i915_gpu_state {
                        long jiffies;
                        pid_t pid;
                        u32 context;
-                       int ban_score;
                        u32 seqno;
                        u32 start;
                        u32 head;
index 5ab4e1c0161832ebb155792a15fa08f203638fde..a61e3a4fc9dc0d9bc3b3251bf6d8b65daaf63a96 100644 (file)
@@ -290,8 +290,6 @@ static void i915_request_retire(struct i915_request *request)
 
        i915_request_remove_from_client(request);
 
-       /* Retirement decays the ban score as it is a sign of ctx progress */
-       atomic_dec_if_positive(&request->gem_context->ban_score);
        intel_context_unpin(request->hw_context);
 
        __retire_engine_upto(request->engine, request);
index 034b654be74c366380eadb2ef8536c68cc9d43d6..c234feb5fdf55009fb65ce6229bb8f02a37151d5 100644 (file)
@@ -59,24 +59,29 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
 
 static bool context_mark_guilty(struct i915_gem_context *ctx)
 {
-       unsigned int score;
-       bool banned, bannable;
+       unsigned long prev_hang;
+       bool banned;
+       int i;
 
        atomic_inc(&ctx->guilty_count);
 
-       bannable = i915_gem_context_is_bannable(ctx);
-       score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
-       banned = (!i915_gem_context_is_recoverable(ctx) ||
-                 score >= CONTEXT_SCORE_BAN_THRESHOLD);
-
-       /* Cool contexts don't accumulate client ban score */
-       if (!bannable)
+       /* Cool contexts are too cool to be banned! (Used for reset testing.) */
+       if (!i915_gem_context_is_bannable(ctx))
                return false;
 
+       /* Record the timestamp for the last N hangs */
+       prev_hang = ctx->hang_timestamp[0];
+       for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
+               ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
+       ctx->hang_timestamp[i] = jiffies;
+
+       /* If we have hung N+1 times in rapid succession, we ban the context! */
+       banned = !i915_gem_context_is_recoverable(ctx);
+       if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
+               banned = true;
        if (banned) {
-               DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
-                                ctx->name, atomic_read(&ctx->guilty_count),
-                                score);
+               DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
+                                ctx->name, atomic_read(&ctx->guilty_count));
                i915_gem_context_set_banned(ctx);
        }