drm/i915: Track hw reported context runtime
authorTvrtko Ursulin <tvrtko.ursulin@intel.com>
Sun, 16 Feb 2020 13:36:20 +0000 (13:36 +0000)
committerChris Wilson <chris@chris-wilson.co.uk>
Sun, 16 Feb 2020 15:16:22 +0000 (15:16 +0000)
GPU saves accumulated context runtime (in CS timestamp units) in PPHWSP
which will be useful for us in cases when we are not able to track context
busyness ourselves (like with GuC). Keep a copy of this in struct
intel_context from where it can be easily read even if the context is not
pinned.

v2:
 (Chris)
 * Do not store pphwsp address in intel_context.
 * Log CS wrap-around.
 * Simplify calculation by relying on integer wraparound.
v3:
 * Include total/avg in traces and error state for debugging

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200216133620.394962-1-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/gt/intel_context.c
drivers/gpu/drm/i915/gt/intel_context.h
drivers/gpu/drm/i915/gt/intel_context_types.h
drivers/gpu/drm/i915/gt/intel_lrc.c
drivers/gpu/drm/i915/gt/intel_lrc_reg.h
drivers/gpu/drm/i915/gt/selftest_lrc.c
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_gpu_error.h
drivers/gpu/drm/i915/intel_device_info.c
drivers/gpu/drm/i915/intel_device_info.h

index e4f89341d17cfad2d7b6450ff0ba421266a51787..8bb444cda14fa2c21c6be9a9e0422539a63f7109 100644 (file)
@@ -220,7 +220,9 @@ static void __intel_context_retire(struct i915_active *active)
 {
        struct intel_context *ce = container_of(active, typeof(*ce), active);
 
-       CE_TRACE(ce, "retire\n");
+       CE_TRACE(ce, "retire runtime: { total:%lluns, avg:%lluns }\n",
+                intel_context_get_total_runtime_ns(ce),
+                intel_context_get_avg_runtime_ns(ce));
 
        set_bit(CONTEXT_VALID_BIT, &ce->flags);
        if (ce->state)
@@ -281,6 +283,8 @@ intel_context_init(struct intel_context *ce,
        ce->sseu = engine->sseu;
        ce->ring = __intel_context_ring_size(SZ_4K);
 
+       ewma_runtime_init(&ce->runtime.avg);
+
        ce->vm = i915_vm_get(engine->gt->vm);
 
        INIT_LIST_HEAD(&ce->signal_link);
index 604d5cfc46bab20af91b57bbe53cdd7f8838595c..18efad2551245a915917a1bcb893ba3a9d1a4099 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 
 #include "i915_active.h"
+#include "i915_drv.h"
 #include "intel_context_types.h"
 #include "intel_engine_types.h"
 #include "intel_ring_types.h"
@@ -227,4 +228,20 @@ intel_context_clear_nopreempt(struct intel_context *ce)
        clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
 }
 
+static inline u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
+{
+       const u32 period =
+               RUNTIME_INFO(ce->engine->i915)->cs_timestamp_period_ns;
+
+       return READ_ONCE(ce->runtime.total) * period;
+}
+
+static inline u64 intel_context_get_avg_runtime_ns(struct intel_context *ce)
+{
+       const u32 period =
+               RUNTIME_INFO(ce->engine->i915)->cs_timestamp_period_ns;
+
+       return mul_u32_u32(ewma_runtime_read(&ce->runtime.avg), period);
+}
+
 #endif /* __INTEL_CONTEXT_H__ */
index ca1420fb8b5339594b2b603db05525359d80b938..11278343b9b55709e0e581e19c44baceb64804d7 100644 (file)
@@ -7,6 +7,7 @@
 #ifndef __INTEL_CONTEXT_TYPES__
 #define __INTEL_CONTEXT_TYPES__
 
+#include <linux/average.h>
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -19,6 +20,8 @@
 
 #define CONTEXT_REDZONE POISON_INUSE
 
+DECLARE_EWMA(runtime, 3, 8);
+
 struct i915_gem_context;
 struct i915_vma;
 struct intel_context;
@@ -68,6 +71,15 @@ struct intel_context {
        u64 lrc_desc;
        u32 tag; /* cookie passed to HW to track this context on submission */
 
+       /* Time on GPU as tracked by the hw. */
+       struct {
+               struct ewma_runtime avg;
+               u64 total;
+               u32 last;
+               I915_SELFTEST_DECLARE(u32 num_underflow);
+               I915_SELFTEST_DECLARE(u32 max_underflow);
+       } runtime;
+
        unsigned int active_count; /* protected by timeline->mutex */
 
        atomic_t pin_count;
index c3d7727021dba1c28c659b3450c2cd46a35052b7..78e854440949e3d5e4b0dbdcb059e58a8e2cbbe0 100644 (file)
@@ -1195,6 +1195,42 @@ static void reset_active(struct i915_request *rq,
        ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
 }
 
+static u32 intel_context_get_runtime(const struct intel_context *ce)
+{
+       /*
+        * We can use either ppHWSP[16] which is recorded before the context
+        * switch (and so excludes the cost of context switches) or use the
+        * value from the context image itself, which is saved/restored earlier
+        * and so includes the cost of the save.
+        */
+       return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
+}
+
+static void intel_context_update_runtime(struct intel_context *ce)
+{
+       u32 old;
+       s32 dt;
+
+       if (intel_context_is_barrier(ce))
+               return;
+
+       old = ce->runtime.last;
+       ce->runtime.last = intel_context_get_runtime(ce);
+       dt = ce->runtime.last - old;
+
+       if (unlikely(dt <= 0)) {
+               CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
+                        old, ce->runtime.last, dt);
+               I915_SELFTEST_ONLY(ce->runtime.num_underflow += dt < 0);
+               I915_SELFTEST_ONLY(ce->runtime.max_underflow =
+                                  max_t(u32, ce->runtime.max_underflow, -dt));
+               return;
+       }
+
+       ewma_runtime_add(&ce->runtime.avg, dt);
+       ce->runtime.total += dt;
+}
+
 static inline struct intel_engine_cs *
 __execlists_schedule_in(struct i915_request *rq)
 {
@@ -1278,6 +1314,7 @@ __execlists_schedule_out(struct i915_request *rq,
            i915_request_completed(rq))
                intel_engine_add_retire(engine, ce->timeline);
 
+       intel_context_update_runtime(ce);
        intel_engine_context_out(engine);
        execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
        intel_gt_pm_put_async(engine->gt);
@@ -4607,8 +4644,13 @@ populate_lr_context(struct intel_context *ce,
                inhibit = false;
        }
 
-       /* The second page of the context object contains some fields which must
-        * be set up prior to the first execution. */
+       /* Clear the ppHWSP (inc. per-context counters) */
+       memset(vaddr, 0, PAGE_SIZE);
+
+       /*
+        * The second page of the context object contains some registers which
+        * must be set up prior to the first execution.
+        */
        execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
                                 ce, engine, ring, inhibit);
 
index 08a3be65f70093e250e296124616759d6483bcf0..d39b72590e40a0be5890f76ecf9ea494a3f46eec 100644 (file)
@@ -17,6 +17,7 @@
 #define CTX_RING_CTL                   (0x0a + 1)
 #define CTX_BB_STATE                   (0x10 + 1)
 #define CTX_BB_PER_CTX_PTR             (0x18 + 1)
+#define CTX_TIMESTAMP                  (0x22 + 1)
 #define CTX_PDP3_UDW                   (0x24 + 1)
 #define CTX_PDP3_LDW                   (0x26 + 1)
 #define CTX_PDP2_UDW                   (0x28 + 1)
index 64761e6198767abd4304ec121233e5780b295702..40c53cc1c7c0e74765505bd0f78b59d0dca5d7b5 100644 (file)
@@ -4151,6 +4151,11 @@ static int live_lrc_fixed(void *arg)
                                CTX_BB_STATE - 1,
                                "BB_STATE"
                        },
+                       {
+                               i915_mmio_reg_offset(RING_CTX_TIMESTAMP(engine->mmio_base)),
+                               CTX_TIMESTAMP - 1,
+                               "RING_CTX_TIMESTAMP"
+                       },
                        { },
                }, *t;
                u32 *hw;
@@ -4450,6 +4455,96 @@ static int live_gpr_clear(void *arg)
        return err;
 }
 
+static int __live_pphwsp_runtime(struct intel_engine_cs *engine)
+{
+       struct intel_context *ce;
+       struct i915_request *rq;
+       IGT_TIMEOUT(end_time);
+       int err;
+
+       ce = intel_context_create(engine);
+       if (IS_ERR(ce))
+               return PTR_ERR(ce);
+
+       ce->runtime.num_underflow = 0;
+       ce->runtime.max_underflow = 0;
+
+       do {
+               unsigned int loop = 1024;
+
+               while (loop) {
+                       rq = intel_context_create_request(ce);
+                       if (IS_ERR(rq)) {
+                               err = PTR_ERR(rq);
+                               goto err_rq;
+                       }
+
+                       if (--loop == 0)
+                               i915_request_get(rq);
+
+                       i915_request_add(rq);
+               }
+
+               if (__igt_timeout(end_time, NULL))
+                       break;
+
+               i915_request_put(rq);
+       } while (1);
+
+       err = i915_request_wait(rq, 0, HZ / 5);
+       if (err < 0) {
+               pr_err("%s: request not completed!\n", engine->name);
+               goto err_wait;
+       }
+
+       igt_flush_test(engine->i915);
+
+       pr_info("%s: pphwsp runtime %lluns, average %lluns\n",
+               engine->name,
+               intel_context_get_total_runtime_ns(ce),
+               intel_context_get_avg_runtime_ns(ce));
+
+       err = 0;
+       if (ce->runtime.num_underflow) {
+               pr_err("%s: pphwsp underflow %u time(s), max %u cycles!\n",
+                      engine->name,
+                      ce->runtime.num_underflow,
+                      ce->runtime.max_underflow);
+               GEM_TRACE_DUMP();
+               err = -EOVERFLOW;
+       }
+
+err_wait:
+       i915_request_put(rq);
+err_rq:
+       intel_context_put(ce);
+       return err;
+}
+
+static int live_pphwsp_runtime(void *arg)
+{
+       struct intel_gt *gt = arg;
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+       int err = 0;
+
+       /*
+        * Check that cumulative context runtime as stored in the pphwsp[16]
+        * is monotonic.
+        */
+
+       for_each_engine(engine, gt, id) {
+               err = __live_pphwsp_runtime(engine);
+               if (err)
+                       break;
+       }
+
+       if (igt_flush_test(gt->i915))
+               err = -EIO;
+
+       return err;
+}
+
 int intel_lrc_live_selftests(struct drm_i915_private *i915)
 {
        static const struct i915_subtest tests[] = {
@@ -4457,6 +4552,7 @@ int intel_lrc_live_selftests(struct drm_i915_private *i915)
                SUBTEST(live_lrc_fixed),
                SUBTEST(live_lrc_state),
                SUBTEST(live_gpr_clear),
+               SUBTEST(live_pphwsp_runtime),
        };
 
        if (!HAS_LOGICAL_RING_CONTEXTS(i915))
index b2ed977ed971103a1282f9bf8ebca697095b70ef..3052c4eaf9f64c9f7884526d1a091cff2ceec12c 100644 (file)
@@ -481,9 +481,13 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct i915_gem_context_coredump *ctx)
 {
-       err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n",
+       const u32 period = RUNTIME_INFO(m->i915)->cs_timestamp_period_ns;
+
+       err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
                   header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
-                  ctx->guilty, ctx->active);
+                  ctx->guilty, ctx->active,
+                  ctx->total_runtime * period,
+                  mul_u32_u32(ctx->avg_runtime, period));
 }
 
 static struct i915_vma_coredump *
@@ -1260,6 +1264,9 @@ static bool record_context(struct i915_gem_context_coredump *e,
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
 
+       e->total_runtime = rq->context->runtime.total;
+       e->avg_runtime = ewma_runtime_read(&rq->context->runtime.avg);
+
        simulated = i915_gem_context_no_error_capture(ctx);
 
        i915_gem_context_put(ctx);
index b35bc9edd7332242271842e0e45918fb03ee912d..0d1f6c8ff3556fa3d8923692741272045141358e 100644 (file)
@@ -88,6 +88,10 @@ struct intel_engine_coredump {
 
        struct i915_gem_context_coredump {
                char comm[TASK_COMM_LEN];
+
+               u64 total_runtime;
+               u32 avg_runtime;
+
                pid_t pid;
                int active;
                int guilty;
index fcdacd6d4aa5531d0b0767b91501156d48e429b3..a97437fac88455caa4b918e8dc6e016dffe53f68 100644 (file)
@@ -1045,6 +1045,12 @@ void intel_device_info_runtime_init(struct drm_i915_private *dev_priv)
 
        /* Initialize command stream timestamp frequency */
        runtime->cs_timestamp_frequency_khz = read_timestamp_frequency(dev_priv);
+       runtime->cs_timestamp_period_ns =
+               div_u64(1e6, runtime->cs_timestamp_frequency_khz);
+       drm_dbg(&dev_priv->drm,
+               "CS timestamp wraparound in %lldms\n",
+               div_u64(mul_u32_u32(runtime->cs_timestamp_period_ns, S32_MAX),
+                       USEC_PER_SEC));
 }
 
 void intel_driver_caps_print(const struct intel_driver_caps *caps,
index 7d4d122d2182f1ea0190bfd164473a162c4d3afb..f8bfa26388c13f38c7c4895ffba02c85312aec55 100644 (file)
@@ -217,6 +217,7 @@ struct intel_runtime_info {
        struct sseu_dev_info sseu;
 
        u32 cs_timestamp_frequency_khz;
+       u32 cs_timestamp_period_ns;
 
        /* Media engine access to SFC per instance */
        u8 vdbox_sfc_access;