drivers/gpu/drm/i915/intel_ringbuffer.h

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 #ifndef _INTEL_RINGBUFFER_H_
   3 #define _INTEL_RINGBUFFER_H_
   4
   5 #include <linux/hashtable.h>
   6 #include "i915_gem_batch_pool.h"
   7 #include "i915_gem_request.h"
   8 #include "i915_gem_timeline.h"
   9 #include "i915_pmu.h"
  10 #include "i915_selftest.h"
  11
  12 struct drm_printer;
  13
  14 #define I915_CMD_HASH_ORDER 9
  15
  16 /* Early gen2 devices have a cacheline of just 32 bytes, using 64 is overkill,
  17  * but keeps the logic simple. Indeed, the whole purpose of this macro is just
  18  * to give some inclination as to some of the magic values used in the various
  19  * workarounds!
  20  */
  21 #define CACHELINE_BYTES 64
  22 #define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
  23
  24 struct intel_hw_status_page {
  25         struct i915_vma *vma;
  26         u32 *page_addr;
  27         u32 ggtt_offset;
  28 };
  29
  30 #define I915_READ_TAIL(engine) I915_READ(RING_TAIL((engine)->mmio_base))
  31 #define I915_WRITE_TAIL(engine, val) I915_WRITE(RING_TAIL((engine)->mmio_base), val)
  32
  33 #define I915_READ_START(engine) I915_READ(RING_START((engine)->mmio_base))
  34 #define I915_WRITE_START(engine, val) I915_WRITE(RING_START((engine)->mmio_base), val)
  35
  36 #define I915_READ_HEAD(engine)  I915_READ(RING_HEAD((engine)->mmio_base))
  37 #define I915_WRITE_HEAD(engine, val) I915_WRITE(RING_HEAD((engine)->mmio_base), val)
  38
  39 #define I915_READ_CTL(engine) I915_READ(RING_CTL((engine)->mmio_base))
  40 #define I915_WRITE_CTL(engine, val) I915_WRITE(RING_CTL((engine)->mmio_base), val)
  41
  42 #define I915_READ_IMR(engine) I915_READ(RING_IMR((engine)->mmio_base))
  43 #define I915_WRITE_IMR(engine, val) I915_WRITE(RING_IMR((engine)->mmio_base), val)
  44
  45 #define I915_READ_MODE(engine) I915_READ(RING_MI_MODE((engine)->mmio_base))
  46 #define I915_WRITE_MODE(engine, val) I915_WRITE(RING_MI_MODE((engine)->mmio_base), val)
  47
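/*
 * NOTE: this remark describes the semaphore seqno slots (the seqno_size used
 * by the GEN8 signal/wait table in struct intel_engine_cs), not the enum that
 * follows. The seqno size is actually only a uint32, but since we plan to use
 * MI_FLUSH_DW to do the writes, and that must have qw aligned offsets, simply
 * pretend it's 8b.
 */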
  51 enum intel_engine_hangcheck_action {
  52         ENGINE_IDLE = 0,
  53         ENGINE_WAIT,
  54         ENGINE_ACTIVE_SEQNO,
  55         ENGINE_ACTIVE_HEAD,
  56         ENGINE_ACTIVE_SUBUNITS,
  57         ENGINE_WAIT_KICK,
  58         ENGINE_DEAD,
  59 };
  60
  61 static inline const char *
  62 hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
  63 {
  64         switch (a) {
  65         case ENGINE_IDLE:
  66                 return "idle";
  67         case ENGINE_WAIT:
  68                 return "wait";
  69         case ENGINE_ACTIVE_SEQNO:
  70                 return "active seqno";
  71         case ENGINE_ACTIVE_HEAD:
  72                 return "active head";
  73         case ENGINE_ACTIVE_SUBUNITS:
  74                 return "active subunits";
  75         case ENGINE_WAIT_KICK:
  76                 return "wait kick";
  77         case ENGINE_DEAD:
  78                 return "dead";
  79         }
  80
  81         return "unknown";
  82 }
  83
  84 #define I915_MAX_SLICES 3
  85 #define I915_MAX_SUBSLICES 3
  86
  87 #define instdone_slice_mask(dev_priv__) \
  88         (INTEL_GEN(dev_priv__) == 7 ? \
  89          1 : INTEL_INFO(dev_priv__)->sseu.slice_mask)
  90
  91 #define instdone_subslice_mask(dev_priv__) \
  92         (INTEL_GEN(dev_priv__) == 7 ? \
  93          1 : INTEL_INFO(dev_priv__)->sseu.subslice_mask)
  94
  95 #define for_each_instdone_slice_subslice(dev_priv__, slice__, subslice__) \
  96         for ((slice__) = 0, (subslice__) = 0; \
  97              (slice__) < I915_MAX_SLICES; \
  98              (subslice__) = ((subslice__) + 1) < I915_MAX_SUBSLICES ? (subslice__) + 1 : 0, \
  99                (slice__) += ((subslice__) == 0)) \
 100                 for_each_if((BIT(slice__) & instdone_slice_mask(dev_priv__)) && \
 101                             (BIT(subslice__) & instdone_subslice_mask(dev_priv__)))
 102
 103 struct intel_instdone {
 104         u32 instdone;
 105         /* The following exist only in the RCS engine */
 106         u32 slice_common;
 107         u32 sampler[I915_MAX_SLICES][I915_MAX_SUBSLICES];
 108         u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES];
 109 };
 110
 111 struct intel_engine_hangcheck {
 112         u64 acthd;
 113         u32 seqno;
 114         enum intel_engine_hangcheck_action action;
 115         unsigned long action_timestamp;
 116         int deadlock;
 117         struct intel_instdone instdone;
 118         struct drm_i915_gem_request *active_request;
 119         bool stalled;
 120 };
 121
 122 struct intel_ring {
 123         struct i915_vma *vma;
 124         void *vaddr;
 125
 126         struct list_head request_list;
 127
 128         u32 head;
 129         u32 tail;
 130         u32 emit;
 131
 132         u32 space;
 133         u32 size;
 134         u32 effective_size;
 135 };
 136
 137 struct i915_gem_context;
 138 struct drm_i915_reg_table;
 139
 140 /*
 141  * we use a single page to load ctx workarounds so all of these
 142  * values are referred in terms of dwords
 143  *
 144  * struct i915_wa_ctx_bb:
 145  *  offset: specifies batch starting position, also helpful in case
 146  *    if we want to have multiple batches at different offsets based on
 147  *    some criteria. It is not a requirement at the moment but provides
 148  *    an option for future use.
 149  *  size: size of the batch in DWORDS
 150  */
 151 struct i915_ctx_workarounds {
 152         struct i915_wa_ctx_bb {
 153                 u32 offset;
 154                 u32 size;
 155         } indirect_ctx, per_ctx;
 156         struct i915_vma *vma;
 157 };
 158
 159 struct drm_i915_gem_request;
 160
 161 /*
 162  * Engine IDs definitions.
 163  * Keep instances of the same type engine together.
 164  */
 165 enum intel_engine_id {
 166         RCS = 0,
 167         BCS,
 168         VCS,
 169         VCS2,
 170 #define _VCS(n) (VCS + (n))
 171         VECS
 172 };
 173
 174 struct i915_priolist {
 175         struct rb_node node;
 176         struct list_head requests;
 177         int priority;
 178 };
 179
 180 /**
 181  * struct intel_engine_execlists - execlist submission queue and port state
 182  *
 183  * The struct intel_engine_execlists represents the combined logical state of
 184  * driver and the hardware state for execlist mode of submission.
 185  */
 186 struct intel_engine_execlists {
 187         /**
 188          * @tasklet: softirq tasklet for bottom handler
 189          */
 190         struct tasklet_struct tasklet;
 191
 192         /**
 193          * @default_priolist: priority list for I915_PRIORITY_NORMAL
 194          */
 195         struct i915_priolist default_priolist;
 196
 197         /**
 198          * @no_priolist: priority lists disabled
 199          */
 200         bool no_priolist;
 201
 202         /**
 203          * @port: execlist port states
 204          *
 205          * For each hardware ELSP (ExecList Submission Port) we keep
 206          * track of the last request and the number of times we submitted
 207          * that port to hw. We then count the number of times the hw reports
 208          * a context completion or preemption. As only one context can
 209          * be active on hw, we limit resubmission of context to port[0]. This
 210          * is called Lite Restore, of the context.
 211          */
 212         struct execlist_port {
 213                 /**
 214                  * @request_count: combined request and submission count
 215                  */
 216                 struct drm_i915_gem_request *request_count;
 217 #define EXECLIST_COUNT_BITS 2
 218 #define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
 219 #define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
 220 #define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
 221 #define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS)
 222 #define port_set(p, packed) ((p)->request_count = (packed))
 223 #define port_isset(p) ((p)->request_count)
 224 #define port_index(p, execlists) ((p) - (execlists)->port)
 225
 226                 /**
 227                  * @context_id: context ID for port
 228                  */
 229                 GEM_DEBUG_DECL(u32 context_id);
 230
 231 #define EXECLIST_MAX_PORTS 2
 232         } port[EXECLIST_MAX_PORTS];
 233
 234         /**
 235          * @active: is the HW active? We consider the HW as active after
 236          * submitting any context for execution and until we have seen the
 237          * last context completion event. After that, we do not expect any
 238          * more events until we submit, and so can park the HW.
 239          *
 240          * As we have a small number of different sources from which we feed
 241          * the HW, we track the state of each inside a single bitfield.
 242          */
 243         unsigned int active;
 244 #define EXECLISTS_ACTIVE_USER 0
 245 #define EXECLISTS_ACTIVE_PREEMPT 1
 246 #define EXECLISTS_ACTIVE_HWACK 2
 247
 248         /**
 249          * @port_mask: number of execlist ports - 1
 250          */
 251         unsigned int port_mask;
 252
 253         /**
 254          * @queue: queue of requests, in priority lists
 255          */
 256         struct rb_root queue;
 257
 258         /**
 259          * @first: leftmost level in priority @queue
 260          */
 261         struct rb_node *first;
 262
 263         /**
 264          * @fw_domains: forcewake domains for irq tasklet
 265          */
 266         unsigned int fw_domains;
 267
 268         /**
 269          * @csb_head: context status buffer head
 270          */
 271         unsigned int csb_head;
 272
 273         /**
 274          * @csb_use_mmio: access csb through mmio, instead of hwsp
 275          */
 276         bool csb_use_mmio;
 277 };
 278
 279 #define INTEL_ENGINE_CS_MAX_NAME 8
 280
 281 struct intel_engine_cs {
 282         struct drm_i915_private *i915;
 283         char name[INTEL_ENGINE_CS_MAX_NAME];
 284
 285         enum intel_engine_id id;
 286         unsigned int hw_id;
 287         unsigned int guc_id;
 288
 289         u8 uabi_id;
 290         u8 uabi_class;
 291
 292         u8 class;
 293         u8 instance;
 294         u32 context_size;
 295         u32 mmio_base;
 296         unsigned int irq_shift;
 297
 298         struct intel_ring *buffer;
 299         struct intel_timeline *timeline;
 300
 301         struct drm_i915_gem_object *default_state;
 302
 303         atomic_t irq_count;
 304         unsigned long irq_posted;
 305 #define ENGINE_IRQ_BREADCRUMB 0
 306 #define ENGINE_IRQ_EXECLIST 1
 307
 308         /* Rather than have every client wait upon all user interrupts,
 309          * with the herd waking after every interrupt and each doing the
 310          * heavyweight seqno dance, we delegate the task (of being the
 311          * bottom-half of the user interrupt) to the first client. After
 312          * every interrupt, we wake up one client, who does the heavyweight
 313          * coherent seqno read and either goes back to sleep (if incomplete),
 314          * or wakes up all the completed clients in parallel, before then
 315          * transferring the bottom-half status to the next client in the queue.
 316          *
 317          * Compared to walking the entire list of waiters in a single dedicated
 318          * bottom-half, we reduce the latency of the first waiter by avoiding
 319          * a context switch, but incur additional coherent seqno reads when
 320          * following the chain of request breadcrumbs. Since it is most likely
 321          * that we have a single client waiting on each seqno, then reducing
 322          * the overhead of waking that client is much preferred.
 323          */
 324         struct intel_breadcrumbs {
 325                 spinlock_t irq_lock; /* protects irq_*; irqsafe */
 326                 struct intel_wait *irq_wait; /* oldest waiter by retirement */
 327
 328                 spinlock_t rb_lock; /* protects the rb and wraps irq_lock */
 329                 struct rb_root waiters; /* sorted by retirement, priority */
 330                 struct rb_root signals; /* sorted by retirement */
 331                 struct task_struct *signaler; /* used for fence signalling */
 332                 struct drm_i915_gem_request __rcu *first_signal;
 333                 struct timer_list fake_irq; /* used after a missed interrupt */
 334                 struct timer_list hangcheck; /* detect missed interrupts */
 335
 336                 unsigned int hangcheck_interrupts;
 337                 unsigned int irq_enabled;
 338
 339                 bool irq_armed : 1;
 340                 I915_SELFTEST_DECLARE(bool mock : 1);
 341         } breadcrumbs;
 342
 343         struct {
 344                 /**
 345                  * @enable: Bitmask of enable sample events on this engine.
 346                  *
 347                  * Bits correspond to sample event types, for instance
 348                  * I915_SAMPLE_QUEUED is bit 0 etc.
 349                  */
 350                 u32 enable;
 351                 /**
 352                  * @enable_count: Reference count for the enabled samplers.
 353                  *
 354                  * Index number corresponds to the bit number from @enable.
 355                  */
 356                 unsigned int enable_count[I915_PMU_SAMPLE_BITS];
 357                 /**
 358                  * @sample: Counter values for sampling events.
 359                  *
 360                  * Our internal timer stores the current counters in this field.
 361                  */
 362 #define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_SEMA + 1)
 363                 struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX];
 364                 /**
 365                  * @busy_stats: Has enablement of engine stats tracking been
 366                  *              requested.
 367                  */
 368                 bool busy_stats;
 369                 /**
 370                  * @disable_busy_stats: Work item for busy stats disabling.
 371                  *
 372                  * Same as with @enable_busy_stats action, with the difference
 373                  * that we delay it in case there are rapid enable-disable
 374                  * actions, which can happen during tool startup (like perf
 375                  * stat).
 376                  */
 377                 struct delayed_work disable_busy_stats;
 378         } pmu;
 379
 380         /*
 381          * A pool of objects to use as shadow copies of client batch buffers
 382          * when the command parser is enabled. Prevents the client from
 383          * modifying the batch contents after software parsing.
 384          */
 385         struct i915_gem_batch_pool batch_pool;
 386
 387         struct intel_hw_status_page status_page;
 388         struct i915_ctx_workarounds wa_ctx;
 389         struct i915_vma *scratch;
 390
 391         u32             irq_keep_mask; /* always keep these interrupts */
 392         u32             irq_enable_mask; /* bitmask to enable ring interrupt */
 393         void            (*irq_enable)(struct intel_engine_cs *engine);
 394         void            (*irq_disable)(struct intel_engine_cs *engine);
 395
 396         int             (*init_hw)(struct intel_engine_cs *engine);
 397         void            (*reset_hw)(struct intel_engine_cs *engine,
 398                                     struct drm_i915_gem_request *req);
 399
 400         void            (*park)(struct intel_engine_cs *engine);
 401         void            (*unpark)(struct intel_engine_cs *engine);
 402
 403         void            (*set_default_submission)(struct intel_engine_cs *engine);
 404
 405         struct intel_ring *(*context_pin)(struct intel_engine_cs *engine,
 406                                           struct i915_gem_context *ctx);
 407         void            (*context_unpin)(struct intel_engine_cs *engine,
 408                                          struct i915_gem_context *ctx);
 409         int             (*request_alloc)(struct drm_i915_gem_request *req);
 410         int             (*init_context)(struct drm_i915_gem_request *req);
 411
 412         int             (*emit_flush)(struct drm_i915_gem_request *request,
 413                                       u32 mode);
 414 #define EMIT_INVALIDATE BIT(0)
 415 #define EMIT_FLUSH      BIT(1)
 416 #define EMIT_BARRIER    (EMIT_INVALIDATE | EMIT_FLUSH)
 417         int             (*emit_bb_start)(struct drm_i915_gem_request *req,
 418                                          u64 offset, u32 length,
 419                                          unsigned int dispatch_flags);
 420 #define I915_DISPATCH_SECURE BIT(0)
 421 #define I915_DISPATCH_PINNED BIT(1)
 422 #define I915_DISPATCH_RS     BIT(2)
 423         void            (*emit_breadcrumb)(struct drm_i915_gem_request *req,
 424                                            u32 *cs);
 425         int             emit_breadcrumb_sz;
 426
 427         /* Pass the request to the hardware queue (e.g. directly into
 428          * the legacy ringbuffer or to the end of an execlist).
 429          *
 430          * This is called from an atomic context with irqs disabled; must
 431          * be irq safe.
 432          */
 433         void            (*submit_request)(struct drm_i915_gem_request *req);
 434
 435         /* Call when the priority on a request has changed and it and its
 436          * dependencies may need rescheduling. Note the request itself may
 437          * not be ready to run!
 438          *
 439          * Called under the struct_mutex.
 440          */
 441         void            (*schedule)(struct drm_i915_gem_request *request,
 442                                     int priority);
 443
 444         /*
 445          * Cancel all requests on the hardware, or queued for execution.
 446          * This should only cancel the ready requests that have been
 447          * submitted to the engine (via the engine->submit_request callback).
 448          * This is called when marking the device as wedged.
 449          */
 450         void            (*cancel_requests)(struct intel_engine_cs *engine);
 451
 452         /* Some chipsets are not quite as coherent as advertised and need
 453          * an expensive kick to force a true read of the up-to-date seqno.
 454          * However, the up-to-date seqno is not always required and the last
 455          * seen value is good enough. Note that the seqno will always be
 456          * monotonic, even if not coherent.
 457          */
 458         void            (*irq_seqno_barrier)(struct intel_engine_cs *engine);
 459         void            (*cleanup)(struct intel_engine_cs *engine);
 460
 461         /* GEN8 signal/wait table - never trust comments!
 462          *        signal to     signal to    signal to   signal to      signal to
 463          *          RCS            VCS          BCS        VECS          VCS2
 464          *      --------------------------------------------------------------------
 465          *  RCS | NOP (0x00) | VCS (0x08) | BCS (0x10) | VECS (0x18) | VCS2 (0x20) |
 466          *      |-------------------------------------------------------------------
 467          *  VCS | RCS (0x28) | NOP (0x30) | BCS (0x38) | VECS (0x40) | VCS2 (0x48) |
 468          *      |-------------------------------------------------------------------
 469          *  BCS | RCS (0x50) | VCS (0x58) | NOP (0x60) | VECS (0x68) | VCS2 (0x70) |
 470          *      |-------------------------------------------------------------------
 471          * VECS | RCS (0x78) | VCS (0x80) | BCS (0x88) |  NOP (0x90) | VCS2 (0x98) |
 472          *      |-------------------------------------------------------------------
 473          * VCS2 | RCS (0xa0) | VCS (0xa8) | BCS (0xb0) | VECS (0xb8) | NOP  (0xc0) |
 474          *      |-------------------------------------------------------------------
 475          *
 476          * Generalization:
 477          *  f(x, y) := (x->id * NUM_RINGS * seqno_size) + (seqno_size * y->id)
 478          *  ie. transpose of g(x, y)
 479          *
 480          *       sync from      sync from    sync from    sync from     sync from
 481          *          RCS            VCS          BCS        VECS          VCS2
 482          *      --------------------------------------------------------------------
 483          *  RCS | NOP (0x00) | VCS (0x28) | BCS (0x50) | VECS (0x78) | VCS2 (0xa0) |
 484          *      |-------------------------------------------------------------------
 485          *  VCS | RCS (0x08) | NOP (0x30) | BCS (0x58) | VECS (0x80) | VCS2 (0xa8) |
 486          *      |-------------------------------------------------------------------
 487          *  BCS | RCS (0x10) | VCS (0x38) | NOP (0x60) | VECS (0x88) | VCS2 (0xb0) |
 488          *      |-------------------------------------------------------------------
 489          * VECS | RCS (0x18) | VCS (0x40) | BCS (0x68) |  NOP (0x90) | VCS2 (0xb8) |
 490          *      |-------------------------------------------------------------------
 491          * VCS2 | RCS (0x20) | VCS (0x48) | BCS (0x70) | VECS (0x98) |  NOP (0xc0) |
 492          *      |-------------------------------------------------------------------
 493          *
 494          * Generalization:
 495          *  g(x, y) := (y->id * NUM_RINGS * seqno_size) + (seqno_size * x->id)
 496          *  ie. transpose of f(x, y)
 497          */
 498         struct {
 499 #define GEN6_SEMAPHORE_LAST     VECS_HW
 500 #define GEN6_NUM_SEMAPHORES     (GEN6_SEMAPHORE_LAST + 1)
 501 #define GEN6_SEMAPHORES_MASK    GENMASK(GEN6_SEMAPHORE_LAST, 0)
 502                 struct {
 503                         /* our mbox written by others */
 504                         u32             wait[GEN6_NUM_SEMAPHORES];
 505                         /* mboxes this ring signals to */
 506                         i915_reg_t      signal[GEN6_NUM_SEMAPHORES];
 507                 } mbox;
 508
 509                 /* AKA wait() */
 510                 int     (*sync_to)(struct drm_i915_gem_request *req,
 511                                    struct drm_i915_gem_request *signal);
 512                 u32     *(*signal)(struct drm_i915_gem_request *req, u32 *cs);
 513         } semaphore;
 514
 515         struct intel_engine_execlists execlists;
 516
 517         /* Contexts are pinned whilst they are active on the GPU. The last
 518          * context executed remains active whilst the GPU is idle - the
 519          * switch away and write to the context object only occurs on the
 520          * next execution.  Contexts are only unpinned on retirement of the
 521          * following request ensuring that we can always write to the object
 522          * on the context switch even after idling. Across suspend, we switch
 523          * to the kernel context and trash it as the save may not happen
 524          * before the hardware is powered down.
 525          */
 526         struct i915_gem_context *last_retired_context;
 527
 528         /* We track the current MI_SET_CONTEXT in order to eliminate
 529          * redudant context switches. This presumes that requests are not
 530          * reordered! Or when they are the tracking is updated along with
 531          * the emission of individual requests into the legacy command
 532          * stream (ring).
 533          */
 534         struct i915_gem_context *legacy_active_context;
 535         struct i915_hw_ppgtt *legacy_active_ppgtt;
 536
 537         /* status_notifier: list of callbacks for context-switch changes */
 538         struct atomic_notifier_head context_status_notifier;
 539
 540         struct intel_engine_hangcheck hangcheck;
 541
 542 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 543 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 544         unsigned int flags;
 545
 546         /*
 547          * Table of commands the command parser needs to know about
 548          * for this engine.
 549          */
 550         DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER);
 551
 552         /*
 553          * Table of registers allowed in commands that read/write registers.
 554          */
 555         const struct drm_i915_reg_table *reg_tables;
 556         int reg_table_count;
 557
 558         /*
 559          * Returns the bitmask for the length field of the specified command.
 560          * Return 0 for an unrecognized/invalid command.
 561          *
 562          * If the command parser finds an entry for a command in the engine's
 563          * cmd_tables, it gets the command's length based on the table entry.
 564          * If not, it calls this function to determine the per-engine length
 565          * field encoding for the command (i.e. different opcode ranges use
 566          * certain bits to encode the command length in the header).
 567          */
 568         u32 (*get_cmd_length_mask)(u32 cmd_header);
 569
 570         struct {
 571                 /**
 572                  * @lock: Lock protecting the below fields.
 573                  */
 574                 spinlock_t lock;
 575                 /**
 576                  * @enabled: Reference count indicating number of listeners.
 577                  */
 578                 unsigned int enabled;
 579                 /**
 580                  * @active: Number of contexts currently scheduled in.
 581                  */
 582                 unsigned int active;
 583                 /**
 584                  * @enabled_at: Timestamp when busy stats were enabled.
 585                  */
 586                 ktime_t enabled_at;
 587                 /**
 588                  * @start: Timestamp of the last idle to active transition.
 589                  *
 590                  * Idle is defined as active == 0, active is active > 0.
 591                  */
 592                 ktime_t start;
 593                 /**
 594                  * @total: Total time this engine was busy.
 595                  *
 596                  * Accumulated time not counting the most recent block in cases
 597                  * where engine is currently busy (active > 0).
 598                  */
 599                 ktime_t total;
 600         } stats;
 601 };
 602
 603 static inline bool intel_engine_needs_cmd_parser(struct intel_engine_cs *engine)
 604 {
 605         return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER;
 606 }
 607
 608 static inline bool intel_engine_supports_stats(struct intel_engine_cs *engine)
 609 {
 610         return engine->flags & I915_ENGINE_SUPPORTS_STATS;
 611 }
 612
 613 static inline void
 614 execlists_set_active(struct intel_engine_execlists *execlists,
 615                      unsigned int bit)
 616 {
 617         __set_bit(bit, (unsigned long *)&execlists->active);
 618 }
 619
 620 static inline void
 621 execlists_clear_active(struct intel_engine_execlists *execlists,
 622                        unsigned int bit)
 623 {
 624         __clear_bit(bit, (unsigned long *)&execlists->active);
 625 }
 626
 627 static inline bool
 628 execlists_is_active(const struct intel_engine_execlists *execlists,
 629                     unsigned int bit)
 630 {
 631         return test_bit(bit, (unsigned long *)&execlists->active);
 632 }
 633
 634 void
 635 execlists_cancel_port_requests(struct intel_engine_execlists * const execlists);
 636
 637 void
 638 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists);
 639
 640 static inline unsigned int
 641 execlists_num_ports(const struct intel_engine_execlists * const execlists)
 642 {
 643         return execlists->port_mask + 1;
 644 }
 645
 646 static inline void
 647 execlists_port_complete(struct intel_engine_execlists * const execlists,
 648                         struct execlist_port * const port)
 649 {
 650         const unsigned int m = execlists->port_mask;
 651
 652         GEM_BUG_ON(port_index(port, execlists) != 0);
 653         GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
 654
 655         memmove(port, port + 1, m * sizeof(struct execlist_port));
 656         memset(port + m, 0, sizeof(struct execlist_port));
 657 }
 658
 659 static inline unsigned int
 660 intel_engine_flag(const struct intel_engine_cs *engine)
 661 {
 662         return BIT(engine->id);
 663 }
 664
 665 static inline u32
 666 intel_read_status_page(struct intel_engine_cs *engine, int reg)
 667 {
 668         /* Ensure that the compiler doesn't optimize away the load. */
 669         return READ_ONCE(engine->status_page.page_addr[reg]);
 670 }
 671
 672 static inline void
 673 intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 674 {
 675         /* Writing into the status page should be done sparingly. Since
 676          * we do when we are uncertain of the device state, we take a bit
 677          * of extra paranoia to try and ensure that the HWS takes the value
 678          * we give and that it doesn't end up trapped inside the CPU!
 679          */
 680         if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
 681                 mb();
 682                 clflush(&engine->status_page.page_addr[reg]);
 683                 engine->status_page.page_addr[reg] = value;
 684                 clflush(&engine->status_page.page_addr[reg]);
 685                 mb();
 686         } else {
 687                 WRITE_ONCE(engine->status_page.page_addr[reg], value);
 688         }
 689 }
 690
 691 /*
 692  * Reads a dword out of the status page, which is written to from the command
 693  * queue by automatic updates, MI_REPORT_HEAD, MI_STORE_DATA_INDEX, or
 694  * MI_STORE_DATA_IMM.
 695  *
 696  * The following dwords have a reserved meaning:
 697  * 0x00: ISR copy, updated when an ISR bit not set in the HWSTAM changes.
 698  * 0x04: ring 0 head pointer
 699  * 0x05: ring 1 head pointer (915-class)
 700  * 0x06: ring 2 head pointer (915-class)
 701  * 0x10-0x1b: Context status DWords (GM45)
 702  * 0x1f: Last written status offset. (GM45)
 703  * 0x20-0x2f: Reserved (Gen6+)
 704  *
 705  * The area from dword 0x30 to 0x3ff is available for driver usage.
 706  */
 707 #define I915_GEM_HWS_INDEX              0x30
 708 #define I915_GEM_HWS_INDEX_ADDR (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 709 #define I915_GEM_HWS_PREEMPT_INDEX      0x32
 710 #define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 711 #define I915_GEM_HWS_SCRATCH_INDEX      0x40
 712 #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 713
 714 #define I915_HWS_CSB_BUF0_INDEX         0x10
 715 #define I915_HWS_CSB_WRITE_INDEX        0x1f
 716 #define CNL_HWS_CSB_WRITE_INDEX         0x2f
 717
 718 struct intel_ring *
 719 intel_engine_create_ring(struct intel_engine_cs *engine, int size);
 720 int intel_ring_pin(struct intel_ring *ring,
 721                    struct drm_i915_private *i915,
 722                    unsigned int offset_bias);
 723 void intel_ring_reset(struct intel_ring *ring, u32 tail);
 724 unsigned int intel_ring_update_space(struct intel_ring *ring);
 725 void intel_ring_unpin(struct intel_ring *ring);
 726 void intel_ring_free(struct intel_ring *ring);
 727
 728 void intel_engine_stop(struct intel_engine_cs *engine);
 729 void intel_engine_cleanup(struct intel_engine_cs *engine);
 730
 731 void intel_legacy_submission_resume(struct drm_i915_private *dev_priv);
 732
 733 int __must_check intel_ring_cacheline_align(struct drm_i915_gem_request *req);
 734
 735 int intel_ring_wait_for_space(struct intel_ring *ring, unsigned int bytes);
 736 u32 __must_check *intel_ring_begin(struct drm_i915_gem_request *req,
 737                                    unsigned int n); /* n: dwords to reserve */
 738
 739 /*
 740  * intel_ring_advance - close a command packet opened by intel_ring_begin()
 741  *
 742  * Emits nothing. It is a marker for the reader: pair it with the preceding
 743  * intel_ring_begin() and check that the number of dwords written in between
 744  * equals the space reserved there. The only code is an assertion that @cs
 745  * ended up at the ring's current emit position (vaddr + emit).
 746  */
 747 static inline void
 748 intel_ring_advance(struct drm_i915_gem_request *req, u32 *cs)
 749 {
 750         GEM_BUG_ON(cs != req->ring->vaddr + req->ring->emit);
 751 }
 752
 753 /* Mask @pos with (size - 1); a true wrap only for power-of-two sizes. */
 754 static inline u32 intel_ring_wrap(const struct intel_ring *ring, u32 pos)
 755 {
 756         return (ring->size - 1) & pos;
 757 }
 758
 759 /* Ring offset of @addr; never ring->size (== 0) as that hangs some GPUs. */
 760 static inline u32
 761 intel_ring_offset(const struct drm_i915_gem_request *req, void *addr)
 762 {
 763         const u32 delta = addr - req->ring->vaddr;
 764         GEM_BUG_ON(delta > req->ring->size);
 765         return intel_ring_wrap(req->ring, delta);
 766 }
 767
 768 /*
 769  * assert_ring_tail_valid - sanity check a tail offset for @ring
 770  *
 771  * Three independent assertions, deliberately not folded into a single
 772  * expression so that whichever one fires identifies the cause:
 773  *  - @tail is a multiple of 8;
 774  *  - @tail lies inside the ring, strictly below ring->size;
 775  *  - @tail is not behind ring->head within ring->head's cacheline.
 776  *
 777  * The last one follows from "Ring Buffer Use":
 778  *      Gen2 BSpec "1. Programming Environment" / 1.4.4.6
 779  *      Gen3 BSpec "1c Memory Interface Functions" / 2.3.4.5
 780  *      Gen4+ BSpec "1c Memory Interface and Command Stream" / 5.3.4.5
 781  * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
 782  * same cacheline, the Head Pointer must not be greater than the Tail
 783  * Pointer."
 784  * ring->head is the last known location of the actual RING_HEAD: the
 785  * hardware may have moved on, but in the worst case it has not, so
 786  * RING_TAIL must never be programmed to advance into that cacheline.
 787  */
 788 static inline void
 789 assert_ring_tail_valid(const struct intel_ring *ring, unsigned int tail)
 790 {
 791         GEM_BUG_ON(!IS_ALIGNED(tail, 8));
 792         GEM_BUG_ON(tail >= ring->size);
 793         GEM_BUG_ON(tail < ring->head &&
 794                    round_down(tail, CACHELINE_BYTES) ==
 795                    round_down(ring->head, CACHELINE_BYTES));
 796 }
 797
 798 /*
 799  * Validate @tail, record it as ring->tail and hand it back. Tail writes
 800  * are strictly ordered, yet nothing serialises them against readers:
 801  * i915_gem_request_retire() may read the tail just as execlists updates
 802  * it, as the breadcrumb is complete before the context switch is seen.
 803  */
 804 static inline unsigned int
 805 intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
 806 {
 807         assert_ring_tail_valid(ring, tail);
 808         ring->tail = tail;
 809         return tail;
 810 }
 811
 812 void intel_engine_init_global_seqno(struct intel_engine_cs *engine, u32 seqno);
 813
 814 void intel_engine_setup_common(struct intel_engine_cs *engine);
 815 int intel_engine_init_common(struct intel_engine_cs *engine);
 816 int intel_engine_create_scratch(struct intel_engine_cs *engine, int size);
 817 void intel_engine_cleanup_common(struct intel_engine_cs *engine);
 818
 819 int intel_init_render_ring_buffer(struct intel_engine_cs *engine);
 820 int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine);
 821 int intel_init_blt_ring_buffer(struct intel_engine_cs *engine);
 822 int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine);
 823
 824 u64 intel_engine_get_active_head(struct intel_engine_cs *engine);
 825 u64 intel_engine_get_last_batch_head(struct intel_engine_cs *engine);
 826
 827 static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine)
 828 {
 829         return intel_read_status_page(engine, I915_GEM_HWS_INDEX); /* seqno slot in the HWS */
 830 }
 831
 832 /*
 833  * Peek at engine->timeline->seqno, i.e. the tail of the submit queue rather
 834  * than the queue itself, as a hint to the engine's current active state.
 835  * Callers are expected neither to hold engine->timeline->lock nor to
 836  * serialise the result with anything else: it is a hint and nothing
 837  * more, so only a READ_ONCE() is used.
 838  */
 839 static inline u32 intel_engine_last_submit(struct intel_engine_cs *engine)
 840 {
 841         return READ_ONCE(engine->timeline->seqno);
 842 }
 843
 844 int init_workarounds_ring(struct intel_engine_cs *engine);
 845 int intel_ring_workarounds_emit(struct drm_i915_gem_request *req);
 846
 847 void intel_engine_get_instdone(struct intel_engine_cs *engine,
 848                                struct intel_instdone *instdone);
 849
 850 /*
 851  * Arbitrary size for largest possible 'add request' sequence. The code paths
 852  * are complex and variable. Empirical measurement shows that the worst case
 853  * is BDW at 192 bytes (6 + 6 + 36 dwords), then ILK at 136 bytes. However,
 854  * we need to allocate double the largest single packet within that emission
 855  * to account for tail wraparound (so 6 + 6 + 72 dwords for BDW).
 856  */
 857 #define MIN_SPACE_FOR_ADD_REQUEST 336 /* bytes: (6 + 6 + 72) dwords * 4 */
 858
 859 static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 860 {
 861         return I915_GEM_HWS_INDEX_ADDR + engine->status_page.ggtt_offset; /* seqno slot + page base */
 862 }
 863
 864 static inline u32 intel_hws_preempt_done_address(struct intel_engine_cs *engine)
 865 {
 866         return I915_GEM_HWS_PREEMPT_ADDR + engine->status_page.ggtt_offset; /* preempt slot + page base */
 867 }
 868
 869 /* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
 870 int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
 871
 872 static inline void intel_wait_init(struct intel_wait *wait,
 873                                    struct drm_i915_gem_request *rq)
 874 {
 875         wait->request = rq;
 876         wait->tsk = current; /* the task executing this call */
 877 }
 878
 879 static inline void intel_wait_init_for_seqno(struct intel_wait *wait, u32 seqno)
 880 {
 881         wait->seqno = seqno;
 882         wait->tsk = current; /* the task executing this call */
 883 }
 884
 885 static inline bool intel_wait_has_seqno(const struct intel_wait *wait)
 886 {
 887         return wait->seqno != 0; /* a zero seqno counts as "none" */
 888 }
 889
 890 /* Store @seqno in @wait; returns whether the wait now has a seqno. */
 891 static inline bool intel_wait_update_seqno(struct intel_wait *wait, u32 seqno)
 892 {
 893         wait->seqno = seqno;
 894         return intel_wait_has_seqno(wait);
 895 }
 896
 897 static inline bool intel_wait_update_request(struct intel_wait *wait,
 898                 const struct drm_i915_gem_request *rq)
 899 {
 900         /* Re-key @wait to the global seqno of @rq. */
 901         return intel_wait_update_seqno(wait, i915_gem_request_global_seqno(rq));
 902 }
 903
 904 static inline bool
 905 intel_wait_check_seqno(const struct intel_wait *wait, u32 seqno)
 906 {
 907         return seqno == wait->seqno; /* exact match with the stored seqno */
 908 }
 909
 910 static inline bool intel_wait_check_request(const struct intel_wait *wait,
 911                 const struct drm_i915_gem_request *rq)
 912 {
 913         /* Is @wait keyed to the global seqno of @rq? */
 914         return intel_wait_check_seqno(wait, i915_gem_request_global_seqno(rq));
 915 }
 916
 917 static inline bool intel_wait_complete(const struct intel_wait *wait)
 918 {
 919         return RB_EMPTY_NODE(&wait->node); /* complete == rb node marked empty */
 920 }
 921
 922 bool intel_engine_add_wait(struct intel_engine_cs *engine,
 923                            struct intel_wait *wait);
 924 void intel_engine_remove_wait(struct intel_engine_cs *engine,
 925                               struct intel_wait *wait);
 926 void intel_engine_enable_signaling(struct drm_i915_gem_request *request,
 927                                    bool wakeup);
 928 void intel_engine_cancel_signaling(struct drm_i915_gem_request *request);
 929
 930 static inline bool intel_engine_has_waiter(const struct intel_engine_cs *engine)
 931 {
 932         return READ_ONCE(engine->breadcrumbs.irq_wait); /* unlocked peek */
 933 }
 934
 935 unsigned int intel_engine_wakeup(struct intel_engine_cs *engine);
 936 #define ENGINE_WAKEUP_WAITER BIT(0)
 937 #define ENGINE_WAKEUP_ASLEEP BIT(1)
 938
 939 void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine);
 940 void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine);
 941
 942 void __intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine);
 943 void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine);
 944
 945 void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine);
 946 void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
 947 bool intel_breadcrumbs_busy(struct intel_engine_cs *engine);
 948
 949 /* Fill six dwords at @batch: PIPE_CONTROL(6), @flags, @offset, 0, 0, 0. */
 950 static inline u32 *gen8_emit_pipe_control(u32 *batch, u32 flags, u32 offset)
 951 {
 952         *batch++ = GFX_OP_PIPE_CONTROL(6);
 953         *batch++ = flags;
 954         *batch++ = offset;
 955         memset(batch, 0, 3 * sizeof(u32)); /* remaining dwords stay zero */
 956
 957         return batch + 3;
 958 }
 959
 960 /*
 961  * Emit a 6 dword PIPE_CONTROL doing a qword write of @value to @gtt_offset
 962  * in the global GTT; being a qword write, the offset must be 8 byte
 963  * aligned. Returns the advanced @cs.
 964  *
 965  * w/a: post sync ops following a GPGPU operation need a prior CS_STALL,
 966  * which is emitted by the flush following the batch.
 967  */
 968 static inline u32 *
 969 gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset)
 970 {
 971         GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8));
 972         cs[0] = GFX_OP_PIPE_CONTROL(6);
 973         cs[1] = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
 974                 PIPE_CONTROL_QW_WRITE;
 975         cs[2] = gtt_offset;
 976         cs[3] = 0;
 977         cs[4] = value;
 978         cs[5] = 0; /* we're thrashing one dword of HWS */
 979         return cs + 6;
 980 }
 981
 982 /*
 983  * Emit a 4 dword MI_FLUSH_DW (STOREDW op) of @value to @gtt_offset, which
 984  * must be 8 byte aligned (for both QW/DW write types) with bit 5 clear (w/a).
 985  */
 986 static inline u32 *
 987 gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
 988 {
 989         GEM_BUG_ON(gtt_offset & (1 << 5));
 990         GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8));
 991         cs[0] = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
 992         cs[1] = gtt_offset | MI_FLUSH_DW_USE_GTT;
 993         cs[2] = 0;
 994         cs[3] = value;
 995         return cs + 4;
 996 }
 997
 998 bool intel_engine_is_idle(struct intel_engine_cs *engine);
 999 bool intel_engines_are_idle(struct drm_i915_private *dev_priv);
1000
1001 bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine);
1002
1003 void intel_engines_park(struct drm_i915_private *i915);
1004 void intel_engines_unpark(struct drm_i915_private *i915);
1005
1006 void intel_engines_reset_default_submission(struct drm_i915_private *i915);
1007 unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915);
1008
1009 bool intel_engine_can_store_dword(struct intel_engine_cs *engine);
1010
1011 void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *p);
1012
1013 struct intel_engine_cs *
1014 intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance);
1015
1016 /* Count a context entering @engine; busy time starts on the first one. */
1017 static inline void intel_engine_context_in(struct intel_engine_cs *engine)
1018 {
1019         unsigned long irqflags;
1020
1021         if (!READ_ONCE(engine->stats.enabled)) /* unlocked early-out */
1022                 return;
1023
1024         spin_lock_irqsave(&engine->stats.lock, irqflags);
1025         if (engine->stats.enabled > 0) { /* recheck under the lock */
1026                 if (!engine->stats.active)
1027                         engine->stats.start = ktime_get();
1028                 engine->stats.active++;
1029                 GEM_BUG_ON(engine->stats.active == 0); /* wrapped */
1030         }
1031         spin_unlock_irqrestore(&engine->stats.lock, irqflags);
1032 }
1033
1034 /*
1035  * Account a context leaving @engine: drop the active count and, when that
1036  * leaves the engine idle, add the elapsed busy period to stats.total.
1037  */
1038 static inline void intel_engine_context_out(struct intel_engine_cs *engine)
1039 {
1040         unsigned long irqflags;
1041
1042         if (!READ_ONCE(engine->stats.enabled)) /* unlocked early-out */
1043                 return;
1044
1045         spin_lock_irqsave(&engine->stats.lock, irqflags);
1046
1047         if (engine->stats.enabled > 0) { /* recheck under the lock */
1048                 ktime_t elapsed;
1049
1050                 if (engine->stats.active == 0) {
1051                         /*
1052                          * After stats are turned on, context out may be
1053                          * the first event seen; account from the time
1054                          * stats gathering was enabled.
1055                          */
1056                         elapsed = ktime_sub(ktime_get(),
1057                                             engine->stats.enabled_at);
1058                         engine->stats.total =
1059                                 ktime_add(engine->stats.total, elapsed);
1060                 } else if (--engine->stats.active == 0) {
1061                         /* Last active context gone: close the period. */
1062                         elapsed = ktime_sub(ktime_get(), engine->stats.start);
1063                         engine->stats.total =
1064                                 ktime_add(engine->stats.total, elapsed);
1065                 }
1066         }
1067
1068         spin_unlock_irqrestore(&engine->stats.lock, irqflags);
1069 }
1070
1071 int intel_enable_engine_stats(struct intel_engine_cs *engine);
1072 void intel_disable_engine_stats(struct intel_engine_cs *engine);
1073
1074 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine);
1075
1076 #endif /* _INTEL_RINGBUFFER_H_ */