drivers/gpu/drm/i915/gt/intel_lrc.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Ben Widawsky <ben@bwidawsk.net>
  25  *    Michel Thierry <michel.thierry@intel.com>
  26  *    Thomas Daniel <thomas.daniel@intel.com>
  27  *    Oscar Mateo <oscar.mateo@intel.com>
  28  *
  29  */
  30
  31 /**
  32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33  *
  34  * Motivation:
  35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36  * These expanded contexts enable a number of new abilities, especially
  37  * "Execlists" (also implemented in this file).
  38  *
  39  * One of the main differences with the legacy HW contexts is that logical
  40  * ring contexts incorporate many more things to the context's state, like
  41  * PDPs or ringbuffer control registers:
  42  *
  43  * The reason why PDPs are included in the context is straightforward: as
  44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
  45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
  46  * instead, the GPU will do it for you on the context switch.
  47  *
  48  * But, what about the ringbuffer control registers (head, tail, etc..)?
  49  * shouldn't we just need a set of those per engine command streamer? This is
  50  * where the name "Logical Rings" starts to make sense: by virtualizing the
  51  * rings, the engine cs shifts to a new "ring buffer" with every context
  52  * switch. When you want to submit a workload to the GPU you: A) choose your
  53  * context, B) find its appropriate virtualized ring, C) write commands to it
  54  * and then, finally, D) tell the GPU to switch to that context.
  55  *
  56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
  57  * to a contexts is via a context execution list, ergo "Execlists".
  58  *
  59  * LRC implementation:
  60  * Regarding the creation of contexts, we have:
  61  *
  62  * - One global default context.
  63  * - One local default context for each opened fd.
  64  * - One local extra context for each context create ioctl call.
  65  *
  66  * Now that ringbuffers belong per-context (and not per-engine, like before)
  67  * and that contexts are uniquely tied to a given engine (and not reusable,
  68  * like before) we need:
  69  *
  70  * - One ringbuffer per-engine inside each context.
  71  * - One backing object per-engine inside each context.
  72  *
  73  * The global default context starts its life with these new objects fully
  74  * allocated and populated. The local default context for each opened fd is
  75  * more complex, because we don't know at creation time which engine is going
  76  * to use them. To handle this, we have implemented a deferred creation of LR
  77  * contexts:
  78  *
  79  * The local context starts its life as a hollow or blank holder, that only
  80  * gets populated for a given engine once we receive an execbuffer. If later
  81  * on we receive another execbuffer ioctl for the same context but a different
  82  * engine, we allocate/populate a new ringbuffer and context backing object and
  83  * so on.
  84  *
  85  * Finally, regarding local contexts created using the ioctl call: as they are
  86  * only allowed with the render ring, we can allocate & populate them right
  87  * away (no need to defer anything, at least for now).
  88  *
  89  * Execlists implementation:
  90  * Execlists are the new method by which, on gen8+ hardware, workloads are
  91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
  92  * This method works as follows:
  93  *
  94  * When a request is committed, its commands (the BB start and any leading or
  95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96  * for the appropriate context. The tail pointer in the hardware context is not
  97  * updated at this time, but instead, kept by the driver in the ringbuffer
  98  * structure. A structure representing this request is added to a request queue
  99  * for the appropriate engine: this structure contains a copy of the context's
 100  * tail after the request was written to the ring buffer and a pointer to the
 101  * context itself.
 102  *
 103  * If the engine's request queue was empty before the request was added, the
 104  * queue is processed immediately. Otherwise the queue will be processed during
 105  * a context switch interrupt. In any case, elements on the queue will get sent
 106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 107  * globally unique 20-bits submission ID.
 108  *
 109  * When execution of a request completes, the GPU updates the context status
 110  * buffer with a context complete event and generates a context switch interrupt.
 111  * During the interrupt handling, the driver examines the events in the buffer:
 112  * for each context complete event, if the announced ID matches that on the head
 113  * of the request queue, then that request is retired and removed from the queue.
 114  *
 115  * After processing, if any requests were retired and the queue is not empty
 116  * then a new execution list can be submitted. The two requests at the front of
 117  * the queue are next to be submitted but since a context may not occur twice in
 118  * an execution list, if subsequent requests have the same ID as the first then
 119  * the two requests must be combined. This is done simply by discarding requests
 120  * at the head of the queue until either only one requests is left (in which case
 121  * we use a NULL second context) or the first two requests have unique IDs.
 122  *
 123  * By always executing the first two requests in the queue the driver ensures
 124  * that the GPU is kept as busy as possible. In the case where a single context
 125  * completes but a second context is still executing, the request for this second
 126  * context will be at the head of the queue when we remove the first one. This
 127  * request will then be resubmitted along with a new request for a different context,
 128  * which will cause the hardware to continue executing the second request and queue
 129  * the new request (the GPU detects the condition of a context getting preempted
 130  * with the same context and optimizes the context switch flow by not doing
 131  * preemption, but just sampling the new tail pointer).
 132  *
 133  */
 134 #include <linux/interrupt.h>
 135
 136 #include "i915_drv.h"
 137 #include "i915_gem_render_state.h"
 138 #include "i915_vgpu.h"
 139 #include "intel_lrc_reg.h"
 140 #include "intel_mocs.h"
 141 #include "intel_reset.h"
 142 #include "intel_workarounds.h"
 143
 144 #define RING_EXECLIST_QFULL             (1 << 0x2)
 145 #define RING_EXECLIST1_VALID            (1 << 0x3)
 146 #define RING_EXECLIST0_VALID            (1 << 0x4)
 147 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
 148 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
 149 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
 150
 151 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
 152 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
 153 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
 154 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
 155 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
 156 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
 157
 158 #define GEN8_CTX_STATUS_COMPLETED_MASK \
 159          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
 160
 161 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 162 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
 163 #define WA_TAIL_DWORDS 2
 164 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 165
 166 #define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT | I915_PRIORITY_NOSEMAPHORE)
 167
 168 static int execlists_context_deferred_alloc(struct intel_context *ce,
 169                                             struct intel_engine_cs *engine);
 170 static void execlists_init_reg_state(u32 *reg_state,
 171                                      struct intel_context *ce,
 172                                      struct intel_engine_cs *engine,
 173                                      struct intel_ring *ring);
 174
 175 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 176 {
 177         return rb_entry(rb, struct i915_priolist, node);
 178 }
 179
 180 static inline int rq_prio(const struct i915_request *rq)
 181 {
 182         return rq->sched.attr.priority;
 183 }
 184
 185 static int effective_prio(const struct i915_request *rq)
 186 {
 187         int prio = rq_prio(rq);
 188
 189         /*
 190          * On unwinding the active request, we give it a priority bump
 191          * equivalent to a freshly submitted request. This protects it from
 192          * being gazumped again, but it would be preferable if we didn't
 193          * let it be gazumped in the first place!
 194          *
 195          * See __unwind_incomplete_requests()
 196          */
 197         if (~prio & ACTIVE_PRIORITY && __i915_request_has_started(rq)) {
 198                 /*
 199                  * After preemption, we insert the active request at the
 200                  * end of the new priority level. This means that we will be
 201                  * _lower_ priority than the preemptee all things equal (and
 202                  * so the preemption is valid), so adjust our comparison
 203                  * accordingly.
 204                  */
 205                 prio |= ACTIVE_PRIORITY;
 206                 prio--;
 207         }
 208
 209         /* Restrict mere WAIT boosts from triggering preemption */
 210         return prio | __NO_PREEMPTION;
 211 }
 212
 213 static int queue_prio(const struct intel_engine_execlists *execlists)
 214 {
 215         struct i915_priolist *p;
 216         struct rb_node *rb;
 217
 218         rb = rb_first_cached(&execlists->queue);
 219         if (!rb)
 220                 return INT_MIN;
 221
 222         /*
 223          * As the priolist[] are inverted, with the highest priority in [0],
 224          * we have to flip the index value to become priority.
 225          */
 226         p = to_priolist(rb);
 227         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
 228 }
 229
 230 static inline bool need_preempt(const struct intel_engine_cs *engine,
 231                                 const struct i915_request *rq)
 232 {
 233         int last_prio;
 234
 235         if (!engine->preempt_context)
 236                 return false;
 237
 238         if (i915_request_completed(rq))
 239                 return false;
 240
 241         /*
 242          * Check if the current priority hint merits a preemption attempt.
 243          *
 244          * We record the highest value priority we saw during rescheduling
 245          * prior to this dequeue, therefore we know that if it is strictly
 246          * less than the current tail of ESLP[0], we do not need to force
 247          * a preempt-to-idle cycle.
 248          *
 249          * However, the priority hint is a mere hint that we may need to
 250          * preempt. If that hint is stale or we may be trying to preempt
 251          * ourselves, ignore the request.
 252          */
 253         last_prio = effective_prio(rq);
 254         if (!__execlists_need_preempt(engine->execlists.queue_priority_hint,
 255                                       last_prio))
 256                 return false;
 257
 258         /*
 259          * Check against the first request in ELSP[1], it will, thanks to the
 260          * power of PI, be the highest priority of that context.
 261          */
 262         if (!list_is_last(&rq->link, &engine->timeline.requests) &&
 263             rq_prio(list_next_entry(rq, link)) > last_prio)
 264                 return true;
 265
 266         /*
 267          * If the inflight context did not trigger the preemption, then maybe
 268          * it was the set of queued requests? Pick the highest priority in
 269          * the queue (the first active priolist) and see if it deserves to be
 270          * running instead of ELSP[0].
 271          *
 272          * The highest priority request in the queue can not be either
 273          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
 274          * context, it's priority would not exceed ELSP[0] aka last_prio.
 275          */
 276         return queue_prio(&engine->execlists) > last_prio;
 277 }
 278
 279 __maybe_unused static inline bool
 280 assert_priority_queue(const struct i915_request *prev,
 281                       const struct i915_request *next)
 282 {
 283         const struct intel_engine_execlists *execlists =
 284                 &prev->engine->execlists;
 285
 286         /*
 287          * Without preemption, the prev may refer to the still active element
 288          * which we refuse to let go.
 289          *
 290          * Even with preemption, there are times when we think it is better not
 291          * to preempt and leave an ostensibly lower priority request in flight.
 292          */
 293         if (port_request(execlists->port) == prev)
 294                 return true;
 295
 296         return rq_prio(prev) >= rq_prio(next);
 297 }
 298
 299 /*
 300  * The context descriptor encodes various attributes of a context,
 301  * including its GTT address and some flags. Because it's fairly
 302  * expensive to calculate, we'll just do it once and cache the result,
 303  * which remains valid until the context is unpinned.
 304  *
 305  * This is what a descriptor looks like, from LSB to MSB::
 306  *
 307  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 308  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 309  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 310  *      bits 53-54:    mbz, reserved for use by hardware
 311  *      bits 55-63:    group ID, currently unused and set to 0
 312  *
 313  * Starting from Gen11, the upper dword of the descriptor has a new format:
 314  *
 315  *      bits 32-36:    reserved
 316  *      bits 37-47:    SW context ID
 317  *      bits 48:53:    engine instance
 318  *      bit 54:        mbz, reserved for use by hardware
 319  *      bits 55-60:    SW counter
 320  *      bits 61-63:    engine class
 321  *
 322  * engine info, SW context ID and SW counter need to form a unique number
 323  * (Context ID) per lrc.
 324  */
 325 static u64
 326 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
 327 {
 328         struct i915_gem_context *ctx = ce->gem_context;
 329         u64 desc;
 330
 331         BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
 332         BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
 333
 334         desc = ctx->desc_template;                              /* bits  0-11 */
 335         GEM_BUG_ON(desc & GENMASK_ULL(63, 12));
 336
 337         desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
 338                                                                 /* bits 12-31 */
 339         GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
 340
 341         /*
 342          * The following 32bits are copied into the OA reports (dword 2).
 343          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
 344          * anything below.
 345          */
 346         if (INTEL_GEN(engine->i915) >= 11) {
 347                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
 348                 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
 349                                                                 /* bits 37-47 */
 350
 351                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
 352                                                                 /* bits 48-53 */
 353
 354                 /* TODO: decide what to do with SW counter (bits 55-60) */
 355
 356                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
 357                                                                 /* bits 61-63 */
 358         } else {
 359                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
 360                 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;   /* bits 32-52 */
 361         }
 362
 363         return desc;
 364 }
 365
 366 static void unwind_wa_tail(struct i915_request *rq)
 367 {
 368         rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
 369         assert_ring_tail_valid(rq->ring, rq->tail);
 370 }
 371
 372 static struct i915_request *
 373 __unwind_incomplete_requests(struct intel_engine_cs *engine)
 374 {
 375         struct i915_request *rq, *rn, *active = NULL;
 376         struct list_head *uninitialized_var(pl);
 377         int prio = I915_PRIORITY_INVALID | ACTIVE_PRIORITY;
 378
 379         lockdep_assert_held(&engine->timeline.lock);
 380
 381         list_for_each_entry_safe_reverse(rq, rn,
 382                                          &engine->timeline.requests,
 383                                          link) {
 384                 if (i915_request_completed(rq))
 385                         break;
 386
 387                 __i915_request_unsubmit(rq);
 388                 unwind_wa_tail(rq);
 389
 390                 GEM_BUG_ON(rq->hw_context->active);
 391
 392                 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
 393                 if (rq_prio(rq) != prio) {
 394                         prio = rq_prio(rq);
 395                         pl = i915_sched_lookup_priolist(engine, prio);
 396                 }
 397                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 398
 399                 list_add(&rq->sched.link, pl);
 400
 401                 active = rq;
 402         }
 403
 404         /*
 405          * The active request is now effectively the start of a new client
 406          * stream, so give it the equivalent small priority bump to prevent
 407          * it being gazumped a second time by another peer.
 408          *
 409          * Note we have to be careful not to apply a priority boost to a request
 410          * still spinning on its semaphores. If the request hasn't started, that
 411          * means it is still waiting for its dependencies to be signaled, and
 412          * if we apply a priority boost to this request, we will boost it past
 413          * its signalers and so break PI.
 414          *
 415          * One consequence of this preemption boost is that we may jump
 416          * over lesser priorities (such as I915_PRIORITY_WAIT), effectively
 417          * making those priorities non-preemptible. They will be moved forward
 418          * in the priority queue, but they will not gain immediate access to
 419          * the GPU.
 420          */
 421         if (~prio & ACTIVE_PRIORITY && __i915_request_has_started(active)) {
 422                 prio |= ACTIVE_PRIORITY;
 423                 active->sched.attr.priority = prio;
 424                 list_move_tail(&active->sched.link,
 425                                i915_sched_lookup_priolist(engine, prio));
 426         }
 427
 428         return active;
 429 }
 430
 431 struct i915_request *
 432 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
 433 {
 434         struct intel_engine_cs *engine =
 435                 container_of(execlists, typeof(*engine), execlists);
 436
 437         return __unwind_incomplete_requests(engine);
 438 }
 439
 440 static inline void
 441 execlists_context_status_change(struct i915_request *rq, unsigned long status)
 442 {
 443         /*
 444          * Only used when GVT-g is enabled now. When GVT-g is disabled,
 445          * The compiler should eliminate this function as dead-code.
 446          */
 447         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
 448                 return;
 449
 450         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
 451                                    status, rq);
 452 }
 453
 454 inline void
 455 execlists_user_begin(struct intel_engine_execlists *execlists,
 456                      const struct execlist_port *port)
 457 {
 458         execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
 459 }
 460
 461 inline void
 462 execlists_user_end(struct intel_engine_execlists *execlists)
 463 {
 464         execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
 465 }
 466
 467 static inline void
 468 execlists_context_schedule_in(struct i915_request *rq)
 469 {
 470         GEM_BUG_ON(rq->hw_context->active);
 471
 472         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
 473         intel_engine_context_in(rq->engine);
 474         rq->hw_context->active = rq->engine;
 475 }
 476
 477 static inline void
 478 execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
 479 {
 480         rq->hw_context->active = NULL;
 481         intel_engine_context_out(rq->engine);
 482         execlists_context_status_change(rq, status);
 483         trace_i915_request_out(rq);
 484 }
 485
 486 static u64 execlists_update_context(struct i915_request *rq)
 487 {
 488         struct intel_context *ce = rq->hw_context;
 489
 490         ce->lrc_reg_state[CTX_RING_TAIL + 1] =
 491                 intel_ring_set_tail(rq->ring, rq->tail);
 492
 493         /*
 494          * Make sure the context image is complete before we submit it to HW.
 495          *
 496          * Ostensibly, writes (including the WCB) should be flushed prior to
 497          * an uncached write such as our mmio register access, the empirical
 498          * evidence (esp. on Braswell) suggests that the WC write into memory
 499          * may not be visible to the HW prior to the completion of the UC
 500          * register write and that we may begin execution from the context
 501          * before its image is complete leading to invalid PD chasing.
 502          *
 503          * Furthermore, Braswell, at least, wants a full mb to be sure that
 504          * the writes are coherent in memory (visible to the GPU) prior to
 505          * execution, and not just visible to other CPUs (as is the result of
 506          * wmb).
 507          */
 508         mb();
 509         return ce->lrc_desc;
 510 }
 511
 512 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
 513 {
 514         if (execlists->ctrl_reg) {
 515                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
 516                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
 517         } else {
 518                 writel(upper_32_bits(desc), execlists->submit_reg);
 519                 writel(lower_32_bits(desc), execlists->submit_reg);
 520         }
 521 }
 522
 523 static void execlists_submit_ports(struct intel_engine_cs *engine)
 524 {
 525         struct intel_engine_execlists *execlists = &engine->execlists;
 526         struct execlist_port *port = execlists->port;
 527         unsigned int n;
 528
 529         /*
 530          * We can skip acquiring intel_runtime_pm_get() here as it was taken
 531          * on our behalf by the request (see i915_gem_mark_busy()) and it will
 532          * not be relinquished until the device is idle (see
 533          * i915_gem_idle_work_handler()). As a precaution, we make sure
 534          * that all ELSP are drained i.e. we have processed the CSB,
 535          * before allowing ourselves to idle and calling intel_runtime_pm_put().
 536          */
 537         GEM_BUG_ON(!engine->i915->gt.awake);
 538
 539         /*
 540          * ELSQ note: the submit queue is not cleared after being submitted
 541          * to the HW so we need to make sure we always clean it up. This is
 542          * currently ensured by the fact that we always write the same number
 543          * of elsq entries, keep this in mind before changing the loop below.
 544          */
 545         for (n = execlists_num_ports(execlists); n--; ) {
 546                 struct i915_request *rq;
 547                 unsigned int count;
 548                 u64 desc;
 549
 550                 rq = port_unpack(&port[n], &count);
 551                 if (rq) {
 552                         GEM_BUG_ON(count > !n);
 553                         if (!count++)
 554                                 execlists_context_schedule_in(rq);
 555                         port_set(&port[n], port_pack(rq, count));
 556                         desc = execlists_update_context(rq);
 557                         GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 558
 559                         GEM_TRACE("%s in[%d]:  ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n",
 560                                   engine->name, n,
 561                                   port[n].context_id, count,
 562                                   rq->fence.context, rq->fence.seqno,
 563                                   hwsp_seqno(rq),
 564                                   rq_prio(rq));
 565                 } else {
 566                         GEM_BUG_ON(!n);
 567                         desc = 0;
 568                 }
 569
 570                 write_desc(execlists, desc, n);
 571         }
 572
 573         /* we need to manually load the submit queue */
 574         if (execlists->ctrl_reg)
 575                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
 576
 577         execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 578 }
 579
 580 static bool ctx_single_port_submission(const struct intel_context *ce)
 581 {
 582         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
 583                 i915_gem_context_force_single_submission(ce->gem_context));
 584 }
 585
 586 static bool can_merge_ctx(const struct intel_context *prev,
 587                           const struct intel_context *next)
 588 {
 589         if (prev != next)
 590                 return false;
 591
 592         if (ctx_single_port_submission(prev))
 593                 return false;
 594
 595         return true;
 596 }
 597
 598 static bool can_merge_rq(const struct i915_request *prev,
 599                          const struct i915_request *next)
 600 {
 601         GEM_BUG_ON(!assert_priority_queue(prev, next));
 602
 603         if (!can_merge_ctx(prev->hw_context, next->hw_context))
 604                 return false;
 605
 606         return true;
 607 }
 608
 609 static void port_assign(struct execlist_port *port, struct i915_request *rq)
 610 {
 611         GEM_BUG_ON(rq == port_request(port));
 612
 613         if (port_isset(port))
 614                 i915_request_put(port_request(port));
 615
 616         port_set(port, port_pack(i915_request_get(rq), port_count(port)));
 617 }
 618
 619 static void inject_preempt_context(struct intel_engine_cs *engine)
 620 {
 621         struct intel_engine_execlists *execlists = &engine->execlists;
 622         struct intel_context *ce = engine->preempt_context;
 623         unsigned int n;
 624
 625         GEM_BUG_ON(execlists->preempt_complete_status !=
 626                    upper_32_bits(ce->lrc_desc));
 627
 628         /*
 629          * Switch to our empty preempt context so
 630          * the state of the GPU is known (idle).
 631          */
 632         GEM_TRACE("%s\n", engine->name);
 633         for (n = execlists_num_ports(execlists); --n; )
 634                 write_desc(execlists, 0, n);
 635
 636         write_desc(execlists, ce->lrc_desc, n);
 637
 638         /* we need to manually load the submit queue */
 639         if (execlists->ctrl_reg)
 640                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
 641
 642         execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 643         execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
 644
 645         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
 646 }
 647
 648 static void complete_preempt_context(struct intel_engine_execlists *execlists)
 649 {
 650         GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
 651
 652         if (inject_preempt_hang(execlists))
 653                 return;
 654
 655         execlists_cancel_port_requests(execlists);
 656         __unwind_incomplete_requests(container_of(execlists,
 657                                                   struct intel_engine_cs,
 658                                                   execlists));
 659 }
 660
 661 static void execlists_dequeue(struct intel_engine_cs *engine)
 662 {
 663         struct intel_engine_execlists * const execlists = &engine->execlists;
 664         struct execlist_port *port = execlists->port;
 665         const struct execlist_port * const last_port =
 666                 &execlists->port[execlists->port_mask];
 667         struct i915_request *last = port_request(port);
 668         struct rb_node *rb;
 669         bool submit = false;
 670
 671         /*
 672          * Hardware submission is through 2 ports. Conceptually each port
 673          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
 674          * static for a context, and unique to each, so we only execute
 675          * requests belonging to a single context from each ring. RING_HEAD
 676          * is maintained by the CS in the context image, it marks the place
 677          * where it got up to last time, and through RING_TAIL we tell the CS
 678          * where we want to execute up to this time.
 679          *
 680          * In this list the requests are in order of execution. Consecutive
 681          * requests from the same context are adjacent in the ringbuffer. We
 682          * can combine these requests into a single RING_TAIL update:
 683          *
 684          *              RING_HEAD...req1...req2
 685          *                                    ^- RING_TAIL
 686          * since to execute req2 the CS must first execute req1.
 687          *
 688          * Our goal then is to point each port to the end of a consecutive
 689          * sequence of requests as being the most optimal (fewest wake ups
 690          * and context switches) submission.
 691          */
 692
 693         if (last) {
 694                 /*
 695                  * Don't resubmit or switch until all outstanding
 696                  * preemptions (lite-restore) are seen. Then we
 697                  * know the next preemption status we see corresponds
 698                  * to this ELSP update.
 699                  */
 700                 GEM_BUG_ON(!execlists_is_active(execlists,
 701                                                 EXECLISTS_ACTIVE_USER));
 702                 GEM_BUG_ON(!port_count(&port[0]));
 703
 704                 /*
 705                  * If we write to ELSP a second time before the HW has had
 706                  * a chance to respond to the previous write, we can confuse
 707                  * the HW and hit "undefined behaviour". After writing to ELSP,
 708                  * we must then wait until we see a context-switch event from
 709                  * the HW to indicate that it has had a chance to respond.
 710                  */
 711                 if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 712                         return;
 713
 714                 if (need_preempt(engine, last)) {
 715                         inject_preempt_context(engine);
 716                         return;
 717                 }
 718
 719                 /*
 720                  * In theory, we could coalesce more requests onto
 721                  * the second port (the first port is active, with
 722                  * no preemptions pending). However, that means we
 723                  * then have to deal with the possible lite-restore
 724                  * of the second port (as we submit the ELSP, there
 725                  * may be a context-switch) but also we may complete
 726                  * the resubmission before the context-switch. Ergo,
 727                  * coalescing onto the second port will cause a
 728                  * preemption event, but we cannot predict whether
 729                  * that will affect port[0] or port[1].
 730                  *
 731                  * If the second port is already active, we can wait
 732                  * until the next context-switch before contemplating
 733                  * new requests. The GPU will be busy and we should be
 734                  * able to resubmit the new ELSP before it idles,
 735                  * avoiding pipeline bubbles (momentary pauses where
 736                  * the driver is unable to keep up the supply of new
 737                  * work). However, we have to double check that the
 738                  * priorities of the ports haven't been switch.
 739                  */
 740                 if (port_count(&port[1]))
 741                         return;
 742
 743                 /*
 744                  * WaIdleLiteRestore:bdw,skl
 745                  * Apply the wa NOOPs to prevent
 746                  * ring:HEAD == rq:TAIL as we resubmit the
 747                  * request. See gen8_emit_fini_breadcrumb() for
 748                  * where we prepare the padding after the
 749                  * end of the request.
 750                  */
 751                 last->tail = last->wa_tail;
 752         }
 753
 754         while ((rb = rb_first_cached(&execlists->queue))) {
 755                 struct i915_priolist *p = to_priolist(rb);
 756                 struct i915_request *rq, *rn;
 757                 int i;
 758
 759                 priolist_for_each_request_consume(rq, rn, p, i) {
 760                         /*
 761                          * Can we combine this request with the current port?
 762                          * It has to be the same context/ringbuffer and not
 763                          * have any exceptions (e.g. GVT saying never to
 764                          * combine contexts).
 765                          *
 766                          * If we can combine the requests, we can execute both
 767                          * by updating the RING_TAIL to point to the end of the
 768                          * second request, and so we never need to tell the
 769                          * hardware about the first.
 770                          */
 771                         if (last && !can_merge_rq(last, rq)) {
 772                                 /*
 773                                  * If we are on the second port and cannot
 774                                  * combine this request with the last, then we
 775                                  * are done.
 776                                  */
 777                                 if (port == last_port)
 778                                         goto done;
 779
 780                                 /*
 781                                  * We must not populate both ELSP[] with the
 782                                  * same LRCA, i.e. we must submit 2 different
 783                                  * contexts if we submit 2 ELSP.
 784                                  */
 785                                 if (last->hw_context == rq->hw_context)
 786                                         goto done;
 787
 788                                 /*
 789                                  * If GVT overrides us we only ever submit
 790                                  * port[0], leaving port[1] empty. Note that we
 791                                  * also have to be careful that we don't queue
 792                                  * the same context (even though a different
 793                                  * request) to the second port.
 794                                  */
 795                                 if (ctx_single_port_submission(last->hw_context) ||
 796                                     ctx_single_port_submission(rq->hw_context))
 797                                         goto done;
 798
 799
 800                                 if (submit)
 801                                         port_assign(port, last);
 802                                 port++;
 803
 804                                 GEM_BUG_ON(port_isset(port));
 805                         }
 806
 807                         list_del_init(&rq->sched.link);
 808
 809                         __i915_request_submit(rq);
 810                         trace_i915_request_in(rq, port_index(port, execlists));
 811
 812                         last = rq;
 813                         submit = true;
 814                 }
 815
 816                 rb_erase_cached(&p->node, &execlists->queue);
 817                 i915_priolist_free(p);
 818         }
 819
 820 done:
 821         /*
 822          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
 823          *
 824          * We choose the priority hint such that if we add a request of greater
 825          * priority than this, we kick the submission tasklet to decide on
 826          * the right order of submitting the requests to hardware. We must
 827          * also be prepared to reorder requests as they are in-flight on the
 828          * HW. We derive the priority hint then as the first "hole" in
 829          * the HW submission ports and if there are no available slots,
 830          * the priority of the lowest executing request, i.e. last.
 831          *
 832          * When we do receive a higher priority request ready to run from the
 833          * user, see queue_request(), the priority hint is bumped to that
 834          * request triggering preemption on the next dequeue (or subsequent
 835          * interrupt for secondary ports).
 836          */
 837         execlists->queue_priority_hint = queue_prio(execlists);
 838
 839         if (submit) {
 840                 port_assign(port, last);
 841                 execlists_submit_ports(engine);
 842         }
 843
 844         /* We must always keep the beast fed if we have work piled up */
 845         GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
 846                    !port_isset(execlists->port));
 847
 848         /* Re-evaluate the executing context setup after each preemptive kick */
 849         if (last)
 850                 execlists_user_begin(execlists, execlists->port);
 851
 852         /* If the engine is now idle, so should be the flag; and vice versa. */
 853         GEM_BUG_ON(execlists_is_active(&engine->execlists,
 854                                        EXECLISTS_ACTIVE_USER) ==
 855                    !port_isset(engine->execlists.port));
 856 }
 857
 858 void
 859 execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
 860 {
 861         struct execlist_port *port = execlists->port;
 862         unsigned int num_ports = execlists_num_ports(execlists);
 863
 864         while (num_ports-- && port_isset(port)) {
 865                 struct i915_request *rq = port_request(port);
 866
 867                 GEM_TRACE("%s:port%u fence %llx:%lld, (current %d)\n",
 868                           rq->engine->name,
 869                           (unsigned int)(port - execlists->port),
 870                           rq->fence.context, rq->fence.seqno,
 871                           hwsp_seqno(rq));
 872
 873                 GEM_BUG_ON(!execlists->active);
 874                 execlists_context_schedule_out(rq,
 875                                                i915_request_completed(rq) ?
 876                                                INTEL_CONTEXT_SCHEDULE_OUT :
 877                                                INTEL_CONTEXT_SCHEDULE_PREEMPTED);
 878
 879                 i915_request_put(rq);
 880
 881                 memset(port, 0, sizeof(*port));
 882                 port++;
 883         }
 884
 885         execlists_clear_all_active(execlists);
 886 }
 887
 888 static inline void
 889 invalidate_csb_entries(const u32 *first, const u32 *last)
 890 {
 891         clflush((void *)first);
 892         clflush((void *)last);
 893 }
 894
 895 static inline bool
 896 reset_in_progress(const struct intel_engine_execlists *execlists)
 897 {
 898         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
 899 }
 900
 901 static void process_csb(struct intel_engine_cs *engine)
 902 {
 903         struct intel_engine_execlists * const execlists = &engine->execlists;
 904         struct execlist_port *port = execlists->port;
 905         const u32 * const buf = execlists->csb_status;
 906         const u8 num_entries = execlists->csb_size;
 907         u8 head, tail;
 908
 909         lockdep_assert_held(&engine->timeline.lock);
 910
 911         /*
 912          * Note that csb_write, csb_status may be either in HWSP or mmio.
 913          * When reading from the csb_write mmio register, we have to be
 914          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
 915          * the low 4bits. As it happens we know the next 4bits are always
 916          * zero and so we can simply masked off the low u8 of the register
 917          * and treat it identically to reading from the HWSP (without having
 918          * to use explicit shifting and masking, and probably bifurcating
 919          * the code to handle the legacy mmio read).
 920          */
 921         head = execlists->csb_head;
 922         tail = READ_ONCE(*execlists->csb_write);
 923         GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
 924         if (unlikely(head == tail))
 925                 return;
 926
 927         /*
 928          * Hopefully paired with a wmb() in HW!
 929          *
 930          * We must complete the read of the write pointer before any reads
 931          * from the CSB, so that we do not see stale values. Without an rmb
 932          * (lfence) the HW may speculatively perform the CSB[] reads *before*
 933          * we perform the READ_ONCE(*csb_write).
 934          */
 935         rmb();
 936
 937         do {
 938                 struct i915_request *rq;
 939                 unsigned int status;
 940                 unsigned int count;
 941
 942                 if (++head == num_entries)
 943                         head = 0;
 944
 945                 /*
 946                  * We are flying near dragons again.
 947                  *
 948                  * We hold a reference to the request in execlist_port[]
 949                  * but no more than that. We are operating in softirq
 950                  * context and so cannot hold any mutex or sleep. That
 951                  * prevents us stopping the requests we are processing
 952                  * in port[] from being retired simultaneously (the
 953                  * breadcrumb will be complete before we see the
 954                  * context-switch). As we only hold the reference to the
 955                  * request, any pointer chasing underneath the request
 956                  * is subject to a potential use-after-free. Thus we
 957                  * store all of the bookkeeping within port[] as
 958                  * required, and avoid using unguarded pointers beneath
 959                  * request itself. The same applies to the atomic
 960                  * status notifier.
 961                  */
 962
 963                 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
 964                           engine->name, head,
 965                           buf[2 * head + 0], buf[2 * head + 1],
 966                           execlists->active);
 967
 968                 status = buf[2 * head];
 969                 if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
 970                               GEN8_CTX_STATUS_PREEMPTED))
 971                         execlists_set_active(execlists,
 972                                              EXECLISTS_ACTIVE_HWACK);
 973                 if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
 974                         execlists_clear_active(execlists,
 975                                                EXECLISTS_ACTIVE_HWACK);
 976
 977                 if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
 978                         continue;
 979
 980                 /* We should never get a COMPLETED | IDLE_ACTIVE! */
 981                 GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
 982
 983                 if (status & GEN8_CTX_STATUS_COMPLETE &&
 984                     buf[2*head + 1] == execlists->preempt_complete_status) {
 985                         GEM_TRACE("%s preempt-idle\n", engine->name);
 986                         complete_preempt_context(execlists);
 987                         continue;
 988                 }
 989
 990                 if (status & GEN8_CTX_STATUS_PREEMPTED &&
 991                     execlists_is_active(execlists,
 992                                         EXECLISTS_ACTIVE_PREEMPT))
 993                         continue;
 994
 995                 GEM_BUG_ON(!execlists_is_active(execlists,
 996                                                 EXECLISTS_ACTIVE_USER));
 997
 998                 rq = port_unpack(port, &count);
 999                 GEM_TRACE("%s out[0]: ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n",
1000                           engine->name,
1001                           port->context_id, count,
1002                           rq ? rq->fence.context : 0,
1003                           rq ? rq->fence.seqno : 0,
1004                           rq ? hwsp_seqno(rq) : 0,
1005                           rq ? rq_prio(rq) : 0);
1006
1007                 /* Check the context/desc id for this event matches */
1008                 GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
1009
1010                 GEM_BUG_ON(count == 0);
1011                 if (--count == 0) {
1012                         /*
1013                          * On the final event corresponding to the
1014                          * submission of this context, we expect either
1015                          * an element-switch event or a completion
1016                          * event (and on completion, the active-idle
1017                          * marker). No more preemptions, lite-restore
1018                          * or otherwise.
1019                          */
1020                         GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
1021                         GEM_BUG_ON(port_isset(&port[1]) &&
1022                                    !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
1023                         GEM_BUG_ON(!port_isset(&port[1]) &&
1024                                    !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
1025
1026                         /*
1027                          * We rely on the hardware being strongly
1028                          * ordered, that the breadcrumb write is
1029                          * coherent (visible from the CPU) before the
1030                          * user interrupt and CSB is processed.
1031                          */
1032                         GEM_BUG_ON(!i915_request_completed(rq));
1033
1034                         execlists_context_schedule_out(rq,
1035                                                        INTEL_CONTEXT_SCHEDULE_OUT);
1036                         i915_request_put(rq);
1037
1038                         GEM_TRACE("%s completed ctx=%d\n",
1039                                   engine->name, port->context_id);
1040
1041                         port = execlists_port_complete(execlists, port);
1042                         if (port_isset(port))
1043                                 execlists_user_begin(execlists, port);
1044                         else
1045                                 execlists_user_end(execlists);
1046                 } else {
1047                         port_set(port, port_pack(rq, count));
1048                 }
1049         } while (head != tail);
1050
1051         execlists->csb_head = head;
1052
1053         /*
1054          * Gen11 has proven to fail wrt global observation point between
1055          * entry and tail update, failing on the ordering and thus
1056          * we see an old entry in the context status buffer.
1057          *
1058          * Forcibly evict out entries for the next gpu csb update,
1059          * to increase the odds that we get a fresh entries with non
1060          * working hardware. The cost for doing so comes out mostly with
1061          * the wash as hardware, working or not, will need to do the
1062          * invalidation before.
1063          */
1064         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
1065 }
1066
1067 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1068 {
1069         lockdep_assert_held(&engine->timeline.lock);
1070
1071         process_csb(engine);
1072         if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
1073                 execlists_dequeue(engine);
1074 }
1075
1076 /*
1077  * Check the unread Context Status Buffers and manage the submission of new
1078  * contexts to the ELSP accordingly.
1079  */
1080 static void execlists_submission_tasklet(unsigned long data)
1081 {
1082         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1083         unsigned long flags;
1084
1085         GEM_TRACE("%s awake?=%d, active=%x\n",
1086                   engine->name,
1087                   !!engine->i915->gt.awake,
1088                   engine->execlists.active);
1089
1090         spin_lock_irqsave(&engine->timeline.lock, flags);
1091         __execlists_submission_tasklet(engine);
1092         spin_unlock_irqrestore(&engine->timeline.lock, flags);
1093 }
1094
1095 static void queue_request(struct intel_engine_cs *engine,
1096                           struct i915_sched_node *node,
1097                           int prio)
1098 {
1099         list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
1100 }
1101
1102 static void __submit_queue_imm(struct intel_engine_cs *engine)
1103 {
1104         struct intel_engine_execlists * const execlists = &engine->execlists;
1105
1106         if (reset_in_progress(execlists))
1107                 return; /* defer until we restart the engine following reset */
1108
1109         if (execlists->tasklet.func == execlists_submission_tasklet)
1110                 __execlists_submission_tasklet(engine);
1111         else
1112                 tasklet_hi_schedule(&execlists->tasklet);
1113 }
1114
1115 static void submit_queue(struct intel_engine_cs *engine, int prio)
1116 {
1117         if (prio > engine->execlists.queue_priority_hint) {
1118                 engine->execlists.queue_priority_hint = prio;
1119                 __submit_queue_imm(engine);
1120         }
1121 }
1122
1123 static void execlists_submit_request(struct i915_request *request)
1124 {
1125         struct intel_engine_cs *engine = request->engine;
1126         unsigned long flags;
1127
1128         /* Will be called from irq-context when using foreign fences. */
1129         spin_lock_irqsave(&engine->timeline.lock, flags);
1130
1131         queue_request(engine, &request->sched, rq_prio(request));
1132
1133         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1134         GEM_BUG_ON(list_empty(&request->sched.link));
1135
1136         submit_queue(engine, rq_prio(request));
1137
1138         spin_unlock_irqrestore(&engine->timeline.lock, flags);
1139 }
1140
1141 static void __execlists_context_fini(struct intel_context *ce)
1142 {
1143         intel_ring_put(ce->ring);
1144
1145         GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
1146         i915_gem_object_put(ce->state->obj);
1147 }
1148
1149 static void execlists_context_destroy(struct kref *kref)
1150 {
1151         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1152
1153         GEM_BUG_ON(intel_context_is_pinned(ce));
1154
1155         if (ce->state)
1156                 __execlists_context_fini(ce);
1157
1158         intel_context_free(ce);
1159 }
1160
1161 static int __context_pin(struct i915_vma *vma)
1162 {
1163         unsigned int flags;
1164         int err;
1165
1166         flags = PIN_GLOBAL | PIN_HIGH;
1167         flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
1168
1169         err = i915_vma_pin(vma, 0, 0, flags);
1170         if (err)
1171                 return err;
1172
1173         vma->obj->pin_global++;
1174         vma->obj->mm.dirty = true;
1175
1176         return 0;
1177 }
1178
1179 static void __context_unpin(struct i915_vma *vma)
1180 {
1181         vma->obj->pin_global--;
1182         __i915_vma_unpin(vma);
1183 }
1184
1185 static void execlists_context_unpin(struct intel_context *ce)
1186 {
1187         struct intel_engine_cs *engine;
1188
1189         /*
1190          * The tasklet may still be using a pointer to our state, via an
1191          * old request. However, since we know we only unpin the context
1192          * on retirement of the following request, we know that the last
1193          * request referencing us will have had a completion CS interrupt.
1194          * If we see that it is still active, it means that the tasklet hasn't
1195          * had the chance to run yet; let it run before we teardown the
1196          * reference it may use.
1197          */
1198         engine = READ_ONCE(ce->active);
1199         if (unlikely(engine)) {
1200                 unsigned long flags;
1201
1202                 spin_lock_irqsave(&engine->timeline.lock, flags);
1203                 process_csb(engine);
1204                 spin_unlock_irqrestore(&engine->timeline.lock, flags);
1205
1206                 GEM_BUG_ON(READ_ONCE(ce->active));
1207         }
1208
1209         i915_gem_context_unpin_hw_id(ce->gem_context);
1210
1211         intel_ring_unpin(ce->ring);
1212
1213         i915_gem_object_unpin_map(ce->state->obj);
1214         __context_unpin(ce->state);
1215 }
1216
1217 static void
1218 __execlists_update_reg_state(struct intel_context *ce,
1219                              struct intel_engine_cs *engine)
1220 {
1221         struct intel_ring *ring = ce->ring;
1222         u32 *regs = ce->lrc_reg_state;
1223
1224         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
1225         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1226
1227         regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma);
1228         regs[CTX_RING_HEAD + 1] = ring->head;
1229         regs[CTX_RING_TAIL + 1] = ring->tail;
1230
1231         /* RPCS */
1232         if (engine->class == RENDER_CLASS)
1233                 regs[CTX_R_PWR_CLK_STATE + 1] =
1234                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
1235 }
1236
1237 static int
1238 __execlists_context_pin(struct intel_context *ce,
1239                         struct intel_engine_cs *engine)
1240 {
1241         void *vaddr;
1242         int ret;
1243
1244         GEM_BUG_ON(!ce->gem_context->ppgtt);
1245
1246         ret = execlists_context_deferred_alloc(ce, engine);
1247         if (ret)
1248                 goto err;
1249         GEM_BUG_ON(!ce->state);
1250
1251         ret = __context_pin(ce->state);
1252         if (ret)
1253                 goto err;
1254
1255         vaddr = i915_gem_object_pin_map(ce->state->obj,
1256                                         i915_coherent_map_type(engine->i915) |
1257                                         I915_MAP_OVERRIDE);
1258         if (IS_ERR(vaddr)) {
1259                 ret = PTR_ERR(vaddr);
1260                 goto unpin_vma;
1261         }
1262
1263         ret = intel_ring_pin(ce->ring);
1264         if (ret)
1265                 goto unpin_map;
1266
1267         ret = i915_gem_context_pin_hw_id(ce->gem_context);
1268         if (ret)
1269                 goto unpin_ring;
1270
1271         ce->lrc_desc = lrc_descriptor(ce, engine);
1272         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
1273         __execlists_update_reg_state(ce, engine);
1274
1275         return 0;
1276
1277 unpin_ring:
1278         intel_ring_unpin(ce->ring);
1279 unpin_map:
1280         i915_gem_object_unpin_map(ce->state->obj);
1281 unpin_vma:
1282         __context_unpin(ce->state);
1283 err:
1284         return ret;
1285 }
1286
1287 static int execlists_context_pin(struct intel_context *ce)
1288 {
1289         return __execlists_context_pin(ce, ce->engine);
1290 }
1291
1292 static void execlists_context_reset(struct intel_context *ce)
1293 {
1294         /*
1295          * Because we emit WA_TAIL_DWORDS there may be a disparity
1296          * between our bookkeeping in ce->ring->head and ce->ring->tail and
1297          * that stored in context. As we only write new commands from
1298          * ce->ring->tail onwards, everything before that is junk. If the GPU
1299          * starts reading from its RING_HEAD from the context, it may try to
1300          * execute that junk and die.
1301          *
1302          * The contexts that are stilled pinned on resume belong to the
1303          * kernel, and are local to each engine. All other contexts will
1304          * have their head/tail sanitized upon pinning before use, so they
1305          * will never see garbage,
1306          *
1307          * So to avoid that we reset the context images upon resume. For
1308          * simplicity, we just zero everything out.
1309          */
1310         intel_ring_reset(ce->ring, 0);
1311         __execlists_update_reg_state(ce, ce->engine);
1312 }
1313
1314 static const struct intel_context_ops execlists_context_ops = {
1315         .pin = execlists_context_pin,
1316         .unpin = execlists_context_unpin,
1317
1318         .enter = intel_context_enter_engine,
1319         .exit = intel_context_exit_engine,
1320
1321         .reset = execlists_context_reset,
1322         .destroy = execlists_context_destroy,
1323 };
1324
1325 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
1326 {
1327         u32 *cs;
1328
1329         GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
1330
1331         cs = intel_ring_begin(rq, 6);
1332         if (IS_ERR(cs))
1333                 return PTR_ERR(cs);
1334
1335         /*
1336          * Check if we have been preempted before we even get started.
1337          *
1338          * After this point i915_request_started() reports true, even if
1339          * we get preempted and so are no longer running.
1340          */
1341         *cs++ = MI_ARB_CHECK;
1342         *cs++ = MI_NOOP;
1343
1344         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1345         *cs++ = rq->timeline->hwsp_offset;
1346         *cs++ = 0;
1347         *cs++ = rq->fence.seqno - 1;
1348
1349         intel_ring_advance(rq, cs);
1350
1351         /* Record the updated position of the request's payload */
1352         rq->infix = intel_ring_offset(rq, cs);
1353
1354         return 0;
1355 }
1356
1357 static int emit_pdps(struct i915_request *rq)
1358 {
1359         const struct intel_engine_cs * const engine = rq->engine;
1360         struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
1361         int err, i;
1362         u32 *cs;
1363
1364         GEM_BUG_ON(intel_vgpu_active(rq->i915));
1365
1366         /*
1367          * Beware ye of the dragons, this sequence is magic!
1368          *
1369          * Small changes to this sequence can cause anything from
1370          * GPU hangs to forcewake errors and machine lockups!
1371          */
1372
1373         /* Flush any residual operations from the context load */
1374         err = engine->emit_flush(rq, EMIT_FLUSH);
1375         if (err)
1376                 return err;
1377
1378         /* Magic required to prevent forcewake errors! */
1379         err = engine->emit_flush(rq, EMIT_INVALIDATE);
1380         if (err)
1381                 return err;
1382
1383         cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
1384         if (IS_ERR(cs))
1385                 return PTR_ERR(cs);
1386
1387         /* Ensure the LRI have landed before we invalidate & continue */
1388         *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
1389         for (i = GEN8_3LVL_PDPES; i--; ) {
1390                 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
1391                 u32 base = engine->mmio_base;
1392
1393                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
1394                 *cs++ = upper_32_bits(pd_daddr);
1395                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
1396                 *cs++ = lower_32_bits(pd_daddr);
1397         }
1398         *cs++ = MI_NOOP;
1399
1400         intel_ring_advance(rq, cs);
1401
1402         /* Be doubly sure the LRI have landed before proceeding */
1403         err = engine->emit_flush(rq, EMIT_FLUSH);
1404         if (err)
1405                 return err;
1406
1407         /* Re-invalidate the TLB for luck */
1408         return engine->emit_flush(rq, EMIT_INVALIDATE);
1409 }
1410
1411 static int execlists_request_alloc(struct i915_request *request)
1412 {
1413         int ret;
1414
1415         GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1416
1417         /*
1418          * Flush enough space to reduce the likelihood of waiting after
1419          * we start building the request - in which case we will just
1420          * have to repeat work.
1421          */
1422         request->reserved_space += EXECLISTS_REQUEST_SIZE;
1423
1424         /*
1425          * Note that after this point, we have committed to using
1426          * this request as it is being used to both track the
1427          * state of engine initialisation and liveness of the
1428          * golden renderstate above. Think twice before you try
1429          * to cancel/unwind this request now.
1430          */
1431
1432         /* Unconditionally invalidate GPU caches and TLBs. */
1433         if (i915_vm_is_4lvl(&request->gem_context->ppgtt->vm))
1434                 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1435         else
1436                 ret = emit_pdps(request);
1437         if (ret)
1438                 return ret;
1439
1440         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
1441         return 0;
1442 }
1443
1444 /*
1445  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1446  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1447  * but there is a slight complication as this is applied in WA batch where the
1448  * values are only initialized once so we cannot take register value at the
1449  * beginning and reuse it further; hence we save its value to memory, upload a
1450  * constant value with bit21 set and then we restore it back with the saved value.
1451  * To simplify the WA, a constant value is formed by using the default value
1452  * of this register. This shouldn't be a problem because we are only modifying
1453  * it for a short period and this batch in non-premptible. We can ofcourse
1454  * use additional instructions that read the actual value of the register
1455  * at that time and set our bit of interest but it makes the WA complicated.
1456  *
1457  * This WA is also required for Gen9 so extracting as a function avoids
1458  * code duplication.
1459  */
1460 static u32 *
1461 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1462 {
1463         /* NB no one else is allowed to scribble over scratch + 256! */
1464         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1465         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1466         *batch++ = i915_scratch_offset(engine->i915) + 256;
1467         *batch++ = 0;
1468
1469         *batch++ = MI_LOAD_REGISTER_IMM(1);
1470         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1471         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1472
1473         batch = gen8_emit_pipe_control(batch,
1474                                        PIPE_CONTROL_CS_STALL |
1475                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1476                                        0);
1477
1478         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1479         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1480         *batch++ = i915_scratch_offset(engine->i915) + 256;
1481         *batch++ = 0;
1482
1483         return batch;
1484 }
1485
1486 /*
1487  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1488  * initialized at the beginning and shared across all contexts but this field
1489  * helps us to have multiple batches at different offsets and select them based
1490  * on a criteria. At the moment this batch always start at the beginning of the page
1491  * and at this point we don't have multiple wa_ctx batch buffers.
1492  *
1493  * The number of WA applied are not known at the beginning; we use this field
1494  * to return the no of DWORDS written.
1495  *
1496  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1497  * so it adds NOOPs as padding to make it cacheline aligned.
1498  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1499  * makes a complete batch buffer.
1500  */
1501 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1502 {
1503         /* WaDisableCtxRestoreArbitration:bdw,chv */
1504         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1505
1506         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1507         if (IS_BROADWELL(engine->i915))
1508                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1509
1510         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1511         /* Actual scratch location is at 128 bytes offset */
1512         batch = gen8_emit_pipe_control(batch,
1513                                        PIPE_CONTROL_FLUSH_L3 |
1514                                        PIPE_CONTROL_GLOBAL_GTT_IVB |
1515                                        PIPE_CONTROL_CS_STALL |
1516                                        PIPE_CONTROL_QW_WRITE,
1517                                        i915_scratch_offset(engine->i915) +
1518                                        2 * CACHELINE_BYTES);
1519
1520         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1521
1522         /* Pad to end of cacheline */
1523         while ((unsigned long)batch % CACHELINE_BYTES)
1524                 *batch++ = MI_NOOP;
1525
1526         /*
1527          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1528          * execution depends on the length specified in terms of cache lines
1529          * in the register CTX_RCS_INDIRECT_CTX
1530          */
1531
1532         return batch;
1533 }
1534
1535 struct lri {
1536         i915_reg_t reg;
1537         u32 value;
1538 };
1539
1540 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1541 {
1542         GEM_BUG_ON(!count || count > 63);
1543
1544         *batch++ = MI_LOAD_REGISTER_IMM(count);
1545         do {
1546                 *batch++ = i915_mmio_reg_offset(lri->reg);
1547                 *batch++ = lri->value;
1548         } while (lri++, --count);
1549         *batch++ = MI_NOOP;
1550
1551         return batch;
1552 }
1553
1554 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1555 {
1556         static const struct lri lri[] = {
1557                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1558                 {
1559                         COMMON_SLICE_CHICKEN2,
1560                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1561                                        0),
1562                 },
1563
1564                 /* BSpec: 11391 */
1565                 {
1566                         FF_SLICE_CHICKEN,
1567                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1568                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1569                 },
1570
1571                 /* BSpec: 11299 */
1572                 {
1573                         _3D_CHICKEN3,
1574                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1575                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1576                 }
1577         };
1578
1579         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1580
1581         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1582         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1583
1584         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1585
1586         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1587         if (HAS_POOLED_EU(engine->i915)) {
1588                 /*
1589                  * EU pool configuration is setup along with golden context
1590                  * during context initialization. This value depends on
1591                  * device type (2x6 or 3x6) and needs to be updated based
1592                  * on which subslice is disabled especially for 2x6
1593                  * devices, however it is safe to load default
1594                  * configuration of 3x6 device instead of masking off
1595                  * corresponding bits because HW ignores bits of a disabled
1596                  * subslice and drops down to appropriate config. Please
1597                  * see render_state_setup() in i915_gem_render_state.c for
1598                  * possible configurations, to avoid duplication they are
1599                  * not shown here again.
1600                  */
1601                 *batch++ = GEN9_MEDIA_POOL_STATE;
1602                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1603                 *batch++ = 0x00777000;
1604                 *batch++ = 0;
1605                 *batch++ = 0;
1606                 *batch++ = 0;
1607         }
1608
1609         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1610
1611         /* Pad to end of cacheline */
1612         while ((unsigned long)batch % CACHELINE_BYTES)
1613                 *batch++ = MI_NOOP;
1614
1615         return batch;
1616 }
1617
1618 static u32 *
1619 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1620 {
1621         int i;
1622
1623         /*
1624          * WaPipeControlBefore3DStateSamplePattern: cnl
1625          *
1626          * Ensure the engine is idle prior to programming a
1627          * 3DSTATE_SAMPLE_PATTERN during a context restore.
1628          */
1629         batch = gen8_emit_pipe_control(batch,
1630                                        PIPE_CONTROL_CS_STALL,
1631                                        0);
1632         /*
1633          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1634          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1635          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1636          * confusing. Since gen8_emit_pipe_control() already advances the
1637          * batch by 6 dwords, we advance the other 10 here, completing a
1638          * cacheline. It's not clear if the workaround requires this padding
1639          * before other commands, or if it's just the regular padding we would
1640          * already have for the workaround bb, so leave it here for now.
1641          */
1642         for (i = 0; i < 10; i++)
1643                 *batch++ = MI_NOOP;
1644
1645         /* Pad to end of cacheline */
1646         while ((unsigned long)batch % CACHELINE_BYTES)
1647                 *batch++ = MI_NOOP;
1648
1649         return batch;
1650 }
1651
1652 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
1653
1654 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
1655 {
1656         struct drm_i915_gem_object *obj;
1657         struct i915_vma *vma;
1658         int err;
1659
1660         obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
1661         if (IS_ERR(obj))
1662                 return PTR_ERR(obj);
1663
1664         vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL);
1665         if (IS_ERR(vma)) {
1666                 err = PTR_ERR(vma);
1667                 goto err;
1668         }
1669
1670         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
1671         if (err)
1672                 goto err;
1673
1674         engine->wa_ctx.vma = vma;
1675         return 0;
1676
1677 err:
1678         i915_gem_object_put(obj);
1679         return err;
1680 }
1681
1682 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
1683 {
1684         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1685 }
1686
1687 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1688
1689 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
1690 {
1691         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1692         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
1693                                             &wa_ctx->per_ctx };
1694         wa_bb_func_t wa_bb_fn[2];
1695         struct page *page;
1696         void *batch, *batch_ptr;
1697         unsigned int i;
1698         int ret;
1699
1700         if (GEM_DEBUG_WARN_ON(engine->id != RCS0))
1701                 return -EINVAL;
1702
1703         switch (INTEL_GEN(engine->i915)) {
1704         case 11:
1705                 return 0;
1706         case 10:
1707                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
1708                 wa_bb_fn[1] = NULL;
1709                 break;
1710         case 9:
1711                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1712                 wa_bb_fn[1] = NULL;
1713                 break;
1714         case 8:
1715                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1716                 wa_bb_fn[1] = NULL;
1717                 break;
1718         default:
1719                 MISSING_CASE(INTEL_GEN(engine->i915));
1720                 return 0;
1721         }
1722
1723         ret = lrc_setup_wa_ctx(engine);
1724         if (ret) {
1725                 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
1726                 return ret;
1727         }
1728
1729         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
1730         batch = batch_ptr = kmap_atomic(page);
1731
1732         /*
1733          * Emit the two workaround batch buffers, recording the offset from the
1734          * start of the workaround batch buffer object for each and their
1735          * respective sizes.
1736          */
1737         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1738                 wa_bb[i]->offset = batch_ptr - batch;
1739                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1740                                                   CACHELINE_BYTES))) {
1741                         ret = -EINVAL;
1742                         break;
1743                 }
1744                 if (wa_bb_fn[i])
1745                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1746                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1747         }
1748
1749         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
1750
1751         kunmap_atomic(batch);
1752         if (ret)
1753                 lrc_destroy_wa_ctx(engine);
1754
1755         return ret;
1756 }
1757
1758 static void enable_execlists(struct intel_engine_cs *engine)
1759 {
1760         struct drm_i915_private *dev_priv = engine->i915;
1761
1762         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
1763
1764         if (INTEL_GEN(dev_priv) >= 11)
1765                 I915_WRITE(RING_MODE_GEN7(engine),
1766                            _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
1767         else
1768                 I915_WRITE(RING_MODE_GEN7(engine),
1769                            _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
1770
1771         I915_WRITE(RING_MI_MODE(engine->mmio_base),
1772                    _MASKED_BIT_DISABLE(STOP_RING));
1773
1774         I915_WRITE(RING_HWS_PGA(engine->mmio_base),
1775                    i915_ggtt_offset(engine->status_page.vma));
1776         POSTING_READ(RING_HWS_PGA(engine->mmio_base));
1777 }
1778
1779 static bool unexpected_starting_state(struct intel_engine_cs *engine)
1780 {
1781         struct drm_i915_private *dev_priv = engine->i915;
1782         bool unexpected = false;
1783
1784         if (I915_READ(RING_MI_MODE(engine->mmio_base)) & STOP_RING) {
1785                 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
1786                 unexpected = true;
1787         }
1788
1789         return unexpected;
1790 }
1791
1792 static int execlists_resume(struct intel_engine_cs *engine)
1793 {
1794         intel_engine_apply_workarounds(engine);
1795         intel_engine_apply_whitelist(engine);
1796
1797         intel_mocs_init_engine(engine);
1798
1799         intel_engine_reset_breadcrumbs(engine);
1800
1801         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
1802                 struct drm_printer p = drm_debug_printer(__func__);
1803
1804                 intel_engine_dump(engine, &p, NULL);
1805         }
1806
1807         enable_execlists(engine);
1808
1809         return 0;
1810 }
1811
1812 static void execlists_reset_prepare(struct intel_engine_cs *engine)
1813 {
1814         struct intel_engine_execlists * const execlists = &engine->execlists;
1815         unsigned long flags;
1816
1817         GEM_TRACE("%s: depth<-%d\n", engine->name,
1818                   atomic_read(&execlists->tasklet.count));
1819
1820         /*
1821          * Prevent request submission to the hardware until we have
1822          * completed the reset in i915_gem_reset_finish(). If a request
1823          * is completed by one engine, it may then queue a request
1824          * to a second via its execlists->tasklet *just* as we are
1825          * calling engine->resume() and also writing the ELSP.
1826          * Turning off the execlists->tasklet until the reset is over
1827          * prevents the race.
1828          */
1829         __tasklet_disable_sync_once(&execlists->tasklet);
1830         GEM_BUG_ON(!reset_in_progress(execlists));
1831
1832         intel_engine_stop_cs(engine);
1833
1834         /* And flush any current direct submission. */
1835         spin_lock_irqsave(&engine->timeline.lock, flags);
1836         spin_unlock_irqrestore(&engine->timeline.lock, flags);
1837 }
1838
1839 static bool lrc_regs_ok(const struct i915_request *rq)
1840 {
1841         const struct intel_ring *ring = rq->ring;
1842         const u32 *regs = rq->hw_context->lrc_reg_state;
1843
1844         /* Quick spot check for the common signs of context corruption */
1845
1846         if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
1847             (RING_CTL_SIZE(ring->size) | RING_VALID))
1848                 return false;
1849
1850         if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
1851                 return false;
1852
1853         return true;
1854 }
1855
1856 static void reset_csb_pointers(struct intel_engine_execlists *execlists)
1857 {
1858         const unsigned int reset_value = execlists->csb_size - 1;
1859
1860         /*
1861          * After a reset, the HW starts writing into CSB entry [0]. We
1862          * therefore have to set our HEAD pointer back one entry so that
1863          * the *first* entry we check is entry 0. To complicate this further,
1864          * as we don't wait for the first interrupt after reset, we have to
1865          * fake the HW write to point back to the last entry so that our
1866          * inline comparison of our cached head position against the last HW
1867          * write works even before the first interrupt.
1868          */
1869         execlists->csb_head = reset_value;
1870         WRITE_ONCE(*execlists->csb_write, reset_value);
1871         wmb(); /* Make sure this is visible to HW (paranoia?) */
1872
1873         invalidate_csb_entries(&execlists->csb_status[0],
1874                                &execlists->csb_status[reset_value]);
1875 }
1876
1877 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
1878 {
1879         struct intel_engine_execlists * const execlists = &engine->execlists;
1880         struct intel_context *ce;
1881         struct i915_request *rq;
1882         u32 *regs;
1883
1884         process_csb(engine); /* drain preemption events */
1885
1886         /* Following the reset, we need to reload the CSB read/write pointers */
1887         reset_csb_pointers(&engine->execlists);
1888
1889         /*
1890          * Save the currently executing context, even if we completed
1891          * its request, it was still running at the time of the
1892          * reset and will have been clobbered.
1893          */
1894         if (!port_isset(execlists->port))
1895                 goto out_clear;
1896
1897         ce = port_request(execlists->port)->hw_context;
1898
1899         /*
1900          * Catch up with any missed context-switch interrupts.
1901          *
1902          * Ideally we would just read the remaining CSB entries now that we
1903          * know the gpu is idle. However, the CSB registers are sometimes^W
1904          * often trashed across a GPU reset! Instead we have to rely on
1905          * guessing the missed context-switch events by looking at what
1906          * requests were completed.
1907          */
1908         execlists_cancel_port_requests(execlists);
1909
1910         /* Push back any incomplete requests for replay after the reset. */
1911         rq = __unwind_incomplete_requests(engine);
1912         if (!rq)
1913                 goto out_replay;
1914
1915         if (rq->hw_context != ce) { /* caught just before a CS event */
1916                 rq = NULL;
1917                 goto out_replay;
1918         }
1919
1920         /*
1921          * If this request hasn't started yet, e.g. it is waiting on a
1922          * semaphore, we need to avoid skipping the request or else we
1923          * break the signaling chain. However, if the context is corrupt
1924          * the request will not restart and we will be stuck with a wedged
1925          * device. It is quite often the case that if we issue a reset
1926          * while the GPU is loading the context image, that the context
1927          * image becomes corrupt.
1928          *
1929          * Otherwise, if we have not started yet, the request should replay
1930          * perfectly and we do not need to flag the result as being erroneous.
1931          */
1932         if (!i915_request_started(rq) && lrc_regs_ok(rq))
1933                 goto out_replay;
1934
1935         /*
1936          * If the request was innocent, we leave the request in the ELSP
1937          * and will try to replay it on restarting. The context image may
1938          * have been corrupted by the reset, in which case we may have
1939          * to service a new GPU hang, but more likely we can continue on
1940          * without impact.
1941          *
1942          * If the request was guilty, we presume the context is corrupt
1943          * and have to at least restore the RING register in the context
1944          * image back to the expected values to skip over the guilty request.
1945          */
1946         i915_reset_request(rq, stalled);
1947         if (!stalled && lrc_regs_ok(rq))
1948                 goto out_replay;
1949
1950         /*
1951          * We want a simple context + ring to execute the breadcrumb update.
1952          * We cannot rely on the context being intact across the GPU hang,
1953          * so clear it and rebuild just what we need for the breadcrumb.
1954          * All pending requests for this context will be zapped, and any
1955          * future request will be after userspace has had the opportunity
1956          * to recreate its own state.
1957          */
1958         regs = ce->lrc_reg_state;
1959         if (engine->pinned_default_state) {
1960                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1961                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1962                        engine->context_size - PAGE_SIZE);
1963         }
1964         execlists_init_reg_state(regs, ce, engine, ce->ring);
1965
1966         /* Rerun the request; its payload has been neutered (if guilty). */
1967 out_replay:
1968         ce->ring->head =
1969                 rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail;
1970         intel_ring_update_space(ce->ring);
1971         __execlists_update_reg_state(ce, engine);
1972
1973 out_clear:
1974         execlists_clear_all_active(execlists);
1975 }
1976
1977 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
1978 {
1979         unsigned long flags;
1980
1981         GEM_TRACE("%s\n", engine->name);
1982
1983         spin_lock_irqsave(&engine->timeline.lock, flags);
1984
1985         __execlists_reset(engine, stalled);
1986
1987         spin_unlock_irqrestore(&engine->timeline.lock, flags);
1988 }
1989
1990 static void nop_submission_tasklet(unsigned long data)
1991 {
1992         /* The driver is wedged; don't process any more events. */
1993 }
1994
1995 static void execlists_cancel_requests(struct intel_engine_cs *engine)
1996 {
1997         struct intel_engine_execlists * const execlists = &engine->execlists;
1998         struct i915_request *rq, *rn;
1999         struct rb_node *rb;
2000         unsigned long flags;
2001
2002         GEM_TRACE("%s\n", engine->name);
2003
2004         /*
2005          * Before we call engine->cancel_requests(), we should have exclusive
2006          * access to the submission state. This is arranged for us by the
2007          * caller disabling the interrupt generation, the tasklet and other
2008          * threads that may then access the same state, giving us a free hand
2009          * to reset state. However, we still need to let lockdep be aware that
2010          * we know this state may be accessed in hardirq context, so we
2011          * disable the irq around this manipulation and we want to keep
2012          * the spinlock focused on its duties and not accidentally conflate
2013          * coverage to the submission's irq state. (Similarly, although we
2014          * shouldn't need to disable irq around the manipulation of the
2015          * submission's irq state, we also wish to remind ourselves that
2016          * it is irq state.)
2017          */
2018         spin_lock_irqsave(&engine->timeline.lock, flags);
2019
2020         __execlists_reset(engine, true);
2021
2022         /* Mark all executing requests as skipped. */
2023         list_for_each_entry(rq, &engine->timeline.requests, link) {
2024                 if (!i915_request_signaled(rq))
2025                         dma_fence_set_error(&rq->fence, -EIO);
2026
2027                 i915_request_mark_complete(rq);
2028         }
2029
2030         /* Flush the queued requests to the timeline list (for retiring). */
2031         while ((rb = rb_first_cached(&execlists->queue))) {
2032                 struct i915_priolist *p = to_priolist(rb);
2033                 int i;
2034
2035                 priolist_for_each_request_consume(rq, rn, p, i) {
2036                         list_del_init(&rq->sched.link);
2037                         __i915_request_submit(rq);
2038                         dma_fence_set_error(&rq->fence, -EIO);
2039                         i915_request_mark_complete(rq);
2040                 }
2041
2042                 rb_erase_cached(&p->node, &execlists->queue);
2043                 i915_priolist_free(p);
2044         }
2045
2046         /* Remaining _unready_ requests will be nop'ed when submitted */
2047
2048         execlists->queue_priority_hint = INT_MIN;
2049         execlists->queue = RB_ROOT_CACHED;
2050         GEM_BUG_ON(port_isset(execlists->port));
2051
2052         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
2053         execlists->tasklet.func = nop_submission_tasklet;
2054
2055         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2056 }
2057
2058 static void execlists_reset_finish(struct intel_engine_cs *engine)
2059 {
2060         struct intel_engine_execlists * const execlists = &engine->execlists;
2061
2062         /*
2063          * After a GPU reset, we may have requests to replay. Do so now while
2064          * we still have the forcewake to be sure that the GPU is not allowed
2065          * to sleep before we restart and reload a context.
2066          */
2067         GEM_BUG_ON(!reset_in_progress(execlists));
2068         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2069                 execlists->tasklet.func(execlists->tasklet.data);
2070
2071         if (__tasklet_enable(&execlists->tasklet))
2072                 /* And kick in case we missed a new request submission. */
2073                 tasklet_hi_schedule(&execlists->tasklet);
2074         GEM_TRACE("%s: depth->%d\n", engine->name,
2075                   atomic_read(&execlists->tasklet.count));
2076 }
2077
2078 static int gen8_emit_bb_start(struct i915_request *rq,
2079                               u64 offset, u32 len,
2080                               const unsigned int flags)
2081 {
2082         u32 *cs;
2083
2084         cs = intel_ring_begin(rq, 4);
2085         if (IS_ERR(cs))
2086                 return PTR_ERR(cs);
2087
2088         /*
2089          * WaDisableCtxRestoreArbitration:bdw,chv
2090          *
2091          * We don't need to perform MI_ARB_ENABLE as often as we do (in
2092          * particular all the gen that do not need the w/a at all!), if we
2093          * took care to make sure that on every switch into this context
2094          * (both ordinary and for preemption) that arbitrartion was enabled
2095          * we would be fine.  However, for gen8 there is another w/a that
2096          * requires us to not preempt inside GPGPU execution, so we keep
2097          * arbitration disabled for gen8 batches. Arbitration will be
2098          * re-enabled before we close the request
2099          * (engine->emit_fini_breadcrumb).
2100          */
2101         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2102
2103         /* FIXME(BDW+): Address space and security selectors. */
2104         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2105                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2106         *cs++ = lower_32_bits(offset);
2107         *cs++ = upper_32_bits(offset);
2108
2109         intel_ring_advance(rq, cs);
2110
2111         return 0;
2112 }
2113
2114 static int gen9_emit_bb_start(struct i915_request *rq,
2115                               u64 offset, u32 len,
2116                               const unsigned int flags)
2117 {
2118         u32 *cs;
2119
2120         cs = intel_ring_begin(rq, 6);
2121         if (IS_ERR(cs))
2122                 return PTR_ERR(cs);
2123
2124         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2125
2126         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2127                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2128         *cs++ = lower_32_bits(offset);
2129         *cs++ = upper_32_bits(offset);
2130
2131         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2132         *cs++ = MI_NOOP;
2133
2134         intel_ring_advance(rq, cs);
2135
2136         return 0;
2137 }
2138
2139 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
2140 {
2141         ENGINE_WRITE(engine, RING_IMR,
2142                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
2143         ENGINE_POSTING_READ(engine, RING_IMR);
2144 }
2145
2146 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
2147 {
2148         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
2149 }
2150
2151 static int gen8_emit_flush(struct i915_request *request, u32 mode)
2152 {
2153         u32 cmd, *cs;
2154
2155         cs = intel_ring_begin(request, 4);
2156         if (IS_ERR(cs))
2157                 return PTR_ERR(cs);
2158
2159         cmd = MI_FLUSH_DW + 1;
2160
2161         /* We always require a command barrier so that subsequent
2162          * commands, such as breadcrumb interrupts, are strictly ordered
2163          * wrt the contents of the write cache being flushed to memory
2164          * (and thus being coherent from the CPU).
2165          */
2166         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2167
2168         if (mode & EMIT_INVALIDATE) {
2169                 cmd |= MI_INVALIDATE_TLB;
2170                 if (request->engine->class == VIDEO_DECODE_CLASS)
2171                         cmd |= MI_INVALIDATE_BSD;
2172         }
2173
2174         *cs++ = cmd;
2175         *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2176         *cs++ = 0; /* upper addr */
2177         *cs++ = 0; /* value */
2178         intel_ring_advance(request, cs);
2179
2180         return 0;
2181 }
2182
2183 static int gen8_emit_flush_render(struct i915_request *request,
2184                                   u32 mode)
2185 {
2186         struct intel_engine_cs *engine = request->engine;
2187         u32 scratch_addr =
2188                 i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES;
2189         bool vf_flush_wa = false, dc_flush_wa = false;
2190         u32 *cs, flags = 0;
2191         int len;
2192
2193         flags |= PIPE_CONTROL_CS_STALL;
2194
2195         if (mode & EMIT_FLUSH) {
2196                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2197                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2198                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2199                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2200         }
2201
2202         if (mode & EMIT_INVALIDATE) {
2203                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2204                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2205                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2206                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2207                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2208                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2209                 flags |= PIPE_CONTROL_QW_WRITE;
2210                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2211
2212                 /*
2213                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2214                  * pipe control.
2215                  */
2216                 if (IS_GEN(request->i915, 9))
2217                         vf_flush_wa = true;
2218
2219                 /* WaForGAMHang:kbl */
2220                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2221                         dc_flush_wa = true;
2222         }
2223
2224         len = 6;
2225
2226         if (vf_flush_wa)
2227                 len += 6;
2228
2229         if (dc_flush_wa)
2230                 len += 12;
2231
2232         cs = intel_ring_begin(request, len);
2233         if (IS_ERR(cs))
2234                 return PTR_ERR(cs);
2235
2236         if (vf_flush_wa)
2237                 cs = gen8_emit_pipe_control(cs, 0, 0);
2238
2239         if (dc_flush_wa)
2240                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2241                                             0);
2242
2243         cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2244
2245         if (dc_flush_wa)
2246                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
2247
2248         intel_ring_advance(request, cs);
2249
2250         return 0;
2251 }
2252
2253 /*
2254  * Reserve space for 2 NOOPs at the end of each request to be
2255  * used as a workaround for not being allowed to do lite
2256  * restore with HEAD==TAIL (WaIdleLiteRestore).
2257  */
2258 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
2259 {
2260         /* Ensure there's always at least one preemption point per-request. */
2261         *cs++ = MI_ARB_CHECK;
2262         *cs++ = MI_NOOP;
2263         request->wa_tail = intel_ring_offset(request, cs);
2264
2265         return cs;
2266 }
2267
2268 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
2269 {
2270         cs = gen8_emit_ggtt_write(cs,
2271                                   request->fence.seqno,
2272                                   request->timeline->hwsp_offset,
2273                                   0);
2274
2275         cs = gen8_emit_ggtt_write(cs,
2276                                   intel_engine_next_hangcheck_seqno(request->engine),
2277                                   I915_GEM_HWS_HANGCHECK_ADDR,
2278                                   MI_FLUSH_DW_STORE_INDEX);
2279
2280
2281         *cs++ = MI_USER_INTERRUPT;
2282         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2283
2284         request->tail = intel_ring_offset(request, cs);
2285         assert_ring_tail_valid(request->ring, request->tail);
2286
2287         return gen8_emit_wa_tail(request, cs);
2288 }
2289
2290 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
2291 {
2292         cs = gen8_emit_ggtt_write_rcs(cs,
2293                                       request->fence.seqno,
2294                                       request->timeline->hwsp_offset,
2295                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2296                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2297                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
2298                                       PIPE_CONTROL_FLUSH_ENABLE |
2299                                       PIPE_CONTROL_CS_STALL);
2300
2301         cs = gen8_emit_ggtt_write_rcs(cs,
2302                                       intel_engine_next_hangcheck_seqno(request->engine),
2303                                       I915_GEM_HWS_HANGCHECK_ADDR,
2304                                       PIPE_CONTROL_STORE_DATA_INDEX);
2305
2306         *cs++ = MI_USER_INTERRUPT;
2307         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2308
2309         request->tail = intel_ring_offset(request, cs);
2310         assert_ring_tail_valid(request->ring, request->tail);
2311
2312         return gen8_emit_wa_tail(request, cs);
2313 }
2314
2315 static int gen8_init_rcs_context(struct i915_request *rq)
2316 {
2317         int ret;
2318
2319         ret = intel_engine_emit_ctx_wa(rq);
2320         if (ret)
2321                 return ret;
2322
2323         ret = intel_rcs_context_init_mocs(rq);
2324         /*
2325          * Failing to program the MOCS is non-fatal.The system will not
2326          * run at peak performance. So generate an error and carry on.
2327          */
2328         if (ret)
2329                 DRM_ERROR("MOCS failed to program: expect performance issues.\n");
2330
2331         return i915_gem_render_state_emit(rq);
2332 }
2333
2334 /**
2335  * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
2336  * @engine: Engine Command Streamer.
2337  */
2338 void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
2339 {
2340         struct drm_i915_private *dev_priv;
2341
2342         /*
2343          * Tasklet cannot be active at this point due intel_mark_active/idle
2344          * so this is just for documentation.
2345          */
2346         if (WARN_ON(test_bit(TASKLET_STATE_SCHED,
2347                              &engine->execlists.tasklet.state)))
2348                 tasklet_kill(&engine->execlists.tasklet);
2349
2350         dev_priv = engine->i915;
2351
2352         if (engine->buffer) {
2353                 WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
2354         }
2355
2356         if (engine->cleanup)
2357                 engine->cleanup(engine);
2358
2359         intel_engine_cleanup_common(engine);
2360
2361         lrc_destroy_wa_ctx(engine);
2362
2363         engine->i915 = NULL;
2364         dev_priv->engine[engine->id] = NULL;
2365         kfree(engine);
2366 }
2367
2368 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
2369 {
2370         engine->submit_request = execlists_submit_request;
2371         engine->cancel_requests = execlists_cancel_requests;
2372         engine->schedule = i915_schedule;
2373         engine->execlists.tasklet.func = execlists_submission_tasklet;
2374
2375         engine->reset.prepare = execlists_reset_prepare;
2376         engine->reset.reset = execlists_reset;
2377         engine->reset.finish = execlists_reset_finish;
2378
2379         engine->park = NULL;
2380         engine->unpark = NULL;
2381
2382         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2383         if (!intel_vgpu_active(engine->i915))
2384                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
2385         if (engine->preempt_context &&
2386             HAS_LOGICAL_RING_PREEMPTION(engine->i915))
2387                 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
2388 }
2389
2390 static void
2391 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
2392 {
2393         /* Default vfuncs which can be overriden by each engine. */
2394         engine->resume = execlists_resume;
2395
2396         engine->reset.prepare = execlists_reset_prepare;
2397         engine->reset.reset = execlists_reset;
2398         engine->reset.finish = execlists_reset_finish;
2399
2400         engine->cops = &execlists_context_ops;
2401         engine->request_alloc = execlists_request_alloc;
2402
2403         engine->emit_flush = gen8_emit_flush;
2404         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
2405         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
2406
2407         engine->set_default_submission = intel_execlists_set_default_submission;
2408
2409         if (INTEL_GEN(engine->i915) < 11) {
2410                 engine->irq_enable = gen8_logical_ring_enable_irq;
2411                 engine->irq_disable = gen8_logical_ring_disable_irq;
2412         } else {
2413                 /*
2414                  * TODO: On Gen11 interrupt masks need to be clear
2415                  * to allow C6 entry. Keep interrupts enabled at
2416                  * and take the hit of generating extra interrupts
2417                  * until a more refined solution exists.
2418                  */
2419         }
2420         if (IS_GEN(engine->i915, 8))
2421                 engine->emit_bb_start = gen8_emit_bb_start;
2422         else
2423                 engine->emit_bb_start = gen9_emit_bb_start;
2424 }
2425
2426 static inline void
2427 logical_ring_default_irqs(struct intel_engine_cs *engine)
2428 {
2429         unsigned int shift = 0;
2430
2431         if (INTEL_GEN(engine->i915) < 11) {
2432                 const u8 irq_shifts[] = {
2433                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
2434                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
2435                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
2436                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
2437                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
2438                 };
2439
2440                 shift = irq_shifts[engine->id];
2441         }
2442
2443         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2444         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
2445 }
2446
2447 static int
2448 logical_ring_setup(struct intel_engine_cs *engine)
2449 {
2450         int err;
2451
2452         err = intel_engine_setup_common(engine);
2453         if (err)
2454                 return err;
2455
2456         /* Intentionally left blank. */
2457         engine->buffer = NULL;
2458
2459         tasklet_init(&engine->execlists.tasklet,
2460                      execlists_submission_tasklet, (unsigned long)engine);
2461
2462         logical_ring_default_vfuncs(engine);
2463         logical_ring_default_irqs(engine);
2464
2465         return 0;
2466 }
2467
2468 static int logical_ring_init(struct intel_engine_cs *engine)
2469 {
2470         struct drm_i915_private *i915 = engine->i915;
2471         struct intel_engine_execlists * const execlists = &engine->execlists;
2472         u32 base = engine->mmio_base;
2473         int ret;
2474
2475         ret = intel_engine_init_common(engine);
2476         if (ret)
2477                 return ret;
2478
2479         intel_engine_init_workarounds(engine);
2480
2481         if (HAS_LOGICAL_RING_ELSQ(i915)) {
2482                 execlists->submit_reg = i915->uncore.regs +
2483                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
2484                 execlists->ctrl_reg = i915->uncore.regs +
2485                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
2486         } else {
2487                 execlists->submit_reg = i915->uncore.regs +
2488                         i915_mmio_reg_offset(RING_ELSP(base));
2489         }
2490
2491         execlists->preempt_complete_status = ~0u;
2492         if (engine->preempt_context)
2493                 execlists->preempt_complete_status =
2494                         upper_32_bits(engine->preempt_context->lrc_desc);
2495
2496         execlists->csb_status =
2497                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
2498
2499         execlists->csb_write =
2500                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
2501
2502         if (INTEL_GEN(engine->i915) < 11)
2503                 execlists->csb_size = GEN8_CSB_ENTRIES;
2504         else
2505                 execlists->csb_size = GEN11_CSB_ENTRIES;
2506
2507         reset_csb_pointers(execlists);
2508
2509         return 0;
2510 }
2511
2512 int logical_render_ring_init(struct intel_engine_cs *engine)
2513 {
2514         int ret;
2515
2516         ret = logical_ring_setup(engine);
2517         if (ret)
2518                 return ret;
2519
2520         /* Override some for render ring. */
2521         engine->init_context = gen8_init_rcs_context;
2522         engine->emit_flush = gen8_emit_flush_render;
2523         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
2524
2525         ret = logical_ring_init(engine);
2526         if (ret)
2527                 return ret;
2528
2529         ret = intel_init_workaround_bb(engine);
2530         if (ret) {
2531                 /*
2532                  * We continue even if we fail to initialize WA batch
2533                  * because we only expect rare glitches but nothing
2534                  * critical to prevent us from using GPU
2535                  */
2536                 DRM_ERROR("WA batch buffer initialization failed: %d\n",
2537                           ret);
2538         }
2539
2540         intel_engine_init_whitelist(engine);
2541
2542         return 0;
2543 }
2544
2545 int logical_xcs_ring_init(struct intel_engine_cs *engine)
2546 {
2547         int err;
2548
2549         err = logical_ring_setup(engine);
2550         if (err)
2551                 return err;
2552
2553         return logical_ring_init(engine);
2554 }
2555
2556 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
2557 {
2558         u32 indirect_ctx_offset;
2559
2560         switch (INTEL_GEN(engine->i915)) {
2561         default:
2562                 MISSING_CASE(INTEL_GEN(engine->i915));
2563                 /* fall through */
2564         case 11:
2565                 indirect_ctx_offset =
2566                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2567                 break;
2568         case 10:
2569                 indirect_ctx_offset =
2570                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2571                 break;
2572         case 9:
2573                 indirect_ctx_offset =
2574                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2575                 break;
2576         case 8:
2577                 indirect_ctx_offset =
2578                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2579                 break;
2580         }
2581
2582         return indirect_ctx_offset;
2583 }
2584
2585 static void execlists_init_reg_state(u32 *regs,
2586                                      struct intel_context *ce,
2587                                      struct intel_engine_cs *engine,
2588                                      struct intel_ring *ring)
2589 {
2590         struct i915_hw_ppgtt *ppgtt = ce->gem_context->ppgtt;
2591         bool rcs = engine->class == RENDER_CLASS;
2592         u32 base = engine->mmio_base;
2593
2594         /* A context is actually a big batch buffer with several
2595          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
2596          * values we are setting here are only for the first context restore:
2597          * on a subsequent save, the GPU will recreate this batchbuffer with new
2598          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
2599          * we are not initializing here).
2600          */
2601         regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
2602                                  MI_LRI_FORCE_POSTED;
2603
2604         CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
2605                 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
2606                 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
2607         if (INTEL_GEN(engine->i915) < 11) {
2608                 regs[CTX_CONTEXT_CONTROL + 1] |=
2609                         _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
2610                                             CTX_CTRL_RS_CTX_ENABLE);
2611         }
2612         CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
2613         CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
2614         CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
2615         CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
2616                 RING_CTL_SIZE(ring->size) | RING_VALID);
2617         CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
2618         CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
2619         CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
2620         CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
2621         CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
2622         CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
2623         if (rcs) {
2624                 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2625
2626                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
2627                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
2628                         RING_INDIRECT_CTX_OFFSET(base), 0);
2629                 if (wa_ctx->indirect_ctx.size) {
2630                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
2631
2632                         regs[CTX_RCS_INDIRECT_CTX + 1] =
2633                                 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
2634                                 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
2635
2636                         regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
2637                                 intel_lr_indirect_ctx_offset(engine) << 6;
2638                 }
2639
2640                 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
2641                 if (wa_ctx->per_ctx.size) {
2642                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
2643
2644                         regs[CTX_BB_PER_CTX_PTR + 1] =
2645                                 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
2646                 }
2647         }
2648
2649         regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
2650
2651         CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
2652         /* PDP values well be assigned later if needed */
2653         CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0);
2654         CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0);
2655         CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0);
2656         CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0);
2657         CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0);
2658         CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0);
2659         CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0);
2660         CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0);
2661
2662         if (i915_vm_is_4lvl(&ppgtt->vm)) {
2663                 /* 64b PPGTT (48bit canonical)
2664                  * PDP0_DESCRIPTOR contains the base address to PML4 and
2665                  * other PDP Descriptors are ignored.
2666                  */
2667                 ASSIGN_CTX_PML4(ppgtt, regs);
2668         } else {
2669                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
2670                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
2671                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
2672                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
2673         }
2674
2675         if (rcs) {
2676                 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
2677                 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
2678
2679                 i915_oa_init_reg_state(engine, ce, regs);
2680         }
2681
2682         regs[CTX_END] = MI_BATCH_BUFFER_END;
2683         if (INTEL_GEN(engine->i915) >= 10)
2684                 regs[CTX_END] |= BIT(0);
2685 }
2686
2687 static int
2688 populate_lr_context(struct intel_context *ce,
2689                     struct drm_i915_gem_object *ctx_obj,
2690                     struct intel_engine_cs *engine,
2691                     struct intel_ring *ring)
2692 {
2693         void *vaddr;
2694         u32 *regs;
2695         int ret;
2696
2697         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
2698         if (IS_ERR(vaddr)) {
2699                 ret = PTR_ERR(vaddr);
2700                 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
2701                 return ret;
2702         }
2703
2704         if (engine->default_state) {
2705                 /*
2706                  * We only want to copy over the template context state;
2707                  * skipping over the headers reserved for GuC communication,
2708                  * leaving those as zero.
2709                  */
2710                 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
2711                 void *defaults;
2712
2713                 defaults = i915_gem_object_pin_map(engine->default_state,
2714                                                    I915_MAP_WB);
2715                 if (IS_ERR(defaults)) {
2716                         ret = PTR_ERR(defaults);
2717                         goto err_unpin_ctx;
2718                 }
2719
2720                 memcpy(vaddr + start, defaults + start, engine->context_size);
2721                 i915_gem_object_unpin_map(engine->default_state);
2722         }
2723
2724         /* The second page of the context object contains some fields which must
2725          * be set up prior to the first execution. */
2726         regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
2727         execlists_init_reg_state(regs, ce, engine, ring);
2728         if (!engine->default_state)
2729                 regs[CTX_CONTEXT_CONTROL + 1] |=
2730                         _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
2731         if (ce->gem_context == engine->i915->preempt_context &&
2732             INTEL_GEN(engine->i915) < 11)
2733                 regs[CTX_CONTEXT_CONTROL + 1] |=
2734                         _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2735                                            CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT);
2736
2737         ret = 0;
2738 err_unpin_ctx:
2739         __i915_gem_object_flush_map(ctx_obj,
2740                                     LRC_HEADER_PAGES * PAGE_SIZE,
2741                                     engine->context_size);
2742         i915_gem_object_unpin_map(ctx_obj);
2743         return ret;
2744 }
2745
2746 static struct i915_timeline *get_timeline(struct i915_gem_context *ctx)
2747 {
2748         if (ctx->timeline)
2749                 return i915_timeline_get(ctx->timeline);
2750         else
2751                 return i915_timeline_create(ctx->i915, NULL);
2752 }
2753
2754 static int execlists_context_deferred_alloc(struct intel_context *ce,
2755                                             struct intel_engine_cs *engine)
2756 {
2757         struct drm_i915_gem_object *ctx_obj;
2758         struct i915_vma *vma;
2759         u32 context_size;
2760         struct intel_ring *ring;
2761         struct i915_timeline *timeline;
2762         int ret;
2763
2764         if (ce->state)
2765                 return 0;
2766
2767         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
2768
2769         /*
2770          * Before the actual start of the context image, we insert a few pages
2771          * for our own use and for sharing with the GuC.
2772          */
2773         context_size += LRC_HEADER_PAGES * PAGE_SIZE;
2774
2775         ctx_obj = i915_gem_object_create(engine->i915, context_size);
2776         if (IS_ERR(ctx_obj))
2777                 return PTR_ERR(ctx_obj);
2778
2779         vma = i915_vma_instance(ctx_obj, &engine->i915->ggtt.vm, NULL);
2780         if (IS_ERR(vma)) {
2781                 ret = PTR_ERR(vma);
2782                 goto error_deref_obj;
2783         }
2784
2785         timeline = get_timeline(ce->gem_context);
2786         if (IS_ERR(timeline)) {
2787                 ret = PTR_ERR(timeline);
2788                 goto error_deref_obj;
2789         }
2790
2791         ring = intel_engine_create_ring(engine,
2792                                         timeline,
2793                                         ce->gem_context->ring_size);
2794         i915_timeline_put(timeline);
2795         if (IS_ERR(ring)) {
2796                 ret = PTR_ERR(ring);
2797                 goto error_deref_obj;
2798         }
2799
2800         ret = populate_lr_context(ce, ctx_obj, engine, ring);
2801         if (ret) {
2802                 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
2803                 goto error_ring_free;
2804         }
2805
2806         ce->ring = ring;
2807         ce->state = vma;
2808
2809         return 0;
2810
2811 error_ring_free:
2812         intel_ring_put(ring);
2813 error_deref_obj:
2814         i915_gem_object_put(ctx_obj);
2815         return ret;
2816 }
2817
2818 void intel_execlists_show_requests(struct intel_engine_cs *engine,
2819                                    struct drm_printer *m,
2820                                    void (*show_request)(struct drm_printer *m,
2821                                                         struct i915_request *rq,
2822                                                         const char *prefix),
2823                                    unsigned int max)
2824 {
2825         const struct intel_engine_execlists *execlists = &engine->execlists;
2826         struct i915_request *rq, *last;
2827         unsigned long flags;
2828         unsigned int count;
2829         struct rb_node *rb;
2830
2831         spin_lock_irqsave(&engine->timeline.lock, flags);
2832
2833         last = NULL;
2834         count = 0;
2835         list_for_each_entry(rq, &engine->timeline.requests, link) {
2836                 if (count++ < max - 1)
2837                         show_request(m, rq, "\t\tE ");
2838                 else
2839                         last = rq;
2840         }
2841         if (last) {
2842                 if (count > max) {
2843                         drm_printf(m,
2844                                    "\t\t...skipping %d executing requests...\n",
2845                                    count - max);
2846                 }
2847                 show_request(m, last, "\t\tE ");
2848         }
2849
2850         last = NULL;
2851         count = 0;
2852         if (execlists->queue_priority_hint != INT_MIN)
2853                 drm_printf(m, "\t\tQueue priority hint: %d\n",
2854                            execlists->queue_priority_hint);
2855         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
2856                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
2857                 int i;
2858
2859                 priolist_for_each_request(rq, p, i) {
2860                         if (count++ < max - 1)
2861                                 show_request(m, rq, "\t\tQ ");
2862                         else
2863                                 last = rq;
2864                 }
2865         }
2866         if (last) {
2867                 if (count > max) {
2868                         drm_printf(m,
2869                                    "\t\t...skipping %d queued requests...\n",
2870                                    count - max);
2871                 }
2872                 show_request(m, last, "\t\tQ ");
2873         }
2874
2875         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2876 }
2877
2878 void intel_lr_context_reset(struct intel_engine_cs *engine,
2879                             struct intel_context *ce,
2880                             u32 head,
2881                             bool scrub)
2882 {
2883         /*
2884          * We want a simple context + ring to execute the breadcrumb update.
2885          * We cannot rely on the context being intact across the GPU hang,
2886          * so clear it and rebuild just what we need for the breadcrumb.
2887          * All pending requests for this context will be zapped, and any
2888          * future request will be after userspace has had the opportunity
2889          * to recreate its own state.
2890          */
2891         if (scrub) {
2892                 u32 *regs = ce->lrc_reg_state;
2893
2894                 if (engine->pinned_default_state) {
2895                         memcpy(regs, /* skip restoring the vanilla PPHWSP */
2896                                engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
2897                                engine->context_size - PAGE_SIZE);
2898                 }
2899                 execlists_init_reg_state(regs, ce, engine, ce->ring);
2900         }
2901
2902         /* Rerun the request; its payload has been neutered (if guilty). */
2903         ce->ring->head = head;
2904         intel_ring_update_space(ce->ring);
2905
2906         __execlists_update_reg_state(ce, engine);
2907 }
2908
2909 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2910 #include "selftest_lrc.c"
2911 #endif