drivers/gpu/drm/i915/gt/intel_ringbuffer.c

   1 /*
   2  * Copyright © 2008-2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *    Zou Nan hai <nanhai.zou@intel.com>
  26  *    Xiang Hai hao<haihao.xiang@intel.com>
  27  *
  28  */
  29
  30 #include <linux/log2.h>
  31
  32 #include <drm/i915_drm.h>
  33
  34 #include "i915_drv.h"
  35 #include "i915_gem_render_state.h"
  36 #include "i915_trace.h"
  37 #include "intel_reset.h"
  38 #include "intel_workarounds.h"
  39
  40 /* Rough estimate of the typical request size, performing a flush,
  41  * set-context and then emitting the batch.
  42  */
  43 #define LEGACY_REQUEST_SIZE 200
  44
  45 unsigned int intel_ring_update_space(struct intel_ring *ring)
  46 {
  47         unsigned int space;
  48
  49         space = __intel_ring_space(ring->head, ring->emit, ring->size);
  50
  51         ring->space = space;
  52         return space;
  53 }
  54
  55 static int
  56 gen2_render_ring_flush(struct i915_request *rq, u32 mode)
  57 {
  58         unsigned int num_store_dw;
  59         u32 cmd, *cs;
  60
  61         cmd = MI_FLUSH;
  62         num_store_dw = 0;
  63         if (mode & EMIT_INVALIDATE)
  64                 cmd |= MI_READ_FLUSH;
  65         if (mode & EMIT_FLUSH)
  66                 num_store_dw = 4;
  67
  68         cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
  69         if (IS_ERR(cs))
  70                 return PTR_ERR(cs);
  71
  72         *cs++ = cmd;
  73         while (num_store_dw--) {
  74                 *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
  75                 *cs++ = i915_scratch_offset(rq->i915);
  76                 *cs++ = 0;
  77         }
  78         *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
  79
  80         intel_ring_advance(rq, cs);
  81
  82         return 0;
  83 }
  84
  85 static int
  86 gen4_render_ring_flush(struct i915_request *rq, u32 mode)
  87 {
  88         u32 cmd, *cs;
  89         int i;
  90
  91         /*
  92          * read/write caches:
  93          *
  94          * I915_GEM_DOMAIN_RENDER is always invalidated, but is
  95          * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
  96          * also flushed at 2d versus 3d pipeline switches.
  97          *
  98          * read-only caches:
  99          *
 100          * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
 101          * MI_READ_FLUSH is set, and is always flushed on 965.
 102          *
 103          * I915_GEM_DOMAIN_COMMAND may not exist?
 104          *
 105          * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
 106          * invalidated when MI_EXE_FLUSH is set.
 107          *
 108          * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
 109          * invalidated with every MI_FLUSH.
 110          *
 111          * TLBs:
 112          *
 113          * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
 114          * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and
 115          * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
 116          * are flushed at any MI_FLUSH.
 117          */
 118
 119         cmd = MI_FLUSH;
 120         if (mode & EMIT_INVALIDATE) {
 121                 cmd |= MI_EXE_FLUSH;
 122                 if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5))
 123                         cmd |= MI_INVALIDATE_ISP;
 124         }
 125
 126         i = 2;
 127         if (mode & EMIT_INVALIDATE)
 128                 i += 20;
 129
 130         cs = intel_ring_begin(rq, i);
 131         if (IS_ERR(cs))
 132                 return PTR_ERR(cs);
 133
 134         *cs++ = cmd;
 135
 136         /*
 137          * A random delay to let the CS invalidate take effect? Without this
 138          * delay, the GPU relocation path fails as the CS does not see
 139          * the updated contents. Just as important, if we apply the flushes
 140          * to the EMIT_FLUSH branch (i.e. immediately after the relocation
 141          * write and before the invalidate on the next batch), the relocations
 142          * still fail. This implies that is a delay following invalidation
 143          * that is required to reset the caches as opposed to a delay to
 144          * ensure the memory is written.
 145          */
 146         if (mode & EMIT_INVALIDATE) {
 147                 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
 148                 *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
 149                 *cs++ = 0;
 150                 *cs++ = 0;
 151
 152                 for (i = 0; i < 12; i++)
 153                         *cs++ = MI_FLUSH;
 154
 155                 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
 156                 *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
 157                 *cs++ = 0;
 158                 *cs++ = 0;
 159         }
 160
 161         *cs++ = cmd;
 162
 163         intel_ring_advance(rq, cs);
 164
 165         return 0;
 166 }
 167
 168 /*
 169  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 170  * implementing two workarounds on gen6.  From section 1.4.7.1
 171  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 172  *
 173  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 174  * produced by non-pipelined state commands), software needs to first
 175  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 176  * 0.
 177  *
 178  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 179  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 180  *
 181  * And the workaround for these two requires this workaround first:
 182  *
 183  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 184  * BEFORE the pipe-control with a post-sync op and no write-cache
 185  * flushes.
 186  *
 187  * And this last workaround is tricky because of the requirements on
 188  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 189  * volume 2 part 1:
 190  *
 191  *     "1 of the following must also be set:
 192  *      - Render Target Cache Flush Enable ([12] of DW1)
 193  *      - Depth Cache Flush Enable ([0] of DW1)
 194  *      - Stall at Pixel Scoreboard ([1] of DW1)
 195  *      - Depth Stall ([13] of DW1)
 196  *      - Post-Sync Operation ([13] of DW1)
 197  *      - Notify Enable ([8] of DW1)"
 198  *
 199  * The cache flushes require the workaround flush that triggered this
 200  * one, so we can't use it.  Depth stall would trigger the same.
 201  * Post-sync nonzero is what triggered this second workaround, so we
 202  * can't use that one either.  Notify enable is IRQs, which aren't
 203  * really our business.  That leaves only stall at scoreboard.
 204  */
 205 static int
 206 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
 207 {
 208         u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 209         u32 *cs;
 210
 211         cs = intel_ring_begin(rq, 6);
 212         if (IS_ERR(cs))
 213                 return PTR_ERR(cs);
 214
 215         *cs++ = GFX_OP_PIPE_CONTROL(5);
 216         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 217         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 218         *cs++ = 0; /* low dword */
 219         *cs++ = 0; /* high dword */
 220         *cs++ = MI_NOOP;
 221         intel_ring_advance(rq, cs);
 222
 223         cs = intel_ring_begin(rq, 6);
 224         if (IS_ERR(cs))
 225                 return PTR_ERR(cs);
 226
 227         *cs++ = GFX_OP_PIPE_CONTROL(5);
 228         *cs++ = PIPE_CONTROL_QW_WRITE;
 229         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 230         *cs++ = 0;
 231         *cs++ = 0;
 232         *cs++ = MI_NOOP;
 233         intel_ring_advance(rq, cs);
 234
 235         return 0;
 236 }
 237
 238 static int
 239 gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 240 {
 241         u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 242         u32 *cs, flags = 0;
 243         int ret;
 244
 245         /* Force SNB workarounds for PIPE_CONTROL flushes */
 246         ret = gen6_emit_post_sync_nonzero_flush(rq);
 247         if (ret)
 248                 return ret;
 249
 250         /* Just flush everything.  Experiments have shown that reducing the
 251          * number of bits based on the write domains has little performance
 252          * impact.
 253          */
 254         if (mode & EMIT_FLUSH) {
 255                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 256                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 257                 /*
 258                  * Ensure that any following seqno writes only happen
 259                  * when the render cache is indeed flushed.
 260                  */
 261                 flags |= PIPE_CONTROL_CS_STALL;
 262         }
 263         if (mode & EMIT_INVALIDATE) {
 264                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
 265                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 266                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 267                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 268                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 269                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 270                 /*
 271                  * TLB invalidate requires a post-sync write.
 272                  */
 273                 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 274         }
 275
 276         cs = intel_ring_begin(rq, 4);
 277         if (IS_ERR(cs))
 278                 return PTR_ERR(cs);
 279
 280         *cs++ = GFX_OP_PIPE_CONTROL(4);
 281         *cs++ = flags;
 282         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 283         *cs++ = 0;
 284         intel_ring_advance(rq, cs);
 285
 286         return 0;
 287 }
 288
 289 static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 290 {
 291         /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
 292         *cs++ = GFX_OP_PIPE_CONTROL(4);
 293         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 294         *cs++ = 0;
 295         *cs++ = 0;
 296
 297         *cs++ = GFX_OP_PIPE_CONTROL(4);
 298         *cs++ = PIPE_CONTROL_QW_WRITE;
 299         *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
 300         *cs++ = 0;
 301
 302         /* Finally we can flush and with it emit the breadcrumb */
 303         *cs++ = GFX_OP_PIPE_CONTROL(4);
 304         *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 305                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 306                  PIPE_CONTROL_DC_FLUSH_ENABLE |
 307                  PIPE_CONTROL_QW_WRITE |
 308                  PIPE_CONTROL_CS_STALL);
 309         *cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
 310         *cs++ = rq->fence.seqno;
 311
 312         *cs++ = MI_USER_INTERRUPT;
 313         *cs++ = MI_NOOP;
 314
 315         rq->tail = intel_ring_offset(rq, cs);
 316         assert_ring_tail_valid(rq->ring, rq->tail);
 317
 318         return cs;
 319 }
 320
 321 static int
 322 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
 323 {
 324         u32 *cs;
 325
 326         cs = intel_ring_begin(rq, 4);
 327         if (IS_ERR(cs))
 328                 return PTR_ERR(cs);
 329
 330         *cs++ = GFX_OP_PIPE_CONTROL(4);
 331         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 332         *cs++ = 0;
 333         *cs++ = 0;
 334         intel_ring_advance(rq, cs);
 335
 336         return 0;
 337 }
 338
 339 static int
 340 gen7_render_ring_flush(struct i915_request *rq, u32 mode)
 341 {
 342         u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
 343         u32 *cs, flags = 0;
 344
 345         /*
 346          * Ensure that any following seqno writes only happen when the render
 347          * cache is indeed flushed.
 348          *
 349          * Workaround: 4th PIPE_CONTROL command (except the ones with only
 350          * read-cache invalidate bits set) must have the CS_STALL bit set. We
 351          * don't try to be clever and just set it unconditionally.
 352          */
 353         flags |= PIPE_CONTROL_CS_STALL;
 354
 355         /* Just flush everything.  Experiments have shown that reducing the
 356          * number of bits based on the write domains has little performance
 357          * impact.
 358          */
 359         if (mode & EMIT_FLUSH) {
 360                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 361                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 362                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 363                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
 364         }
 365         if (mode & EMIT_INVALIDATE) {
 366                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
 367                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 368                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 369                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 370                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 371                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 372                 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
 373                 /*
 374                  * TLB invalidate requires a post-sync write.
 375                  */
 376                 flags |= PIPE_CONTROL_QW_WRITE;
 377                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
 378
 379                 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
 380
 381                 /* Workaround: we must issue a pipe_control with CS-stall bit
 382                  * set before a pipe_control command that has the state cache
 383                  * invalidate bit set. */
 384                 gen7_render_ring_cs_stall_wa(rq);
 385         }
 386
 387         cs = intel_ring_begin(rq, 4);
 388         if (IS_ERR(cs))
 389                 return PTR_ERR(cs);
 390
 391         *cs++ = GFX_OP_PIPE_CONTROL(4);
 392         *cs++ = flags;
 393         *cs++ = scratch_addr;
 394         *cs++ = 0;
 395         intel_ring_advance(rq, cs);
 396
 397         return 0;
 398 }
 399
 400 static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 401 {
 402         *cs++ = GFX_OP_PIPE_CONTROL(4);
 403         *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 404                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 405                  PIPE_CONTROL_DC_FLUSH_ENABLE |
 406                  PIPE_CONTROL_FLUSH_ENABLE |
 407                  PIPE_CONTROL_QW_WRITE |
 408                  PIPE_CONTROL_GLOBAL_GTT_IVB |
 409                  PIPE_CONTROL_CS_STALL);
 410         *cs++ = rq->timeline->hwsp_offset;
 411         *cs++ = rq->fence.seqno;
 412
 413         *cs++ = MI_USER_INTERRUPT;
 414         *cs++ = MI_NOOP;
 415
 416         rq->tail = intel_ring_offset(rq, cs);
 417         assert_ring_tail_valid(rq->ring, rq->tail);
 418
 419         return cs;
 420 }
 421
 422 static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 423 {
 424         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 425         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 426
 427         *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 428         *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 429         *cs++ = rq->fence.seqno;
 430
 431         *cs++ = MI_USER_INTERRUPT;
 432
 433         rq->tail = intel_ring_offset(rq, cs);
 434         assert_ring_tail_valid(rq->ring, rq->tail);
 435
 436         return cs;
 437 }
 438
 439 #define GEN7_XCS_WA 32
 440 static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 441 {
 442         int i;
 443
 444         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 445         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 446
 447         *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 448         *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 449         *cs++ = rq->fence.seqno;
 450
 451         for (i = 0; i < GEN7_XCS_WA; i++) {
 452                 *cs++ = MI_STORE_DWORD_INDEX;
 453                 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
 454                 *cs++ = rq->fence.seqno;
 455         }
 456
 457         *cs++ = MI_FLUSH_DW;
 458         *cs++ = 0;
 459         *cs++ = 0;
 460
 461         *cs++ = MI_USER_INTERRUPT;
 462         *cs++ = MI_NOOP;
 463
 464         rq->tail = intel_ring_offset(rq, cs);
 465         assert_ring_tail_valid(rq->ring, rq->tail);
 466
 467         return cs;
 468 }
 469 #undef GEN7_XCS_WA
 470
 471 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
 472 {
 473         /*
 474          * Keep the render interrupt unmasked as this papers over
 475          * lost interrupts following a reset.
 476          */
 477         if (engine->class == RENDER_CLASS) {
 478                 if (INTEL_GEN(engine->i915) >= 6)
 479                         mask &= ~BIT(0);
 480                 else
 481                         mask &= ~I915_USER_INTERRUPT;
 482         }
 483
 484         intel_engine_set_hwsp_writemask(engine, mask);
 485 }
 486
 487 static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
 488 {
 489         struct drm_i915_private *dev_priv = engine->i915;
 490         u32 addr;
 491
 492         addr = lower_32_bits(phys);
 493         if (INTEL_GEN(dev_priv) >= 4)
 494                 addr |= (phys >> 28) & 0xf0;
 495
 496         I915_WRITE(HWS_PGA, addr);
 497 }
 498
 499 static struct page *status_page(struct intel_engine_cs *engine)
 500 {
 501         struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
 502
 503         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
 504         return sg_page(obj->mm.pages->sgl);
 505 }
 506
 507 static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
 508 {
 509         set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
 510         set_hwstam(engine, ~0u);
 511 }
 512
 513 static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
 514 {
 515         struct drm_i915_private *dev_priv = engine->i915;
 516         i915_reg_t hwsp;
 517
 518         /*
 519          * The ring status page addresses are no longer next to the rest of
 520          * the ring registers as of gen7.
 521          */
 522         if (IS_GEN(dev_priv, 7)) {
 523                 switch (engine->id) {
 524                 /*
 525                  * No more rings exist on Gen7. Default case is only to shut up
 526                  * gcc switch check warning.
 527                  */
 528                 default:
 529                         GEM_BUG_ON(engine->id);
 530                         /* fallthrough */
 531                 case RCS0:
 532                         hwsp = RENDER_HWS_PGA_GEN7;
 533                         break;
 534                 case BCS0:
 535                         hwsp = BLT_HWS_PGA_GEN7;
 536                         break;
 537                 case VCS0:
 538                         hwsp = BSD_HWS_PGA_GEN7;
 539                         break;
 540                 case VECS0:
 541                         hwsp = VEBOX_HWS_PGA_GEN7;
 542                         break;
 543                 }
 544         } else if (IS_GEN(dev_priv, 6)) {
 545                 hwsp = RING_HWS_PGA_GEN6(engine->mmio_base);
 546         } else {
 547                 hwsp = RING_HWS_PGA(engine->mmio_base);
 548         }
 549
 550         I915_WRITE(hwsp, offset);
 551         POSTING_READ(hwsp);
 552 }
 553
 554 static void flush_cs_tlb(struct intel_engine_cs *engine)
 555 {
 556         struct drm_i915_private *dev_priv = engine->i915;
 557
 558         if (!IS_GEN_RANGE(dev_priv, 6, 7))
 559                 return;
 560
 561         /* ring should be idle before issuing a sync flush*/
 562         WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
 563
 564         ENGINE_WRITE(engine, RING_INSTPM,
 565                      _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
 566                                         INSTPM_SYNC_FLUSH));
 567         if (intel_wait_for_register(engine->uncore,
 568                                     RING_INSTPM(engine->mmio_base),
 569                                     INSTPM_SYNC_FLUSH, 0,
 570                                     1000))
 571                 DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
 572                           engine->name);
 573 }
 574
 575 static void ring_setup_status_page(struct intel_engine_cs *engine)
 576 {
 577         set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
 578         set_hwstam(engine, ~0u);
 579
 580         flush_cs_tlb(engine);
 581 }
 582
 583 static bool stop_ring(struct intel_engine_cs *engine)
 584 {
 585         struct drm_i915_private *dev_priv = engine->i915;
 586
 587         if (INTEL_GEN(dev_priv) > 2) {
 588                 ENGINE_WRITE(engine,
 589                              RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
 590                 if (intel_wait_for_register(engine->uncore,
 591                                             RING_MI_MODE(engine->mmio_base),
 592                                             MODE_IDLE,
 593                                             MODE_IDLE,
 594                                             1000)) {
 595                         DRM_ERROR("%s : timed out trying to stop ring\n",
 596                                   engine->name);
 597
 598                         /*
 599                          * Sometimes we observe that the idle flag is not
 600                          * set even though the ring is empty. So double
 601                          * check before giving up.
 602                          */
 603                         if (ENGINE_READ(engine, RING_HEAD) !=
 604                             ENGINE_READ(engine, RING_TAIL))
 605                                 return false;
 606                 }
 607         }
 608
 609         ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
 610
 611         ENGINE_WRITE(engine, RING_HEAD, 0);
 612         ENGINE_WRITE(engine, RING_TAIL, 0);
 613
 614         /* The ring must be empty before it is disabled */
 615         ENGINE_WRITE(engine, RING_CTL, 0);
 616
 617         return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
 618 }
 619
 620 static int xcs_resume(struct intel_engine_cs *engine)
 621 {
 622         struct drm_i915_private *dev_priv = engine->i915;
 623         struct intel_ring *ring = engine->buffer;
 624         int ret = 0;
 625
 626         GEM_TRACE("%s: ring:{HEAD:%04x, TAIL:%04x}\n",
 627                   engine->name, ring->head, ring->tail);
 628
 629         intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
 630
 631         if (!stop_ring(engine)) {
 632                 /* G45 ring initialization often fails to reset head to zero */
 633                 DRM_DEBUG_DRIVER("%s head not reset to zero "
 634                                 "ctl %08x head %08x tail %08x start %08x\n",
 635                                 engine->name,
 636                                 ENGINE_READ(engine, RING_CTL),
 637                                 ENGINE_READ(engine, RING_HEAD),
 638                                 ENGINE_READ(engine, RING_TAIL),
 639                                 ENGINE_READ(engine, RING_START));
 640
 641                 if (!stop_ring(engine)) {
 642                         DRM_ERROR("failed to set %s head to zero "
 643                                   "ctl %08x head %08x tail %08x start %08x\n",
 644                                   engine->name,
 645                                   ENGINE_READ(engine, RING_CTL),
 646                                   ENGINE_READ(engine, RING_HEAD),
 647                                   ENGINE_READ(engine, RING_TAIL),
 648                                   ENGINE_READ(engine, RING_START));
 649                         ret = -EIO;
 650                         goto out;
 651                 }
 652         }
 653
 654         if (HWS_NEEDS_PHYSICAL(dev_priv))
 655                 ring_setup_phys_status_page(engine);
 656         else
 657                 ring_setup_status_page(engine);
 658
 659         intel_engine_reset_breadcrumbs(engine);
 660
 661         /* Enforce ordering by reading HEAD register back */
 662         ENGINE_READ(engine, RING_HEAD);
 663
 664         /* Initialize the ring. This must happen _after_ we've cleared the ring
 665          * registers with the above sequence (the readback of the HEAD registers
 666          * also enforces ordering), otherwise the hw might lose the new ring
 667          * register values. */
 668         ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
 669
 670         /* WaClearRingBufHeadRegAtInit:ctg,elk */
 671         if (ENGINE_READ(engine, RING_HEAD))
 672                 DRM_DEBUG_DRIVER("%s initialization failed [head=%08x], fudging\n",
 673                                  engine->name, ENGINE_READ(engine, RING_HEAD));
 674
 675         /* Check that the ring offsets point within the ring! */
 676         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
 677         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
 678         intel_ring_update_space(ring);
 679
 680         /* First wake the ring up to an empty/idle ring */
 681         ENGINE_WRITE(engine, RING_HEAD, ring->head);
 682         ENGINE_WRITE(engine, RING_TAIL, ring->head);
 683         ENGINE_POSTING_READ(engine, RING_TAIL);
 684
 685         ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
 686
 687         /* If the head is still not zero, the ring is dead */
 688         if (intel_wait_for_register(engine->uncore,
 689                                     RING_CTL(engine->mmio_base),
 690                                     RING_VALID, RING_VALID,
 691                                     50)) {
 692                 DRM_ERROR("%s initialization failed "
 693                           "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
 694                           engine->name,
 695                           ENGINE_READ(engine, RING_CTL),
 696                           ENGINE_READ(engine, RING_CTL) & RING_VALID,
 697                           ENGINE_READ(engine, RING_HEAD), ring->head,
 698                           ENGINE_READ(engine, RING_TAIL), ring->tail,
 699                           ENGINE_READ(engine, RING_START),
 700                           i915_ggtt_offset(ring->vma));
 701                 ret = -EIO;
 702                 goto out;
 703         }
 704
 705         if (INTEL_GEN(dev_priv) > 2)
 706                 ENGINE_WRITE(engine,
 707                              RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
 708
 709         /* Now awake, let it get started */
 710         if (ring->tail != ring->head) {
 711                 ENGINE_WRITE(engine, RING_TAIL, ring->tail);
 712                 ENGINE_POSTING_READ(engine, RING_TAIL);
 713         }
 714
 715         /* Papering over lost _interrupts_ immediately following the restart */
 716         intel_engine_queue_breadcrumbs(engine);
 717 out:
 718         intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
 719
 720         return ret;
 721 }
 722
 723 static void reset_prepare(struct intel_engine_cs *engine)
 724 {
 725         intel_engine_stop_cs(engine);
 726 }
 727
 728 static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 729 {
 730         struct i915_timeline *tl = &engine->timeline;
 731         struct i915_request *pos, *rq;
 732         unsigned long flags;
 733         u32 head;
 734
 735         rq = NULL;
 736         spin_lock_irqsave(&tl->lock, flags);
 737         list_for_each_entry(pos, &tl->requests, link) {
 738                 if (!i915_request_completed(pos)) {
 739                         rq = pos;
 740                         break;
 741                 }
 742         }
 743
 744         /*
 745          * The guilty request will get skipped on a hung engine.
 746          *
 747          * Users of client default contexts do not rely on logical
 748          * state preserved between batches so it is safe to execute
 749          * queued requests following the hang. Non default contexts
 750          * rely on preserved state, so skipping a batch loses the
 751          * evolution of the state and it needs to be considered corrupted.
 752          * Executing more queued batches on top of corrupted state is
 753          * risky. But we take the risk by trying to advance through
 754          * the queued requests in order to make the client behaviour
 755          * more predictable around resets, by not throwing away random
 756          * amount of batches it has prepared for execution. Sophisticated
 757          * clients can use gem_reset_stats_ioctl and dma fence status
 758          * (exported via sync_file info ioctl on explicit fences) to observe
 759          * when it loses the context state and should rebuild accordingly.
 760          *
 761          * The context ban, and ultimately the client ban, mechanism are safety
 762          * valves if client submission ends up resulting in nothing more than
 763          * subsequent hangs.
 764          */
 765
 766         if (rq) {
 767                 /*
 768                  * Try to restore the logical GPU state to match the
 769                  * continuation of the request queue. If we skip the
 770                  * context/PD restore, then the next request may try to execute
 771                  * assuming that its context is valid and loaded on the GPU and
 772                  * so may try to access invalid memory, prompting repeated GPU
 773                  * hangs.
 774                  *
 775                  * If the request was guilty, we still restore the logical
 776                  * state in case the next request requires it (e.g. the
 777                  * aliasing ppgtt), but skip over the hung batch.
 778                  *
 779                  * If the request was innocent, we try to replay the request
 780                  * with the restored context.
 781                  */
 782                 i915_reset_request(rq, stalled);
 783
 784                 GEM_BUG_ON(rq->ring != engine->buffer);
 785                 head = rq->head;
 786         } else {
 787                 head = engine->buffer->tail;
 788         }
 789         engine->buffer->head = intel_ring_wrap(engine->buffer, head);
 790
 791         spin_unlock_irqrestore(&tl->lock, flags);
 792 }
 793
 794 static void reset_finish(struct intel_engine_cs *engine)
 795 {
 796 }
 797
 798 static int intel_rcs_ctx_init(struct i915_request *rq)
 799 {
 800         int ret;
 801
 802         ret = intel_engine_emit_ctx_wa(rq);
 803         if (ret != 0)
 804                 return ret;
 805
 806         ret = i915_gem_render_state_emit(rq);
 807         if (ret)
 808                 return ret;
 809
 810         return 0;
 811 }
 812
 813 static int rcs_resume(struct intel_engine_cs *engine)
 814 {
 815         struct drm_i915_private *dev_priv = engine->i915;
 816
 817         /*
 818          * Disable CONSTANT_BUFFER before it is loaded from the context
 819          * image. For as it is loaded, it is executed and the stored
 820          * address may no longer be valid, leading to a GPU hang.
 821          *
 822          * This imposes the requirement that userspace reload their
 823          * CONSTANT_BUFFER on every batch, fortunately a requirement
 824          * they are already accustomed to from before contexts were
 825          * enabled.
 826          */
 827         if (IS_GEN(dev_priv, 4))
 828                 I915_WRITE(ECOSKPD,
 829                            _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE));
 830
 831         /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
 832         if (IS_GEN_RANGE(dev_priv, 4, 6))
 833                 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
 834
 835         /* We need to disable the AsyncFlip performance optimisations in order
 836          * to use MI_WAIT_FOR_EVENT within the CS. It should already be
 837          * programmed to '1' on all products.
 838          *
 839          * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
 840          */
 841         if (IS_GEN_RANGE(dev_priv, 6, 7))
 842                 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
 843
 844         /* Required for the hardware to program scanline values for waiting */
 845         /* WaEnableFlushTlbInvalidationMode:snb */
 846         if (IS_GEN(dev_priv, 6))
 847                 I915_WRITE(GFX_MODE,
 848                            _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
 849
 850         /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
 851         if (IS_GEN(dev_priv, 7))
 852                 I915_WRITE(GFX_MODE_GEN7,
 853                            _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
 854                            _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
 855
 856         if (IS_GEN(dev_priv, 6)) {
 857                 /* From the Sandybridge PRM, volume 1 part 3, page 24:
 858                  * "If this bit is set, STCunit will have LRA as replacement
 859                  *  policy. [...] This bit must be reset.  LRA replacement
 860                  *  policy is not supported."
 861                  */
 862                 I915_WRITE(CACHE_MODE_0,
 863                            _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
 864         }
 865
 866         if (IS_GEN_RANGE(dev_priv, 6, 7))
 867                 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
 868
 869         return xcs_resume(engine);
 870 }
 871
 872 static void cancel_requests(struct intel_engine_cs *engine)
 873 {
 874         struct i915_request *request;
 875         unsigned long flags;
 876
 877         spin_lock_irqsave(&engine->timeline.lock, flags);
 878
 879         /* Mark all submitted requests as skipped. */
 880         list_for_each_entry(request, &engine->timeline.requests, link) {
 881                 if (!i915_request_signaled(request))
 882                         dma_fence_set_error(&request->fence, -EIO);
 883
 884                 i915_request_mark_complete(request);
 885         }
 886
 887         /* Remaining _unready_ requests will be nop'ed when submitted */
 888
 889         spin_unlock_irqrestore(&engine->timeline.lock, flags);
 890 }
 891
 892 static void i9xx_submit_request(struct i915_request *request)
 893 {
 894         i915_request_submit(request);
 895
 896         ENGINE_WRITE(request->engine, RING_TAIL,
 897                      intel_ring_set_tail(request->ring, request->tail));
 898 }
 899
 900 static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 901 {
 902         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 903         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 904
 905         *cs++ = MI_FLUSH;
 906
 907         *cs++ = MI_STORE_DWORD_INDEX;
 908         *cs++ = I915_GEM_HWS_SEQNO_ADDR;
 909         *cs++ = rq->fence.seqno;
 910
 911         *cs++ = MI_USER_INTERRUPT;
 912         *cs++ = MI_NOOP;
 913
 914         rq->tail = intel_ring_offset(rq, cs);
 915         assert_ring_tail_valid(rq->ring, rq->tail);
 916
 917         return cs;
 918 }
 919
 920 #define GEN5_WA_STORES 8 /* must be at least 1! */
 921 static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 922 {
 923         int i;
 924
 925         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
 926         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
 927
 928         *cs++ = MI_FLUSH;
 929
 930         BUILD_BUG_ON(GEN5_WA_STORES < 1);
 931         for (i = 0; i < GEN5_WA_STORES; i++) {
 932                 *cs++ = MI_STORE_DWORD_INDEX;
 933                 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
 934                 *cs++ = rq->fence.seqno;
 935         }
 936
 937         *cs++ = MI_USER_INTERRUPT;
 938
 939         rq->tail = intel_ring_offset(rq, cs);
 940         assert_ring_tail_valid(rq->ring, rq->tail);
 941
 942         return cs;
 943 }
 944 #undef GEN5_WA_STORES
 945
 946 static void
 947 gen5_irq_enable(struct intel_engine_cs *engine)
 948 {
 949         gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
 950 }
 951
 952 static void
 953 gen5_irq_disable(struct intel_engine_cs *engine)
 954 {
 955         gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
 956 }
 957
 958 static void
 959 i9xx_irq_enable(struct intel_engine_cs *engine)
 960 {
 961         engine->i915->irq_mask &= ~engine->irq_enable_mask;
 962         intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
 963         intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
 964 }
 965
 966 static void
 967 i9xx_irq_disable(struct intel_engine_cs *engine)
 968 {
 969         engine->i915->irq_mask |= engine->irq_enable_mask;
 970         intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
 971 }
 972
 973 static void
 974 i8xx_irq_enable(struct intel_engine_cs *engine)
 975 {
 976         struct drm_i915_private *dev_priv = engine->i915;
 977
 978         dev_priv->irq_mask &= ~engine->irq_enable_mask;
 979         I915_WRITE16(GEN2_IMR, dev_priv->irq_mask);
 980         POSTING_READ16(RING_IMR(engine->mmio_base));
 981 }
 982
 983 static void
 984 i8xx_irq_disable(struct intel_engine_cs *engine)
 985 {
 986         struct drm_i915_private *dev_priv = engine->i915;
 987
 988         dev_priv->irq_mask |= engine->irq_enable_mask;
 989         I915_WRITE16(GEN2_IMR, dev_priv->irq_mask);
 990 }
 991
 992 static int
 993 bsd_ring_flush(struct i915_request *rq, u32 mode)
 994 {
 995         u32 *cs;
 996
 997         cs = intel_ring_begin(rq, 2);
 998         if (IS_ERR(cs))
 999                 return PTR_ERR(cs);
1000
1001         *cs++ = MI_FLUSH;
1002         *cs++ = MI_NOOP;
1003         intel_ring_advance(rq, cs);
1004         return 0;
1005 }
1006
1007 static void
1008 gen6_irq_enable(struct intel_engine_cs *engine)
1009 {
1010         ENGINE_WRITE(engine, RING_IMR,
1011                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
1012
1013         /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1014         ENGINE_POSTING_READ(engine, RING_IMR);
1015
1016         gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
1017 }
1018
1019 static void
1020 gen6_irq_disable(struct intel_engine_cs *engine)
1021 {
1022         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
1023         gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
1024 }
1025
1026 static void
1027 hsw_vebox_irq_enable(struct intel_engine_cs *engine)
1028 {
1029         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
1030
1031         /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1032         ENGINE_POSTING_READ(engine, RING_IMR);
1033
1034         gen6_unmask_pm_irq(engine->i915, engine->irq_enable_mask);
1035 }
1036
1037 static void
1038 hsw_vebox_irq_disable(struct intel_engine_cs *engine)
1039 {
1040         ENGINE_WRITE(engine, RING_IMR, ~0);
1041         gen6_mask_pm_irq(engine->i915, engine->irq_enable_mask);
1042 }
1043
1044 static int
1045 i965_emit_bb_start(struct i915_request *rq,
1046                    u64 offset, u32 length,
1047                    unsigned int dispatch_flags)
1048 {
1049         u32 *cs;
1050
1051         cs = intel_ring_begin(rq, 2);
1052         if (IS_ERR(cs))
1053                 return PTR_ERR(cs);
1054
1055         *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
1056                 I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
1057         *cs++ = offset;
1058         intel_ring_advance(rq, cs);
1059
1060         return 0;
1061 }
1062
1063 /* Just userspace ABI convention to limit the wa batch bo to a resonable size */
1064 #define I830_BATCH_LIMIT SZ_256K
1065 #define I830_TLB_ENTRIES (2)
1066 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
1067 static int
1068 i830_emit_bb_start(struct i915_request *rq,
1069                    u64 offset, u32 len,
1070                    unsigned int dispatch_flags)
1071 {
1072         u32 *cs, cs_offset = i915_scratch_offset(rq->i915);
1073
1074         GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE);
1075
1076         cs = intel_ring_begin(rq, 6);
1077         if (IS_ERR(cs))
1078                 return PTR_ERR(cs);
1079
1080         /* Evict the invalid PTE TLBs */
1081         *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
1082         *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
1083         *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
1084         *cs++ = cs_offset;
1085         *cs++ = 0xdeadbeef;
1086         *cs++ = MI_NOOP;
1087         intel_ring_advance(rq, cs);
1088
1089         if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1090                 if (len > I830_BATCH_LIMIT)
1091                         return -ENOSPC;
1092
1093                 cs = intel_ring_begin(rq, 6 + 2);
1094                 if (IS_ERR(cs))
1095                         return PTR_ERR(cs);
1096
1097                 /* Blit the batch (which has now all relocs applied) to the
1098                  * stable batch scratch bo area (so that the CS never
1099                  * stumbles over its tlb invalidation bug) ...
1100                  */
1101                 *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
1102                 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
1103                 *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
1104                 *cs++ = cs_offset;
1105                 *cs++ = 4096;
1106                 *cs++ = offset;
1107
1108                 *cs++ = MI_FLUSH;
1109                 *cs++ = MI_NOOP;
1110                 intel_ring_advance(rq, cs);
1111
1112                 /* ... and execute it. */
1113                 offset = cs_offset;
1114         }
1115
1116         cs = intel_ring_begin(rq, 2);
1117         if (IS_ERR(cs))
1118                 return PTR_ERR(cs);
1119
1120         *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1121         *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1122                 MI_BATCH_NON_SECURE);
1123         intel_ring_advance(rq, cs);
1124
1125         return 0;
1126 }
1127
1128 static int
1129 i915_emit_bb_start(struct i915_request *rq,
1130                    u64 offset, u32 len,
1131                    unsigned int dispatch_flags)
1132 {
1133         u32 *cs;
1134
1135         cs = intel_ring_begin(rq, 2);
1136         if (IS_ERR(cs))
1137                 return PTR_ERR(cs);
1138
1139         *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1140         *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1141                 MI_BATCH_NON_SECURE);
1142         intel_ring_advance(rq, cs);
1143
1144         return 0;
1145 }
1146
1147 int intel_ring_pin(struct intel_ring *ring)
1148 {
1149         struct i915_vma *vma = ring->vma;
1150         enum i915_map_type map = i915_coherent_map_type(vma->vm->i915);
1151         unsigned int flags;
1152         void *addr;
1153         int ret;
1154
1155         GEM_BUG_ON(ring->vaddr);
1156
1157         ret = i915_timeline_pin(ring->timeline);
1158         if (ret)
1159                 return ret;
1160
1161         flags = PIN_GLOBAL;
1162
1163         /* Ring wraparound at offset 0 sometimes hangs. No idea why. */
1164         flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
1165
1166         if (vma->obj->stolen)
1167                 flags |= PIN_MAPPABLE;
1168         else
1169                 flags |= PIN_HIGH;
1170
1171         ret = i915_vma_pin(vma, 0, 0, flags);
1172         if (unlikely(ret))
1173                 goto unpin_timeline;
1174
1175         if (i915_vma_is_map_and_fenceable(vma))
1176                 addr = (void __force *)i915_vma_pin_iomap(vma);
1177         else
1178                 addr = i915_gem_object_pin_map(vma->obj, map);
1179         if (IS_ERR(addr)) {
1180                 ret = PTR_ERR(addr);
1181                 goto unpin_ring;
1182         }
1183
1184         vma->obj->pin_global++;
1185
1186         ring->vaddr = addr;
1187         return 0;
1188
1189 unpin_ring:
1190         i915_vma_unpin(vma);
1191 unpin_timeline:
1192         i915_timeline_unpin(ring->timeline);
1193         return ret;
1194 }
1195
1196 void intel_ring_reset(struct intel_ring *ring, u32 tail)
1197 {
1198         GEM_BUG_ON(!intel_ring_offset_valid(ring, tail));
1199
1200         ring->tail = tail;
1201         ring->head = tail;
1202         ring->emit = tail;
1203         intel_ring_update_space(ring);
1204 }
1205
1206 void intel_ring_unpin(struct intel_ring *ring)
1207 {
1208         GEM_BUG_ON(!ring->vma);
1209         GEM_BUG_ON(!ring->vaddr);
1210
1211         /* Discard any unused bytes beyond that submitted to hw. */
1212         intel_ring_reset(ring, ring->tail);
1213
1214         if (i915_vma_is_map_and_fenceable(ring->vma))
1215                 i915_vma_unpin_iomap(ring->vma);
1216         else
1217                 i915_gem_object_unpin_map(ring->vma->obj);
1218         ring->vaddr = NULL;
1219
1220         ring->vma->obj->pin_global--;
1221         i915_vma_unpin(ring->vma);
1222
1223         i915_timeline_unpin(ring->timeline);
1224 }
1225
1226 static struct i915_vma *
1227 intel_ring_create_vma(struct drm_i915_private *dev_priv, int size)
1228 {
1229         struct i915_address_space *vm = &dev_priv->ggtt.vm;
1230         struct drm_i915_gem_object *obj;
1231         struct i915_vma *vma;
1232
1233         obj = i915_gem_object_create_stolen(dev_priv, size);
1234         if (!obj)
1235                 obj = i915_gem_object_create_internal(dev_priv, size);
1236         if (IS_ERR(obj))
1237                 return ERR_CAST(obj);
1238
1239         /*
1240          * Mark ring buffers as read-only from GPU side (so no stray overwrites)
1241          * if supported by the platform's GGTT.
1242          */
1243         if (vm->has_read_only)
1244                 i915_gem_object_set_readonly(obj);
1245
1246         vma = i915_vma_instance(obj, vm, NULL);
1247         if (IS_ERR(vma))
1248                 goto err;
1249
1250         return vma;
1251
1252 err:
1253         i915_gem_object_put(obj);
1254         return vma;
1255 }
1256
1257 struct intel_ring *
1258 intel_engine_create_ring(struct intel_engine_cs *engine,
1259                          struct i915_timeline *timeline,
1260                          int size)
1261 {
1262         struct intel_ring *ring;
1263         struct i915_vma *vma;
1264
1265         GEM_BUG_ON(!is_power_of_2(size));
1266         GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
1267         GEM_BUG_ON(timeline == &engine->timeline);
1268         lockdep_assert_held(&engine->i915->drm.struct_mutex);
1269
1270         ring = kzalloc(sizeof(*ring), GFP_KERNEL);
1271         if (!ring)
1272                 return ERR_PTR(-ENOMEM);
1273
1274         kref_init(&ring->ref);
1275         INIT_LIST_HEAD(&ring->request_list);
1276         ring->timeline = i915_timeline_get(timeline);
1277
1278         ring->size = size;
1279         /* Workaround an erratum on the i830 which causes a hang if
1280          * the TAIL pointer points to within the last 2 cachelines
1281          * of the buffer.
1282          */
1283         ring->effective_size = size;
1284         if (IS_I830(engine->i915) || IS_I845G(engine->i915))
1285                 ring->effective_size -= 2 * CACHELINE_BYTES;
1286
1287         intel_ring_update_space(ring);
1288
1289         vma = intel_ring_create_vma(engine->i915, size);
1290         if (IS_ERR(vma)) {
1291                 kfree(ring);
1292                 return ERR_CAST(vma);
1293         }
1294         ring->vma = vma;
1295
1296         return ring;
1297 }
1298
1299 void intel_ring_free(struct kref *ref)
1300 {
1301         struct intel_ring *ring = container_of(ref, typeof(*ring), ref);
1302         struct drm_i915_gem_object *obj = ring->vma->obj;
1303
1304         i915_vma_close(ring->vma);
1305         __i915_gem_object_release_unless_active(obj);
1306
1307         i915_timeline_put(ring->timeline);
1308         kfree(ring);
1309 }
1310
1311 static void __ring_context_fini(struct intel_context *ce)
1312 {
1313         GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
1314         i915_gem_object_put(ce->state->obj);
1315 }
1316
1317 static void ring_context_destroy(struct kref *ref)
1318 {
1319         struct intel_context *ce = container_of(ref, typeof(*ce), ref);
1320
1321         GEM_BUG_ON(intel_context_is_pinned(ce));
1322
1323         if (ce->state)
1324                 __ring_context_fini(ce);
1325
1326         intel_context_free(ce);
1327 }
1328
1329 static int __context_pin_ppgtt(struct i915_gem_context *ctx)
1330 {
1331         struct i915_hw_ppgtt *ppgtt;
1332         int err = 0;
1333
1334         ppgtt = ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt;
1335         if (ppgtt)
1336                 err = gen6_ppgtt_pin(ppgtt);
1337
1338         return err;
1339 }
1340
1341 static void __context_unpin_ppgtt(struct i915_gem_context *ctx)
1342 {
1343         struct i915_hw_ppgtt *ppgtt;
1344
1345         ppgtt = ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt;
1346         if (ppgtt)
1347                 gen6_ppgtt_unpin(ppgtt);
1348 }
1349
1350 static int __context_pin(struct intel_context *ce)
1351 {
1352         struct i915_vma *vma;
1353         int err;
1354
1355         vma = ce->state;
1356         if (!vma)
1357                 return 0;
1358
1359         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
1360         if (err)
1361                 return err;
1362
1363         /*
1364          * And mark is as a globally pinned object to let the shrinker know
1365          * it cannot reclaim the object until we release it.
1366          */
1367         vma->obj->pin_global++;
1368         vma->obj->mm.dirty = true;
1369
1370         return 0;
1371 }
1372
1373 static void __context_unpin(struct intel_context *ce)
1374 {
1375         struct i915_vma *vma;
1376
1377         vma = ce->state;
1378         if (!vma)
1379                 return;
1380
1381         vma->obj->pin_global--;
1382         i915_vma_unpin(vma);
1383 }
1384
1385 static void ring_context_unpin(struct intel_context *ce)
1386 {
1387         __context_unpin_ppgtt(ce->gem_context);
1388         __context_unpin(ce);
1389 }
1390
1391 static struct i915_vma *
1392 alloc_context_vma(struct intel_engine_cs *engine)
1393 {
1394         struct drm_i915_private *i915 = engine->i915;
1395         struct drm_i915_gem_object *obj;
1396         struct i915_vma *vma;
1397         int err;
1398
1399         obj = i915_gem_object_create_shmem(i915, engine->context_size);
1400         if (IS_ERR(obj))
1401                 return ERR_CAST(obj);
1402
1403         /*
1404          * Try to make the context utilize L3 as well as LLC.
1405          *
1406          * On VLV we don't have L3 controls in the PTEs so we
1407          * shouldn't touch the cache level, especially as that
1408          * would make the object snooped which might have a
1409          * negative performance impact.
1410          *
1411          * Snooping is required on non-llc platforms in execlist
1412          * mode, but since all GGTT accesses use PAT entry 0 we
1413          * get snooping anyway regardless of cache_level.
1414          *
1415          * This is only applicable for Ivy Bridge devices since
1416          * later platforms don't have L3 control bits in the PTE.
1417          */
1418         if (IS_IVYBRIDGE(i915))
1419                 i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC);
1420
1421         if (engine->default_state) {
1422                 void *defaults, *vaddr;
1423
1424                 vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
1425                 if (IS_ERR(vaddr)) {
1426                         err = PTR_ERR(vaddr);
1427                         goto err_obj;
1428                 }
1429
1430                 defaults = i915_gem_object_pin_map(engine->default_state,
1431                                                    I915_MAP_WB);
1432                 if (IS_ERR(defaults)) {
1433                         err = PTR_ERR(defaults);
1434                         goto err_map;
1435                 }
1436
1437                 memcpy(vaddr, defaults, engine->context_size);
1438                 i915_gem_object_unpin_map(engine->default_state);
1439
1440                 i915_gem_object_flush_map(obj);
1441                 i915_gem_object_unpin_map(obj);
1442         }
1443
1444         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
1445         if (IS_ERR(vma)) {
1446                 err = PTR_ERR(vma);
1447                 goto err_obj;
1448         }
1449
1450         return vma;
1451
1452 err_map:
1453         i915_gem_object_unpin_map(obj);
1454 err_obj:
1455         i915_gem_object_put(obj);
1456         return ERR_PTR(err);
1457 }
1458
1459 static int ring_context_pin(struct intel_context *ce)
1460 {
1461         struct intel_engine_cs *engine = ce->engine;
1462         int err;
1463
1464         /* One ringbuffer to rule them all */
1465         GEM_BUG_ON(!engine->buffer);
1466         ce->ring = engine->buffer;
1467
1468         if (!ce->state && engine->context_size) {
1469                 struct i915_vma *vma;
1470
1471                 vma = alloc_context_vma(engine);
1472                 if (IS_ERR(vma))
1473                         return PTR_ERR(vma);
1474
1475                 ce->state = vma;
1476         }
1477
1478         err = __context_pin(ce);
1479         if (err)
1480                 return err;
1481
1482         err = __context_pin_ppgtt(ce->gem_context);
1483         if (err)
1484                 goto err_unpin;
1485
1486         return 0;
1487
1488 err_unpin:
1489         __context_unpin(ce);
1490         return err;
1491 }
1492
1493 static void ring_context_reset(struct intel_context *ce)
1494 {
1495         intel_ring_reset(ce->ring, 0);
1496 }
1497
1498 static const struct intel_context_ops ring_context_ops = {
1499         .pin = ring_context_pin,
1500         .unpin = ring_context_unpin,
1501
1502         .enter = intel_context_enter_engine,
1503         .exit = intel_context_exit_engine,
1504
1505         .reset = ring_context_reset,
1506         .destroy = ring_context_destroy,
1507 };
1508
1509 static int load_pd_dir(struct i915_request *rq,
1510                        const struct i915_hw_ppgtt *ppgtt)
1511 {
1512         const struct intel_engine_cs * const engine = rq->engine;
1513         u32 *cs;
1514
1515         cs = intel_ring_begin(rq, 6);
1516         if (IS_ERR(cs))
1517                 return PTR_ERR(cs);
1518
1519         *cs++ = MI_LOAD_REGISTER_IMM(1);
1520         *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
1521         *cs++ = PP_DIR_DCLV_2G;
1522
1523         *cs++ = MI_LOAD_REGISTER_IMM(1);
1524         *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1525         *cs++ = ppgtt->pd.base.ggtt_offset << 10;
1526
1527         intel_ring_advance(rq, cs);
1528
1529         return 0;
1530 }
1531
1532 static int flush_pd_dir(struct i915_request *rq)
1533 {
1534         const struct intel_engine_cs * const engine = rq->engine;
1535         u32 *cs;
1536
1537         cs = intel_ring_begin(rq, 4);
1538         if (IS_ERR(cs))
1539                 return PTR_ERR(cs);
1540
1541         /* Stall until the page table load is complete */
1542         *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1543         *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1544         *cs++ = i915_scratch_offset(rq->i915);
1545         *cs++ = MI_NOOP;
1546
1547         intel_ring_advance(rq, cs);
1548         return 0;
1549 }
1550
1551 static inline int mi_set_context(struct i915_request *rq, u32 flags)
1552 {
1553         struct drm_i915_private *i915 = rq->i915;
1554         struct intel_engine_cs *engine = rq->engine;
1555         enum intel_engine_id id;
1556         const int num_engines =
1557                 IS_HSW_GT1(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0;
1558         bool force_restore = false;
1559         int len;
1560         u32 *cs;
1561
1562         flags |= MI_MM_SPACE_GTT;
1563         if (IS_HASWELL(i915))
1564                 /* These flags are for resource streamer on HSW+ */
1565                 flags |= HSW_MI_RS_SAVE_STATE_EN | HSW_MI_RS_RESTORE_STATE_EN;
1566         else
1567                 /* We need to save the extended state for powersaving modes */
1568                 flags |= MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN;
1569
1570         len = 4;
1571         if (IS_GEN(i915, 7))
1572                 len += 2 + (num_engines ? 4 * num_engines + 6 : 0);
1573         else if (IS_GEN(i915, 5))
1574                 len += 2;
1575         if (flags & MI_FORCE_RESTORE) {
1576                 GEM_BUG_ON(flags & MI_RESTORE_INHIBIT);
1577                 flags &= ~MI_FORCE_RESTORE;
1578                 force_restore = true;
1579                 len += 2;
1580         }
1581
1582         cs = intel_ring_begin(rq, len);
1583         if (IS_ERR(cs))
1584                 return PTR_ERR(cs);
1585
1586         /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
1587         if (IS_GEN(i915, 7)) {
1588                 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1589                 if (num_engines) {
1590                         struct intel_engine_cs *signaller;
1591
1592                         *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1593                         for_each_engine(signaller, i915, id) {
1594                                 if (signaller == engine)
1595                                         continue;
1596
1597                                 *cs++ = i915_mmio_reg_offset(
1598                                            RING_PSMI_CTL(signaller->mmio_base));
1599                                 *cs++ = _MASKED_BIT_ENABLE(
1600                                                 GEN6_PSMI_SLEEP_MSG_DISABLE);
1601                         }
1602                 }
1603         } else if (IS_GEN(i915, 5)) {
1604                 /*
1605                  * This w/a is only listed for pre-production ilk a/b steppings,
1606                  * but is also mentioned for programming the powerctx. To be
1607                  * safe, just apply the workaround; we do not use SyncFlush so
1608                  * this should never take effect and so be a no-op!
1609                  */
1610                 *cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN;
1611         }
1612
1613         if (force_restore) {
1614                 /*
1615                  * The HW doesn't handle being told to restore the current
1616                  * context very well. Quite often it likes goes to go off and
1617                  * sulk, especially when it is meant to be reloading PP_DIR.
1618                  * A very simple fix to force the reload is to simply switch
1619                  * away from the current context and back again.
1620                  *
1621                  * Note that the kernel_context will contain random state
1622                  * following the INHIBIT_RESTORE. We accept this since we
1623                  * never use the kernel_context state; it is merely a
1624                  * placeholder we use to flush other contexts.
1625                  */
1626                 *cs++ = MI_SET_CONTEXT;
1627                 *cs++ = i915_ggtt_offset(engine->kernel_context->state) |
1628                         MI_MM_SPACE_GTT |
1629                         MI_RESTORE_INHIBIT;
1630         }
1631
1632         *cs++ = MI_NOOP;
1633         *cs++ = MI_SET_CONTEXT;
1634         *cs++ = i915_ggtt_offset(rq->hw_context->state) | flags;
1635         /*
1636          * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
1637          * WaMiSetContext_Hang:snb,ivb,vlv
1638          */
1639         *cs++ = MI_NOOP;
1640
1641         if (IS_GEN(i915, 7)) {
1642                 if (num_engines) {
1643                         struct intel_engine_cs *signaller;
1644                         i915_reg_t last_reg = {}; /* keep gcc quiet */
1645
1646                         *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1647                         for_each_engine(signaller, i915, id) {
1648                                 if (signaller == engine)
1649                                         continue;
1650
1651                                 last_reg = RING_PSMI_CTL(signaller->mmio_base);
1652                                 *cs++ = i915_mmio_reg_offset(last_reg);
1653                                 *cs++ = _MASKED_BIT_DISABLE(
1654                                                 GEN6_PSMI_SLEEP_MSG_DISABLE);
1655                         }
1656
1657                         /* Insert a delay before the next switch! */
1658                         *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1659                         *cs++ = i915_mmio_reg_offset(last_reg);
1660                         *cs++ = i915_scratch_offset(rq->i915);
1661                         *cs++ = MI_NOOP;
1662                 }
1663                 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1664         } else if (IS_GEN(i915, 5)) {
1665                 *cs++ = MI_SUSPEND_FLUSH;
1666         }
1667
1668         intel_ring_advance(rq, cs);
1669
1670         return 0;
1671 }
1672
1673 static int remap_l3(struct i915_request *rq, int slice)
1674 {
1675         u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice];
1676         int i;
1677
1678         if (!remap_info)
1679                 return 0;
1680
1681         cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2);
1682         if (IS_ERR(cs))
1683                 return PTR_ERR(cs);
1684
1685         /*
1686          * Note: We do not worry about the concurrent register cacheline hang
1687          * here because no other code should access these registers other than
1688          * at initialization time.
1689          */
1690         *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
1691         for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
1692                 *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
1693                 *cs++ = remap_info[i];
1694         }
1695         *cs++ = MI_NOOP;
1696         intel_ring_advance(rq, cs);
1697
1698         return 0;
1699 }
1700
1701 static int switch_context(struct i915_request *rq)
1702 {
1703         struct intel_engine_cs *engine = rq->engine;
1704         struct i915_gem_context *ctx = rq->gem_context;
1705         struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
1706         unsigned int unwind_mm = 0;
1707         u32 hw_flags = 0;
1708         int ret, i;
1709
1710         GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
1711
1712         if (ppgtt) {
1713                 int loops;
1714
1715                 /*
1716                  * Baytail takes a little more convincing that it really needs
1717                  * to reload the PD between contexts. It is not just a little
1718                  * longer, as adding more stalls after the load_pd_dir (i.e.
1719                  * adding a long loop around flush_pd_dir) is not as effective
1720                  * as reloading the PD umpteen times. 32 is derived from
1721                  * experimentation (gem_exec_parallel/fds) and has no good
1722                  * explanation.
1723                  */
1724                 loops = 1;
1725                 if (engine->id == BCS0 && IS_VALLEYVIEW(engine->i915))
1726                         loops = 32;
1727
1728                 do {
1729                         ret = load_pd_dir(rq, ppgtt);
1730                         if (ret)
1731                                 goto err;
1732                 } while (--loops);
1733
1734                 if (ppgtt->pd_dirty_engines & engine->mask) {
1735                         unwind_mm = engine->mask;
1736                         ppgtt->pd_dirty_engines &= ~unwind_mm;
1737                         hw_flags = MI_FORCE_RESTORE;
1738                 }
1739         }
1740
1741         if (rq->hw_context->state) {
1742                 GEM_BUG_ON(engine->id != RCS0);
1743
1744                 /*
1745                  * The kernel context(s) is treated as pure scratch and is not
1746                  * expected to retain any state (as we sacrifice it during
1747                  * suspend and on resume it may be corrupted). This is ok,
1748                  * as nothing actually executes using the kernel context; it
1749                  * is purely used for flushing user contexts.
1750                  */
1751                 if (i915_gem_context_is_kernel(ctx))
1752                         hw_flags = MI_RESTORE_INHIBIT;
1753
1754                 ret = mi_set_context(rq, hw_flags);
1755                 if (ret)
1756                         goto err_mm;
1757         }
1758
1759         if (ppgtt) {
1760                 ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1761                 if (ret)
1762                         goto err_mm;
1763
1764                 ret = flush_pd_dir(rq);
1765                 if (ret)
1766                         goto err_mm;
1767
1768                 /*
1769                  * Not only do we need a full barrier (post-sync write) after
1770                  * invalidating the TLBs, but we need to wait a little bit
1771                  * longer. Whether this is merely delaying us, or the
1772                  * subsequent flush is a key part of serialising with the
1773                  * post-sync op, this extra pass appears vital before a
1774                  * mm switch!
1775                  */
1776                 ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1777                 if (ret)
1778                         goto err_mm;
1779
1780                 ret = engine->emit_flush(rq, EMIT_FLUSH);
1781                 if (ret)
1782                         goto err_mm;
1783         }
1784
1785         if (ctx->remap_slice) {
1786                 for (i = 0; i < MAX_L3_SLICES; i++) {
1787                         if (!(ctx->remap_slice & BIT(i)))
1788                                 continue;
1789
1790                         ret = remap_l3(rq, i);
1791                         if (ret)
1792                                 goto err_mm;
1793                 }
1794
1795                 ctx->remap_slice = 0;
1796         }
1797
1798         return 0;
1799
1800 err_mm:
1801         if (unwind_mm)
1802                 ppgtt->pd_dirty_engines |= unwind_mm;
1803 err:
1804         return ret;
1805 }
1806
1807 static int ring_request_alloc(struct i915_request *request)
1808 {
1809         int ret;
1810
1811         GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1812         GEM_BUG_ON(request->timeline->has_initial_breadcrumb);
1813
1814         /*
1815          * Flush enough space to reduce the likelihood of waiting after
1816          * we start building the request - in which case we will just
1817          * have to repeat work.
1818          */
1819         request->reserved_space += LEGACY_REQUEST_SIZE;
1820
1821         /* Unconditionally invalidate GPU caches and TLBs. */
1822         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1823         if (ret)
1824                 return ret;
1825
1826         ret = switch_context(request);
1827         if (ret)
1828                 return ret;
1829
1830         request->reserved_space -= LEGACY_REQUEST_SIZE;
1831         return 0;
1832 }
1833
1834 static noinline int wait_for_space(struct intel_ring *ring, unsigned int bytes)
1835 {
1836         struct i915_request *target;
1837         long timeout;
1838
1839         if (intel_ring_update_space(ring) >= bytes)
1840                 return 0;
1841
1842         GEM_BUG_ON(list_empty(&ring->request_list));
1843         list_for_each_entry(target, &ring->request_list, ring_link) {
1844                 /* Would completion of this request free enough space? */
1845                 if (bytes <= __intel_ring_space(target->postfix,
1846                                                 ring->emit, ring->size))
1847                         break;
1848         }
1849
1850         if (WARN_ON(&target->ring_link == &ring->request_list))
1851                 return -ENOSPC;
1852
1853         timeout = i915_request_wait(target,
1854                                     I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
1855                                     MAX_SCHEDULE_TIMEOUT);
1856         if (timeout < 0)
1857                 return timeout;
1858
1859         i915_request_retire_upto(target);
1860
1861         intel_ring_update_space(ring);
1862         GEM_BUG_ON(ring->space < bytes);
1863         return 0;
1864 }
1865
1866 u32 *intel_ring_begin(struct i915_request *rq, unsigned int num_dwords)
1867 {
1868         struct intel_ring *ring = rq->ring;
1869         const unsigned int remain_usable = ring->effective_size - ring->emit;
1870         const unsigned int bytes = num_dwords * sizeof(u32);
1871         unsigned int need_wrap = 0;
1872         unsigned int total_bytes;
1873         u32 *cs;
1874
1875         /* Packets must be qword aligned. */
1876         GEM_BUG_ON(num_dwords & 1);
1877
1878         total_bytes = bytes + rq->reserved_space;
1879         GEM_BUG_ON(total_bytes > ring->effective_size);
1880
1881         if (unlikely(total_bytes > remain_usable)) {
1882                 const int remain_actual = ring->size - ring->emit;
1883
1884                 if (bytes > remain_usable) {
1885                         /*
1886                          * Not enough space for the basic request. So need to
1887                          * flush out the remainder and then wait for
1888                          * base + reserved.
1889                          */
1890                         total_bytes += remain_actual;
1891                         need_wrap = remain_actual | 1;
1892                 } else  {
1893                         /*
1894                          * The base request will fit but the reserved space
1895                          * falls off the end. So we don't need an immediate
1896                          * wrap and only need to effectively wait for the
1897                          * reserved size from the start of ringbuffer.
1898                          */
1899                         total_bytes = rq->reserved_space + remain_actual;
1900                 }
1901         }
1902
1903         if (unlikely(total_bytes > ring->space)) {
1904                 int ret;
1905
1906                 /*
1907                  * Space is reserved in the ringbuffer for finalising the
1908                  * request, as that cannot be allowed to fail. During request
1909                  * finalisation, reserved_space is set to 0 to stop the
1910                  * overallocation and the assumption is that then we never need
1911                  * to wait (which has the risk of failing with EINTR).
1912                  *
1913                  * See also i915_request_alloc() and i915_request_add().
1914                  */
1915                 GEM_BUG_ON(!rq->reserved_space);
1916
1917                 ret = wait_for_space(ring, total_bytes);
1918                 if (unlikely(ret))
1919                         return ERR_PTR(ret);
1920         }
1921
1922         if (unlikely(need_wrap)) {
1923                 need_wrap &= ~1;
1924                 GEM_BUG_ON(need_wrap > ring->space);
1925                 GEM_BUG_ON(ring->emit + need_wrap > ring->size);
1926                 GEM_BUG_ON(!IS_ALIGNED(need_wrap, sizeof(u64)));
1927
1928                 /* Fill the tail with MI_NOOP */
1929                 memset64(ring->vaddr + ring->emit, 0, need_wrap / sizeof(u64));
1930                 ring->space -= need_wrap;
1931                 ring->emit = 0;
1932         }
1933
1934         GEM_BUG_ON(ring->emit > ring->size - bytes);
1935         GEM_BUG_ON(ring->space < bytes);
1936         cs = ring->vaddr + ring->emit;
1937         GEM_DEBUG_EXEC(memset32(cs, POISON_INUSE, bytes / sizeof(*cs)));
1938         ring->emit += bytes;
1939         ring->space -= bytes;
1940
1941         return cs;
1942 }
1943
1944 /* Align the ring tail to a cacheline boundary */
1945 int intel_ring_cacheline_align(struct i915_request *rq)
1946 {
1947         int num_dwords;
1948         void *cs;
1949
1950         num_dwords = (rq->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(u32);
1951         if (num_dwords == 0)
1952                 return 0;
1953
1954         num_dwords = CACHELINE_DWORDS - num_dwords;
1955         GEM_BUG_ON(num_dwords & 1);
1956
1957         cs = intel_ring_begin(rq, num_dwords);
1958         if (IS_ERR(cs))
1959                 return PTR_ERR(cs);
1960
1961         memset64(cs, (u64)MI_NOOP << 32 | MI_NOOP, num_dwords / 2);
1962         intel_ring_advance(rq, cs);
1963
1964         GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1));
1965         return 0;
1966 }
1967
1968 static void gen6_bsd_submit_request(struct i915_request *request)
1969 {
1970         struct intel_uncore *uncore = request->engine->uncore;
1971
1972         intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
1973
1974        /* Every tail move must follow the sequence below */
1975
1976         /* Disable notification that the ring is IDLE. The GT
1977          * will then assume that it is busy and bring it out of rc6.
1978          */
1979         intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1980                               _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1981
1982         /* Clear the context id. Here be magic! */
1983         intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0);
1984
1985         /* Wait for the ring not to be idle, i.e. for it to wake up. */
1986         if (__intel_wait_for_register_fw(uncore,
1987                                          GEN6_BSD_SLEEP_PSMI_CONTROL,
1988                                          GEN6_BSD_SLEEP_INDICATOR,
1989                                          0,
1990                                          1000, 0, NULL))
1991                 DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
1992
1993         /* Now that the ring is fully powered up, update the tail */
1994         i9xx_submit_request(request);
1995
1996         /* Let the ring send IDLE messages to the GT again,
1997          * and so let it sleep to conserve power when idle.
1998          */
1999         intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
2000                               _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2001
2002         intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
2003 }
2004
2005 static int mi_flush_dw(struct i915_request *rq, u32 flags)
2006 {
2007         u32 cmd, *cs;
2008
2009         cs = intel_ring_begin(rq, 4);
2010         if (IS_ERR(cs))
2011                 return PTR_ERR(cs);
2012
2013         cmd = MI_FLUSH_DW;
2014
2015         /*
2016          * We always require a command barrier so that subsequent
2017          * commands, such as breadcrumb interrupts, are strictly ordered
2018          * wrt the contents of the write cache being flushed to memory
2019          * (and thus being coherent from the CPU).
2020          */
2021         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2022
2023         /*
2024          * Bspec vol 1c.3 - blitter engine command streamer:
2025          * "If ENABLED, all TLBs will be invalidated once the flush
2026          * operation is complete. This bit is only valid when the
2027          * Post-Sync Operation field is a value of 1h or 3h."
2028          */
2029         cmd |= flags;
2030
2031         *cs++ = cmd;
2032         *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2033         *cs++ = 0;
2034         *cs++ = MI_NOOP;
2035
2036         intel_ring_advance(rq, cs);
2037
2038         return 0;
2039 }
2040
2041 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
2042 {
2043         return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
2044 }
2045
2046 static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
2047 {
2048         return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
2049 }
2050
2051 static int
2052 hsw_emit_bb_start(struct i915_request *rq,
2053                   u64 offset, u32 len,
2054                   unsigned int dispatch_flags)
2055 {
2056         u32 *cs;
2057
2058         cs = intel_ring_begin(rq, 2);
2059         if (IS_ERR(cs))
2060                 return PTR_ERR(cs);
2061
2062         *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2063                 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
2064         /* bit0-7 is the length on GEN6+ */
2065         *cs++ = offset;
2066         intel_ring_advance(rq, cs);
2067
2068         return 0;
2069 }
2070
2071 static int
2072 gen6_emit_bb_start(struct i915_request *rq,
2073                    u64 offset, u32 len,
2074                    unsigned int dispatch_flags)
2075 {
2076         u32 *cs;
2077
2078         cs = intel_ring_begin(rq, 2);
2079         if (IS_ERR(cs))
2080                 return PTR_ERR(cs);
2081
2082         *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2083                 0 : MI_BATCH_NON_SECURE_I965);
2084         /* bit0-7 is the length on GEN6+ */
2085         *cs++ = offset;
2086         intel_ring_advance(rq, cs);
2087
2088         return 0;
2089 }
2090
2091 /* Blitter support (SandyBridge+) */
2092
2093 static int gen6_ring_flush(struct i915_request *rq, u32 mode)
2094 {
2095         return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
2096 }
2097
2098 static void i9xx_set_default_submission(struct intel_engine_cs *engine)
2099 {
2100         engine->submit_request = i9xx_submit_request;
2101         engine->cancel_requests = cancel_requests;
2102
2103         engine->park = NULL;
2104         engine->unpark = NULL;
2105 }
2106
2107 static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
2108 {
2109         i9xx_set_default_submission(engine);
2110         engine->submit_request = gen6_bsd_submit_request;
2111 }
2112
2113 static void ring_destroy(struct intel_engine_cs *engine)
2114 {
2115         struct drm_i915_private *dev_priv = engine->i915;
2116
2117         WARN_ON(INTEL_GEN(dev_priv) > 2 &&
2118                 (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
2119
2120         intel_ring_unpin(engine->buffer);
2121         intel_ring_put(engine->buffer);
2122
2123         intel_engine_cleanup_common(engine);
2124         kfree(engine);
2125 }
2126
2127 static void setup_irq(struct intel_engine_cs *engine)
2128 {
2129         struct drm_i915_private *i915 = engine->i915;
2130
2131         if (INTEL_GEN(i915) >= 6) {
2132                 engine->irq_enable = gen6_irq_enable;
2133                 engine->irq_disable = gen6_irq_disable;
2134         } else if (INTEL_GEN(i915) >= 5) {
2135                 engine->irq_enable = gen5_irq_enable;
2136                 engine->irq_disable = gen5_irq_disable;
2137         } else if (INTEL_GEN(i915) >= 3) {
2138                 engine->irq_enable = i9xx_irq_enable;
2139                 engine->irq_disable = i9xx_irq_disable;
2140         } else {
2141                 engine->irq_enable = i8xx_irq_enable;
2142                 engine->irq_disable = i8xx_irq_disable;
2143         }
2144 }
2145
2146 static void setup_common(struct intel_engine_cs *engine)
2147 {
2148         struct drm_i915_private *i915 = engine->i915;
2149
2150         /* gen8+ are only supported with execlists */
2151         GEM_BUG_ON(INTEL_GEN(i915) >= 8);
2152
2153         setup_irq(engine);
2154
2155         engine->destroy = ring_destroy;
2156
2157         engine->resume = xcs_resume;
2158         engine->reset.prepare = reset_prepare;
2159         engine->reset.reset = reset_ring;
2160         engine->reset.finish = reset_finish;
2161
2162         engine->cops = &ring_context_ops;
2163         engine->request_alloc = ring_request_alloc;
2164
2165         /*
2166          * Using a global execution timeline; the previous final breadcrumb is
2167          * equivalent to our next initial bread so we can elide
2168          * engine->emit_init_breadcrumb().
2169          */
2170         engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
2171         if (IS_GEN(i915, 5))
2172                 engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
2173
2174         engine->set_default_submission = i9xx_set_default_submission;
2175
2176         if (INTEL_GEN(i915) >= 6)
2177                 engine->emit_bb_start = gen6_emit_bb_start;
2178         else if (INTEL_GEN(i915) >= 4)
2179                 engine->emit_bb_start = i965_emit_bb_start;
2180         else if (IS_I830(i915) || IS_I845G(i915))
2181                 engine->emit_bb_start = i830_emit_bb_start;
2182         else
2183                 engine->emit_bb_start = i915_emit_bb_start;
2184 }
2185
2186 static void setup_rcs(struct intel_engine_cs *engine)
2187 {
2188         struct drm_i915_private *i915 = engine->i915;
2189
2190         if (HAS_L3_DPF(i915))
2191                 engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2192
2193         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
2194
2195         if (INTEL_GEN(i915) >= 7) {
2196                 engine->init_context = intel_rcs_ctx_init;
2197                 engine->emit_flush = gen7_render_ring_flush;
2198                 engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
2199         } else if (IS_GEN(i915, 6)) {
2200                 engine->init_context = intel_rcs_ctx_init;
2201                 engine->emit_flush = gen6_render_ring_flush;
2202                 engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
2203         } else if (IS_GEN(i915, 5)) {
2204                 engine->emit_flush = gen4_render_ring_flush;
2205         } else {
2206                 if (INTEL_GEN(i915) < 4)
2207                         engine->emit_flush = gen2_render_ring_flush;
2208                 else
2209                         engine->emit_flush = gen4_render_ring_flush;
2210                 engine->irq_enable_mask = I915_USER_INTERRUPT;
2211         }
2212
2213         if (IS_HASWELL(i915))
2214                 engine->emit_bb_start = hsw_emit_bb_start;
2215
2216         engine->resume = rcs_resume;
2217 }
2218
2219 static void setup_vcs(struct intel_engine_cs *engine)
2220 {
2221         struct drm_i915_private *i915 = engine->i915;
2222
2223         if (INTEL_GEN(i915) >= 6) {
2224                 /* gen6 bsd needs a special wa for tail updates */
2225                 if (IS_GEN(i915, 6))
2226                         engine->set_default_submission = gen6_bsd_set_default_submission;
2227                 engine->emit_flush = gen6_bsd_ring_flush;
2228                 engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
2229
2230                 if (IS_GEN(i915, 6))
2231                         engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2232                 else
2233                         engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2234         } else {
2235                 engine->emit_flush = bsd_ring_flush;
2236                 if (IS_GEN(i915, 5))
2237                         engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
2238                 else
2239                         engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
2240         }
2241 }
2242
2243 static void setup_bcs(struct intel_engine_cs *engine)
2244 {
2245         struct drm_i915_private *i915 = engine->i915;
2246
2247         engine->emit_flush = gen6_ring_flush;
2248         engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
2249
2250         if (IS_GEN(i915, 6))
2251                 engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2252         else
2253                 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2254 }
2255
2256 static void setup_vecs(struct intel_engine_cs *engine)
2257 {
2258         struct drm_i915_private *i915 = engine->i915;
2259
2260         GEM_BUG_ON(INTEL_GEN(i915) < 7);
2261
2262         engine->emit_flush = gen6_ring_flush;
2263         engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
2264         engine->irq_enable = hsw_vebox_irq_enable;
2265         engine->irq_disable = hsw_vebox_irq_disable;
2266
2267         engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2268 }
2269
2270 int intel_ring_submission_setup(struct intel_engine_cs *engine)
2271 {
2272         setup_common(engine);
2273
2274         switch (engine->class) {
2275         case RENDER_CLASS:
2276                 setup_rcs(engine);
2277                 break;
2278         case VIDEO_DECODE_CLASS:
2279                 setup_vcs(engine);
2280                 break;
2281         case COPY_ENGINE_CLASS:
2282                 setup_bcs(engine);
2283                 break;
2284         case VIDEO_ENHANCEMENT_CLASS:
2285                 setup_vecs(engine);
2286                 break;
2287         default:
2288                 MISSING_CASE(engine->class);
2289                 return -ENODEV;
2290         }
2291
2292         return 0;
2293 }
2294
2295 int intel_ring_submission_init(struct intel_engine_cs *engine)
2296 {
2297         struct i915_timeline *timeline;
2298         struct intel_ring *ring;
2299         int err;
2300
2301         timeline = i915_timeline_create(engine->i915, engine->status_page.vma);
2302         if (IS_ERR(timeline)) {
2303                 err = PTR_ERR(timeline);
2304                 goto err;
2305         }
2306         GEM_BUG_ON(timeline->has_initial_breadcrumb);
2307
2308         ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
2309         i915_timeline_put(timeline);
2310         if (IS_ERR(ring)) {
2311                 err = PTR_ERR(ring);
2312                 goto err;
2313         }
2314
2315         err = intel_ring_pin(ring);
2316         if (err)
2317                 goto err_ring;
2318
2319         GEM_BUG_ON(engine->buffer);
2320         engine->buffer = ring;
2321
2322         err = intel_engine_init_common(engine);
2323         if (err)
2324                 goto err_unpin;
2325
2326         GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma);
2327
2328         return 0;
2329
2330 err_unpin:
2331         intel_ring_unpin(ring);
2332 err_ring:
2333         intel_ring_put(ring);
2334 err:
2335         intel_engine_cleanup_common(engine);
2336         return err;
2337 }