drm/i915: Move shmem object setup to its own file
drivers/gpu/drm/i915/gt/intel_ringbuffer.c
1 /*
2  * Copyright © 2008-2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *    Zou Nan hai <nanhai.zou@intel.com>
26  *    Xiang Hai hao <haihao.xiang@intel.com>
27  *
28  */
29
30 #include <linux/log2.h>
31
32 #include <drm/i915_drm.h>
33
34 #include "i915_drv.h"
35 #include "i915_gem_render_state.h"
36 #include "i915_trace.h"
37 #include "intel_reset.h"
38 #include "intel_workarounds.h"
39
40 /* Rough estimate of the typical request size, performing a flush,
41  * set-context and then emitting the batch.
42  */
43 #define LEGACY_REQUEST_SIZE 200
44
45 unsigned int intel_ring_update_space(struct intel_ring *ring)
46 {
47         unsigned int space;
48
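        /*
         * Free space runs from the last emitted dword back around to the
         * hardware HEAD, keeping a cacheline of slack so the two never
         * collide; the arithmetic itself lives in __intel_ring_space() in
         * the ring headers (roughly (head - emit - CACHELINE_BYTES)
         * masked to the ring size).
         */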
49         space = __intel_ring_space(ring->head, ring->emit, ring->size);
50
51         ring->space = space;
52         return space;
53 }
54
55 static int
56 gen2_render_ring_flush(struct i915_request *rq, u32 mode)
57 {
58         unsigned int num_store_dw;
59         u32 cmd, *cs;
60
61         cmd = MI_FLUSH;
62         num_store_dw = 0;
63         if (mode & EMIT_INVALIDATE)
64                 cmd |= MI_READ_FLUSH;
65         if (mode & EMIT_FLUSH)
66                 num_store_dw = 4;
67
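        /* Dword budget: 1 for the flush, 3 per scratch store, 1 trailing flush. */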
68         cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
69         if (IS_ERR(cs))
70                 return PTR_ERR(cs);
71
72         *cs++ = cmd;
73         while (num_store_dw--) {
74                 *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
75                 *cs++ = i915_scratch_offset(rq->i915);
76                 *cs++ = 0;
77         }
78         *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
79
80         intel_ring_advance(rq, cs);
81
82         return 0;
83 }
84
85 static int
86 gen4_render_ring_flush(struct i915_request *rq, u32 mode)
87 {
88         u32 cmd, *cs;
89         int i;
90
91         /*
92          * read/write caches:
93          *
94          * I915_GEM_DOMAIN_RENDER is always invalidated, but is
95          * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
96          * also flushed at 2d versus 3d pipeline switches.
97          *
98          * read-only caches:
99          *
100          * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
101          * MI_READ_FLUSH is set, and is always flushed on 965.
102          *
103          * I915_GEM_DOMAIN_COMMAND may not exist?
104          *
105          * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
106          * invalidated when MI_EXE_FLUSH is set.
107          *
108          * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
109          * invalidated with every MI_FLUSH.
110          *
111          * TLBs:
112          *
113          * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
114          * and I915_GEM_DOMAIN_CPU are invalidated at PTE write, and
115          * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
116          * are flushed at any MI_FLUSH.
117          */
118
119         cmd = MI_FLUSH;
120         if (mode & EMIT_INVALIDATE) {
121                 cmd |= MI_EXE_FLUSH;
122                 if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5))
123                         cmd |= MI_INVALIDATE_ISP;
124         }
125
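        /*
         * Dword budget: the leading and trailing flush commands, plus, on
         * the invalidate path, two 4-dword PIPE_CONTROL scratch writes
         * wrapped around a dozen MI_FLUSHes (4 + 12 + 4 = 20).
         */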
126         i = 2;
127         if (mode & EMIT_INVALIDATE)
128                 i += 20;
129
130         cs = intel_ring_begin(rq, i);
131         if (IS_ERR(cs))
132                 return PTR_ERR(cs);
133
134         *cs++ = cmd;
135
136         /*
137          * A random delay to let the CS invalidate take effect? Without this
138          * delay, the GPU relocation path fails as the CS does not see
139          * the updated contents. Just as important, if we apply the flushes
140          * to the EMIT_FLUSH branch (i.e. immediately after the relocation
141          * write and before the invalidate on the next batch), the relocations
142          * still fail. This implies that there is a delay following invalidation
143          * that is required to reset the caches as opposed to a delay to
144          * ensure the memory is written.
145          */
146         if (mode & EMIT_INVALIDATE) {
147                 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
148                 *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
149                 *cs++ = 0;
150                 *cs++ = 0;
151
152                 for (i = 0; i < 12; i++)
153                         *cs++ = MI_FLUSH;
154
155                 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
156                 *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
157                 *cs++ = 0;
158                 *cs++ = 0;
159         }
160
161         *cs++ = cmd;
162
163         intel_ring_advance(rq, cs);
164
165         return 0;
166 }
167
168 /*
169  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
170  * implementing two workarounds on gen6.  From section 1.4.7.1
171  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
172  *
173  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
174  * produced by non-pipelined state commands), software needs to first
175  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
176  * 0.
177  *
178  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
179  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
180  *
181  * And the workaround for these two requires this workaround first:
182  *
183  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
184  * BEFORE the pipe-control with a post-sync op and no write-cache
185  * flushes.
186  *
187  * And this last workaround is tricky because of the requirements on
188  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
189  * volume 2 part 1:
190  *
191  *     "1 of the following must also be set:
192  *      - Render Target Cache Flush Enable ([12] of DW1)
193  *      - Depth Cache Flush Enable ([0] of DW1)
194  *      - Stall at Pixel Scoreboard ([1] of DW1)
195  *      - Depth Stall ([13] of DW1)
196  *      - Post-Sync Operation ([13] of DW1)
197  *      - Notify Enable ([8] of DW1)"
198  *
199  * The cache flushes require the workaround flush that triggered this
200  * one, so we can't use it.  Depth stall would trigger the same.
201  * Post-sync nonzero is what triggered this second workaround, so we
202  * can't use that one either.  Notify enable is IRQs, which aren't
203  * really our business.  That leaves only stall at scoreboard.
204  */
205 static int
206 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
207 {
208         u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
209         u32 *cs;
210
211         cs = intel_ring_begin(rq, 6);
212         if (IS_ERR(cs))
213                 return PTR_ERR(cs);
214
215         *cs++ = GFX_OP_PIPE_CONTROL(5);
216         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
217         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
218         *cs++ = 0; /* low dword */
219         *cs++ = 0; /* high dword */
220         *cs++ = MI_NOOP;
221         intel_ring_advance(rq, cs);
222
223         cs = intel_ring_begin(rq, 6);
224         if (IS_ERR(cs))
225                 return PTR_ERR(cs);
226
227         *cs++ = GFX_OP_PIPE_CONTROL(5);
228         *cs++ = PIPE_CONTROL_QW_WRITE;
229         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
230         *cs++ = 0;
231         *cs++ = 0;
232         *cs++ = MI_NOOP;
233         intel_ring_advance(rq, cs);
234
235         return 0;
236 }
237
238 static int
239 gen6_render_ring_flush(struct i915_request *rq, u32 mode)
240 {
241         u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
242         u32 *cs, flags = 0;
243         int ret;
244
245         /* Force SNB workarounds for PIPE_CONTROL flushes */
246         ret = gen6_emit_post_sync_nonzero_flush(rq);
247         if (ret)
248                 return ret;
249
250         /* Just flush everything.  Experiments have shown that reducing the
251          * number of bits based on the write domains has little performance
252          * impact.
253          */
254         if (mode & EMIT_FLUSH) {
255                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
256                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
257                 /*
258                  * Ensure that any following seqno writes only happen
259                  * when the render cache is indeed flushed.
260                  */
261                 flags |= PIPE_CONTROL_CS_STALL;
262         }
263         if (mode & EMIT_INVALIDATE) {
264                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
265                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
266                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
267                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
268                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
269                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
270                 /*
271                  * TLB invalidate requires a post-sync write.
272                  */
273                 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
274         }
275
276         cs = intel_ring_begin(rq, 4);
277         if (IS_ERR(cs))
278                 return PTR_ERR(cs);
279
280         *cs++ = GFX_OP_PIPE_CONTROL(4);
281         *cs++ = flags;
282         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
283         *cs++ = 0;
284         intel_ring_advance(rq, cs);
285
286         return 0;
287 }
288
289 static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
290 {
291         /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
292         *cs++ = GFX_OP_PIPE_CONTROL(4);
293         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
294         *cs++ = 0;
295         *cs++ = 0;
296
297         *cs++ = GFX_OP_PIPE_CONTROL(4);
298         *cs++ = PIPE_CONTROL_QW_WRITE;
299         *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
300         *cs++ = 0;
301
302         /* Finally we can flush and with it emit the breadcrumb */
303         *cs++ = GFX_OP_PIPE_CONTROL(4);
304         *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
305                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
306                  PIPE_CONTROL_DC_FLUSH_ENABLE |
307                  PIPE_CONTROL_QW_WRITE |
308                  PIPE_CONTROL_CS_STALL);
309         *cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
310         *cs++ = rq->fence.seqno;
311
312         *cs++ = MI_USER_INTERRUPT;
313         *cs++ = MI_NOOP;
314
315         rq->tail = intel_ring_offset(rq, cs);
316         assert_ring_tail_valid(rq->ring, rq->tail);
317
318         return cs;
319 }
320
321 static int
322 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
323 {
324         u32 *cs;
325
326         cs = intel_ring_begin(rq, 4);
327         if (IS_ERR(cs))
328                 return PTR_ERR(cs);
329
330         *cs++ = GFX_OP_PIPE_CONTROL(4);
331         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
332         *cs++ = 0;
333         *cs++ = 0;
334         intel_ring_advance(rq, cs);
335
336         return 0;
337 }
338
339 static int
340 gen7_render_ring_flush(struct i915_request *rq, u32 mode)
341 {
342         u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
343         u32 *cs, flags = 0;
344
345         /*
346          * Ensure that any following seqno writes only happen when the render
347          * cache is indeed flushed.
348          *
349          * Workaround: 4th PIPE_CONTROL command (except the ones with only
350          * read-cache invalidate bits set) must have the CS_STALL bit set. We
351          * don't try to be clever and just set it unconditionally.
352          */
353         flags |= PIPE_CONTROL_CS_STALL;
354
355         /* Just flush everything.  Experiments have shown that reducing the
356          * number of bits based on the write domains has little performance
357          * impact.
358          */
359         if (mode & EMIT_FLUSH) {
360                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
361                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
362                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
363                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
364         }
365         if (mode & EMIT_INVALIDATE) {
366                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
367                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
368                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
369                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
370                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
371                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
372                 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
373                 /*
374                  * TLB invalidate requires a post-sync write.
375                  */
376                 flags |= PIPE_CONTROL_QW_WRITE;
377                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
378
379                 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
380
381                 /* Workaround: we must issue a pipe_control with CS-stall bit
382                  * set before a pipe_control command that has the state cache
383                  * invalidate bit set. */
384                 gen7_render_ring_cs_stall_wa(rq);
385         }
386
387         cs = intel_ring_begin(rq, 4);
388         if (IS_ERR(cs))
389                 return PTR_ERR(cs);
390
391         *cs++ = GFX_OP_PIPE_CONTROL(4);
392         *cs++ = flags;
393         *cs++ = scratch_addr;
394         *cs++ = 0;
395         intel_ring_advance(rq, cs);
396
397         return 0;
398 }
399
400 static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
401 {
402         *cs++ = GFX_OP_PIPE_CONTROL(4);
403         *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
404                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
405                  PIPE_CONTROL_DC_FLUSH_ENABLE |
406                  PIPE_CONTROL_FLUSH_ENABLE |
407                  PIPE_CONTROL_QW_WRITE |
408                  PIPE_CONTROL_GLOBAL_GTT_IVB |
409                  PIPE_CONTROL_CS_STALL);
410         *cs++ = rq->timeline->hwsp_offset;
411         *cs++ = rq->fence.seqno;
412
413         *cs++ = MI_USER_INTERRUPT;
414         *cs++ = MI_NOOP;
415
416         rq->tail = intel_ring_offset(rq, cs);
417         assert_ring_tail_valid(rq->ring, rq->tail);
418
419         return cs;
420 }
421
422 static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
423 {
424         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
425         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
426
427         *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
428         *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
429         *cs++ = rq->fence.seqno;
430
431         *cs++ = MI_USER_INTERRUPT;
432
433         rq->tail = intel_ring_offset(rq, cs);
434         assert_ring_tail_valid(rq->ring, rq->tail);
435
436         return cs;
437 }
438
439 #define GEN7_XCS_WA 32
440 static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
441 {
442         int i;
443
444         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
445         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
446
447         *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
448         *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
449         *cs++ = rq->fence.seqno;
450
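        /*
         * The block of repeated seqno stores below is a workaround:
         * presumably it keeps the CS busy long enough for the seqno write
         * above to become visible before MI_USER_INTERRUPT fires; the
         * repeat count is empirical.
         */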
451         for (i = 0; i < GEN7_XCS_WA; i++) {
452                 *cs++ = MI_STORE_DWORD_INDEX;
453                 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
454                 *cs++ = rq->fence.seqno;
455         }
456
457         *cs++ = MI_FLUSH_DW;
458         *cs++ = 0;
459         *cs++ = 0;
460
461         *cs++ = MI_USER_INTERRUPT;
462         *cs++ = MI_NOOP;
463
464         rq->tail = intel_ring_offset(rq, cs);
465         assert_ring_tail_valid(rq->ring, rq->tail);
466
467         return cs;
468 }
469 #undef GEN7_XCS_WA
470
471 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
472 {
473         /*
474          * Keep the render interrupt unmasked as this papers over
475          * lost interrupts following a reset.
476          */
477         if (engine->class == RENDER_CLASS) {
478                 if (INTEL_GEN(engine->i915) >= 6)
479                         mask &= ~BIT(0);
480                 else
481                         mask &= ~I915_USER_INTERRUPT;
482         }
483
484         intel_engine_set_hwsp_writemask(engine, mask);
485 }
486
487 static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
488 {
489         struct drm_i915_private *dev_priv = engine->i915;
490         u32 addr;
491
492         addr = lower_32_bits(phys);
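        /*
         * Gen4+ parts take the high physical address bits ([35:32],
         * judging by the >> 28 and 0xf0 mask below) in bits [7:4] of the
         * register.
         */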
493         if (INTEL_GEN(dev_priv) >= 4)
494                 addr |= (phys >> 28) & 0xf0;
495
496         I915_WRITE(HWS_PGA, addr);
497 }
498
499 static struct page *status_page(struct intel_engine_cs *engine)
500 {
501         struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
502
503         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
504         return sg_page(obj->mm.pages->sgl);
505 }
506
507 static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
508 {
509         set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
510         set_hwstam(engine, ~0u);
511 }
512
513 static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
514 {
515         struct drm_i915_private *dev_priv = engine->i915;
516         i915_reg_t hwsp;
517
518         /*
519          * The ring status page addresses are no longer next to the rest of
520          * the ring registers as of gen7.
521          */
522         if (IS_GEN(dev_priv, 7)) {
523                 switch (engine->id) {
524                 /*
525                  * No more rings exist on Gen7. Default case is only to shut up
526                  * gcc switch check warning.
527                  */
528                 default:
529                         GEM_BUG_ON(engine->id);
530                         /* fallthrough */
531                 case RCS0:
532                         hwsp = RENDER_HWS_PGA_GEN7;
533                         break;
534                 case BCS0:
535                         hwsp = BLT_HWS_PGA_GEN7;
536                         break;
537                 case VCS0:
538                         hwsp = BSD_HWS_PGA_GEN7;
539                         break;
540                 case VECS0:
541                         hwsp = VEBOX_HWS_PGA_GEN7;
542                         break;
543                 }
544         } else if (IS_GEN(dev_priv, 6)) {
545                 hwsp = RING_HWS_PGA_GEN6(engine->mmio_base);
546         } else {
547                 hwsp = RING_HWS_PGA(engine->mmio_base);
548         }
549
550         I915_WRITE(hwsp, offset);
551         POSTING_READ(hwsp);
552 }
553
554 static void flush_cs_tlb(struct intel_engine_cs *engine)
555 {
556         struct drm_i915_private *dev_priv = engine->i915;
557
558         if (!IS_GEN_RANGE(dev_priv, 6, 7))
559                 return;
560
561         /* Ring should be idle before issuing a sync flush */
562         WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
563
564         ENGINE_WRITE(engine, RING_INSTPM,
565                      _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
566                                         INSTPM_SYNC_FLUSH));
567         if (intel_wait_for_register(engine->uncore,
568                                     RING_INSTPM(engine->mmio_base),
569                                     INSTPM_SYNC_FLUSH, 0,
570                                     1000))
571                 DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
572                           engine->name);
573 }
574
575 static void ring_setup_status_page(struct intel_engine_cs *engine)
576 {
577         set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
578         set_hwstam(engine, ~0u);
579
580         flush_cs_tlb(engine);
581 }
582
583 static bool stop_ring(struct intel_engine_cs *engine)
584 {
585         struct drm_i915_private *dev_priv = engine->i915;
586
587         if (INTEL_GEN(dev_priv) > 2) {
588                 ENGINE_WRITE(engine,
589                              RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
590                 if (intel_wait_for_register(engine->uncore,
591                                             RING_MI_MODE(engine->mmio_base),
592                                             MODE_IDLE,
593                                             MODE_IDLE,
594                                             1000)) {
595                         DRM_ERROR("%s : timed out trying to stop ring\n",
596                                   engine->name);
597
598                         /*
599                          * Sometimes we observe that the idle flag is not
600                          * set even though the ring is empty. So double
601                          * check before giving up.
602                          */
603                         if (ENGINE_READ(engine, RING_HEAD) !=
604                             ENGINE_READ(engine, RING_TAIL))
605                                 return false;
606                 }
607         }
608
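        /* Bring HEAD up to TAIL so the ring reads as drained before both are zeroed. */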
609         ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
610
611         ENGINE_WRITE(engine, RING_HEAD, 0);
612         ENGINE_WRITE(engine, RING_TAIL, 0);
613
614         /* The ring must be empty before it is disabled */
615         ENGINE_WRITE(engine, RING_CTL, 0);
616
617         return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
618 }
619
620 static int xcs_resume(struct intel_engine_cs *engine)
621 {
622         struct drm_i915_private *dev_priv = engine->i915;
623         struct intel_ring *ring = engine->buffer;
624         int ret = 0;
625
626         GEM_TRACE("%s: ring:{HEAD:%04x, TAIL:%04x}\n",
627                   engine->name, ring->head, ring->tail);
628
629         intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
630
631         if (!stop_ring(engine)) {
632                 /* G45 ring initialization often fails to reset head to zero */
633                 DRM_DEBUG_DRIVER("%s head not reset to zero "
634                                 "ctl %08x head %08x tail %08x start %08x\n",
635                                 engine->name,
636                                 ENGINE_READ(engine, RING_CTL),
637                                 ENGINE_READ(engine, RING_HEAD),
638                                 ENGINE_READ(engine, RING_TAIL),
639                                 ENGINE_READ(engine, RING_START));
640
641                 if (!stop_ring(engine)) {
642                         DRM_ERROR("failed to set %s head to zero "
643                                   "ctl %08x head %08x tail %08x start %08x\n",
644                                   engine->name,
645                                   ENGINE_READ(engine, RING_CTL),
646                                   ENGINE_READ(engine, RING_HEAD),
647                                   ENGINE_READ(engine, RING_TAIL),
648                                   ENGINE_READ(engine, RING_START));
649                         ret = -EIO;
650                         goto out;
651                 }
652         }
653
654         if (HWS_NEEDS_PHYSICAL(dev_priv))
655                 ring_setup_phys_status_page(engine);
656         else
657                 ring_setup_status_page(engine);
658
659         intel_engine_reset_breadcrumbs(engine);
660
661         /* Enforce ordering by reading HEAD register back */
662         ENGINE_READ(engine, RING_HEAD);
663
664         /* Initialize the ring. This must happen _after_ we've cleared the ring
665          * registers with the above sequence (the readback of the HEAD registers
666          * also enforces ordering), otherwise the hw might lose the new ring
667          * register values. */
668         ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
669
670         /* WaClearRingBufHeadRegAtInit:ctg,elk */
671         if (ENGINE_READ(engine, RING_HEAD))
672                 DRM_DEBUG_DRIVER("%s initialization failed [head=%08x], fudging\n",
673                                  engine->name, ENGINE_READ(engine, RING_HEAD));
674
675         /* Check that the ring offsets point within the ring! */
676         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
677         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
678         intel_ring_update_space(ring);
679
680         /* First wake the ring up to an empty/idle ring */
681         ENGINE_WRITE(engine, RING_HEAD, ring->head);
682         ENGINE_WRITE(engine, RING_TAIL, ring->head);
683         ENGINE_POSTING_READ(engine, RING_TAIL);
684
685         ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
686
687         /* If the head is still not zero, the ring is dead */
688         if (intel_wait_for_register(engine->uncore,
689                                     RING_CTL(engine->mmio_base),
690                                     RING_VALID, RING_VALID,
691                                     50)) {
692                 DRM_ERROR("%s initialization failed "
693                           "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
694                           engine->name,
695                           ENGINE_READ(engine, RING_CTL),
696                           ENGINE_READ(engine, RING_CTL) & RING_VALID,
697                           ENGINE_READ(engine, RING_HEAD), ring->head,
698                           ENGINE_READ(engine, RING_TAIL), ring->tail,
699                           ENGINE_READ(engine, RING_START),
700                           i915_ggtt_offset(ring->vma));
701                 ret = -EIO;
702                 goto out;
703         }
704
705         if (INTEL_GEN(dev_priv) > 2)
706                 ENGINE_WRITE(engine,
707                              RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
708
709         /* Now awake, let it get started */
710         if (ring->tail != ring->head) {
711                 ENGINE_WRITE(engine, RING_TAIL, ring->tail);
712                 ENGINE_POSTING_READ(engine, RING_TAIL);
713         }
714
715         /* Papering over lost _interrupts_ immediately following the restart */
716         intel_engine_queue_breadcrumbs(engine);
717 out:
718         intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
719
720         return ret;
721 }
722
723 static void reset_prepare(struct intel_engine_cs *engine)
724 {
725         intel_engine_stop_cs(engine);
726 }
727
728 static void reset_ring(struct intel_engine_cs *engine, bool stalled)
729 {
730         struct i915_timeline *tl = &engine->timeline;
731         struct i915_request *pos, *rq;
732         unsigned long flags;
733         u32 head;
734
735         rq = NULL;
736         spin_lock_irqsave(&tl->lock, flags);
737         list_for_each_entry(pos, &tl->requests, link) {
738                 if (!i915_request_completed(pos)) {
739                         rq = pos;
740                         break;
741                 }
742         }
743
744         /*
745          * The guilty request will get skipped on a hung engine.
746          *
747          * Users of client default contexts do not rely on logical
748          * state preserved between batches so it is safe to execute
749          * queued requests following the hang. Non default contexts
750          * rely on preserved state, so skipping a batch loses the
751          * evolution of the state and it needs to be considered corrupted.
752          * Executing more queued batches on top of corrupted state is
753          * risky. But we take the risk by trying to advance through
754          * the queued requests in order to make the client behaviour
755          * more predictable around resets, by not throwing away random
756          * amount of batches it has prepared for execution. Sophisticated
757          * clients can use gem_reset_stats_ioctl and dma fence status
758          * (exported via sync_file info ioctl on explicit fences) to observe
759          * when it loses the context state and should rebuild accordingly.
760          *
761          * The context ban, and ultimately the client ban, mechanism are safety
762          * valves if client submission ends up resulting in nothing more than
763          * subsequent hangs.
764          */
765
766         if (rq) {
767                 /*
768                  * Try to restore the logical GPU state to match the
769                  * continuation of the request queue. If we skip the
770                  * context/PD restore, then the next request may try to execute
771                  * assuming that its context is valid and loaded on the GPU and
772                  * so may try to access invalid memory, prompting repeated GPU
773                  * hangs.
774                  *
775                  * If the request was guilty, we still restore the logical
776                  * state in case the next request requires it (e.g. the
777                  * aliasing ppgtt), but skip over the hung batch.
778                  *
779                  * If the request was innocent, we try to replay the request
780                  * with the restored context.
781                  */
782                 i915_reset_request(rq, stalled);
783
784                 GEM_BUG_ON(rq->ring != engine->buffer);
785                 head = rq->head;
786         } else {
787                 head = engine->buffer->tail;
788         }
789         engine->buffer->head = intel_ring_wrap(engine->buffer, head);
790
791         spin_unlock_irqrestore(&tl->lock, flags);
792 }
793
794 static void reset_finish(struct intel_engine_cs *engine)
795 {
796 }
797
798 static int intel_rcs_ctx_init(struct i915_request *rq)
799 {
800         int ret;
801
802         ret = intel_engine_emit_ctx_wa(rq);
803         if (ret != 0)
804                 return ret;
805
806         ret = i915_gem_render_state_emit(rq);
807         if (ret)
808                 return ret;
809
810         return 0;
811 }
812
813 static int rcs_resume(struct intel_engine_cs *engine)
814 {
815         struct drm_i915_private *dev_priv = engine->i915;
816
817         /*
818          * Disable CONSTANT_BUFFER before it is loaded from the context
819          * image. As soon as it is loaded, it is executed, and the stored
820          * address may no longer be valid, leading to a GPU hang.
821          *
822          * This imposes the requirement that userspace reload their
823          * CONSTANT_BUFFER on every batch, fortunately a requirement
824          * they are already accustomed to from before contexts were
825          * enabled.
826          */
827         if (IS_GEN(dev_priv, 4))
828                 I915_WRITE(ECOSKPD,
829                            _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE));
830
831         /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
832         if (IS_GEN_RANGE(dev_priv, 4, 6))
833                 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
834
835         /* We need to disable the AsyncFlip performance optimisations in order
836          * to use MI_WAIT_FOR_EVENT within the CS. It should already be
837          * programmed to '1' on all products.
838          *
839          * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
840          */
841         if (IS_GEN_RANGE(dev_priv, 6, 7))
842                 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
843
844         /* Required for the hardware to program scanline values for waiting */
845         /* WaEnableFlushTlbInvalidationMode:snb */
846         if (IS_GEN(dev_priv, 6))
847                 I915_WRITE(GFX_MODE,
848                            _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
849
850         /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
851         if (IS_GEN(dev_priv, 7))
852                 I915_WRITE(GFX_MODE_GEN7,
853                            _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
854                            _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
855
856         if (IS_GEN(dev_priv, 6)) {
857                 /* From the Sandybridge PRM, volume 1 part 3, page 24:
858                  * "If this bit is set, STCunit will have LRA as replacement
859                  *  policy. [...] This bit must be reset.  LRA replacement
860                  *  policy is not supported."
861                  */
862                 I915_WRITE(CACHE_MODE_0,
863                            _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
864         }
865
866         if (IS_GEN_RANGE(dev_priv, 6, 7))
867                 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
868
869         return xcs_resume(engine);
870 }
871
872 static void cancel_requests(struct intel_engine_cs *engine)
873 {
874         struct i915_request *request;
875         unsigned long flags;
876
877         spin_lock_irqsave(&engine->timeline.lock, flags);
878
879         /* Mark all submitted requests as skipped. */
880         list_for_each_entry(request, &engine->timeline.requests, link) {
881                 if (!i915_request_signaled(request))
882                         dma_fence_set_error(&request->fence, -EIO);
883
884                 i915_request_mark_complete(request);
885         }
886
887         /* Remaining _unready_ requests will be nop'ed when submitted */
888
889         spin_unlock_irqrestore(&engine->timeline.lock, flags);
890 }
891
892 static void i9xx_submit_request(struct i915_request *request)
893 {
894         i915_request_submit(request);
895
896         ENGINE_WRITE(request->engine, RING_TAIL,
897                      intel_ring_set_tail(request->ring, request->tail));
898 }
899
900 static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
901 {
902         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
903         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
904
905         *cs++ = MI_FLUSH;
906
907         *cs++ = MI_STORE_DWORD_INDEX;
908         *cs++ = I915_GEM_HWS_SEQNO_ADDR;
909         *cs++ = rq->fence.seqno;
910
911         *cs++ = MI_USER_INTERRUPT;
912         *cs++ = MI_NOOP;
913
914         rq->tail = intel_ring_offset(rq, cs);
915         assert_ring_tail_valid(rq->ring, rq->tail);
916
917         return cs;
918 }
919
920 #define GEN5_WA_STORES 8 /* must be at least 1! */
921 static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
922 {
923         int i;
924
925         GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
926         GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
927
928         *cs++ = MI_FLUSH;
929
930         BUILD_BUG_ON(GEN5_WA_STORES < 1);
931         for (i = 0; i < GEN5_WA_STORES; i++) {
932                 *cs++ = MI_STORE_DWORD_INDEX;
933                 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
934                 *cs++ = rq->fence.seqno;
935         }
936
937         *cs++ = MI_USER_INTERRUPT;
938
939         rq->tail = intel_ring_offset(rq, cs);
940         assert_ring_tail_valid(rq->ring, rq->tail);
941
942         return cs;
943 }
944 #undef GEN5_WA_STORES
945
946 static void
947 gen5_irq_enable(struct intel_engine_cs *engine)
948 {
949         gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
950 }
951
952 static void
953 gen5_irq_disable(struct intel_engine_cs *engine)
954 {
955         gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
956 }
957
958 static void
959 i9xx_irq_enable(struct intel_engine_cs *engine)
960 {
961         engine->i915->irq_mask &= ~engine->irq_enable_mask;
962         intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
963         intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
964 }
965
966 static void
967 i9xx_irq_disable(struct intel_engine_cs *engine)
968 {
969         engine->i915->irq_mask |= engine->irq_enable_mask;
970         intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
971 }
972
973 static void
974 i8xx_irq_enable(struct intel_engine_cs *engine)
975 {
976         struct drm_i915_private *dev_priv = engine->i915;
977
978         dev_priv->irq_mask &= ~engine->irq_enable_mask;
979         I915_WRITE16(GEN2_IMR, dev_priv->irq_mask);
980         POSTING_READ16(RING_IMR(engine->mmio_base));
981 }
982
983 static void
984 i8xx_irq_disable(struct intel_engine_cs *engine)
985 {
986         struct drm_i915_private *dev_priv = engine->i915;
987
988         dev_priv->irq_mask |= engine->irq_enable_mask;
989         I915_WRITE16(GEN2_IMR, dev_priv->irq_mask);
990 }
991
992 static int
993 bsd_ring_flush(struct i915_request *rq, u32 mode)
994 {
995         u32 *cs;
996
997         cs = intel_ring_begin(rq, 2);
998         if (IS_ERR(cs))
999                 return PTR_ERR(cs);
1000
1001         *cs++ = MI_FLUSH;
1002         *cs++ = MI_NOOP;
1003         intel_ring_advance(rq, cs);
1004         return 0;
1005 }
1006
1007 static void
1008 gen6_irq_enable(struct intel_engine_cs *engine)
1009 {
1010         ENGINE_WRITE(engine, RING_IMR,
1011                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
1012
1013         /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1014         ENGINE_POSTING_READ(engine, RING_IMR);
1015
1016         gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
1017 }
1018
1019 static void
1020 gen6_irq_disable(struct intel_engine_cs *engine)
1021 {
1022         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
1023         gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
1024 }
1025
1026 static void
1027 hsw_vebox_irq_enable(struct intel_engine_cs *engine)
1028 {
1029         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
1030
1031         /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1032         ENGINE_POSTING_READ(engine, RING_IMR);
1033
1034         gen6_unmask_pm_irq(engine->i915, engine->irq_enable_mask);
1035 }
1036
1037 static void
1038 hsw_vebox_irq_disable(struct intel_engine_cs *engine)
1039 {
1040         ENGINE_WRITE(engine, RING_IMR, ~0);
1041         gen6_mask_pm_irq(engine->i915, engine->irq_enable_mask);
1042 }
1043
1044 static int
1045 i965_emit_bb_start(struct i915_request *rq,
1046                    u64 offset, u32 length,
1047                    unsigned int dispatch_flags)
1048 {
1049         u32 *cs;
1050
1051         cs = intel_ring_begin(rq, 2);
1052         if (IS_ERR(cs))
1053                 return PTR_ERR(cs);
1054
1055         *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
1056                 I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
1057         *cs++ = offset;
1058         intel_ring_advance(rq, cs);
1059
1060         return 0;
1061 }
1062
1063 /* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
1064 #define I830_BATCH_LIMIT SZ_256K
1065 #define I830_TLB_ENTRIES (2)
1066 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
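/*
 * The i830 CS has a TLB invalidation bug, so unpinned batches are first
 * blitted into this fixed scratch window (after evicting the stale TLB
 * entries) and executed from there; see i830_emit_bb_start() below.
 */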
1067 static int
1068 i830_emit_bb_start(struct i915_request *rq,
1069                    u64 offset, u32 len,
1070                    unsigned int dispatch_flags)
1071 {
1072         u32 *cs, cs_offset = i915_scratch_offset(rq->i915);
1073
1074         GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE);
1075
1076         cs = intel_ring_begin(rq, 6);
1077         if (IS_ERR(cs))
1078                 return PTR_ERR(cs);
1079
1080         /* Evict the invalid PTE TLBs */
1081         *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
1082         *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
1083         *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
1084         *cs++ = cs_offset;
1085         *cs++ = 0xdeadbeef;
1086         *cs++ = MI_NOOP;
1087         intel_ring_advance(rq, cs);
1088
1089         if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1090                 if (len > I830_BATCH_LIMIT)
1091                         return -ENOSPC;
1092
1093                 cs = intel_ring_begin(rq, 6 + 2);
1094                 if (IS_ERR(cs))
1095                         return PTR_ERR(cs);
1096
1097                 /* Blit the batch (which now has all relocs applied) to the
1098                  * stable batch scratch bo area (so that the CS never
1099                  * stumbles over its tlb invalidation bug) ...
1100                  */
1101                 *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
1102                 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
1103                 *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
1104                 *cs++ = cs_offset;
1105                 *cs++ = 4096;
1106                 *cs++ = offset;
1107
1108                 *cs++ = MI_FLUSH;
1109                 *cs++ = MI_NOOP;
1110                 intel_ring_advance(rq, cs);
1111
1112                 /* ... and execute it. */
1113                 offset = cs_offset;
1114         }
1115
1116         cs = intel_ring_begin(rq, 2);
1117         if (IS_ERR(cs))
1118                 return PTR_ERR(cs);
1119
1120         *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1121         *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1122                 MI_BATCH_NON_SECURE);
1123         intel_ring_advance(rq, cs);
1124
1125         return 0;
1126 }
1127
1128 static int
1129 i915_emit_bb_start(struct i915_request *rq,
1130                    u64 offset, u32 len,
1131                    unsigned int dispatch_flags)
1132 {
1133         u32 *cs;
1134
1135         cs = intel_ring_begin(rq, 2);
1136         if (IS_ERR(cs))
1137                 return PTR_ERR(cs);
1138
1139         *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1140         *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1141                 MI_BATCH_NON_SECURE);
1142         intel_ring_advance(rq, cs);
1143
1144         return 0;
1145 }
1146
1147 int intel_ring_pin(struct intel_ring *ring)
1148 {
1149         struct i915_vma *vma = ring->vma;
1150         enum i915_map_type map = i915_coherent_map_type(vma->vm->i915);
1151         unsigned int flags;
1152         void *addr;
1153         int ret;
1154
1155         GEM_BUG_ON(ring->vaddr);
1156
1157         ret = i915_timeline_pin(ring->timeline);
1158         if (ret)
1159                 return ret;
1160
1161         flags = PIN_GLOBAL;
1162
1163         /* Ring wraparound at offset 0 sometimes hangs. No idea why. */
1164         flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
1165
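        /*
         * Rings carved out of stolen memory have no struct pages and can
         * only be touched through the mappable aperture, so keep them
         * there; page-backed rings can live high in the GGTT and are
         * CPU-mapped through their pages instead (see the
         * map_and_fenceable check below).
         */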
1166         if (vma->obj->stolen)
1167                 flags |= PIN_MAPPABLE;
1168         else
1169                 flags |= PIN_HIGH;
1170
1171         ret = i915_vma_pin(vma, 0, 0, flags);
1172         if (unlikely(ret))
1173                 goto unpin_timeline;
1174
1175         if (i915_vma_is_map_and_fenceable(vma))
1176                 addr = (void __force *)i915_vma_pin_iomap(vma);
1177         else
1178                 addr = i915_gem_object_pin_map(vma->obj, map);
1179         if (IS_ERR(addr)) {
1180                 ret = PTR_ERR(addr);
1181                 goto unpin_ring;
1182         }
1183
1184         vma->obj->pin_global++;
1185
1186         ring->vaddr = addr;
1187         return 0;
1188
1189 unpin_ring:
1190         i915_vma_unpin(vma);
1191 unpin_timeline:
1192         i915_timeline_unpin(ring->timeline);
1193         return ret;
1194 }
1195
1196 void intel_ring_reset(struct intel_ring *ring, u32 tail)
1197 {
1198         GEM_BUG_ON(!intel_ring_offset_valid(ring, tail));
1199
1200         ring->tail = tail;
1201         ring->head = tail;
1202         ring->emit = tail;
1203         intel_ring_update_space(ring);
1204 }
1205
1206 void intel_ring_unpin(struct intel_ring *ring)
1207 {
1208         GEM_BUG_ON(!ring->vma);
1209         GEM_BUG_ON(!ring->vaddr);
1210
1211         /* Discard any unused bytes beyond that submitted to hw. */
1212         intel_ring_reset(ring, ring->tail);
1213
1214         if (i915_vma_is_map_and_fenceable(ring->vma))
1215                 i915_vma_unpin_iomap(ring->vma);
1216         else
1217                 i915_gem_object_unpin_map(ring->vma->obj);
1218         ring->vaddr = NULL;
1219
1220         ring->vma->obj->pin_global--;
1221         i915_vma_unpin(ring->vma);
1222
1223         i915_timeline_unpin(ring->timeline);
1224 }
1225
1226 static struct i915_vma *
1227 intel_ring_create_vma(struct drm_i915_private *dev_priv, int size)
1228 {
1229         struct i915_address_space *vm = &dev_priv->ggtt.vm;
1230         struct drm_i915_gem_object *obj;
1231         struct i915_vma *vma;
1232
1233         obj = i915_gem_object_create_stolen(dev_priv, size);
1234         if (!obj)
1235                 obj = i915_gem_object_create_internal(dev_priv, size);
1236         if (IS_ERR(obj))
1237                 return ERR_CAST(obj);
1238
1239         /*
1240          * Mark ring buffers as read-only from GPU side (so no stray overwrites)
1241          * if supported by the platform's GGTT.
1242          */
1243         if (vm->has_read_only)
1244                 i915_gem_object_set_readonly(obj);
1245
1246         vma = i915_vma_instance(obj, vm, NULL);
1247         if (IS_ERR(vma))
1248                 goto err;
1249
1250         return vma;
1251
1252 err:
1253         i915_gem_object_put(obj);
1254         return vma;
1255 }
1256
1257 struct intel_ring *
1258 intel_engine_create_ring(struct intel_engine_cs *engine,
1259                          struct i915_timeline *timeline,
1260                          int size)
1261 {
1262         struct intel_ring *ring;
1263         struct i915_vma *vma;
1264
1265         GEM_BUG_ON(!is_power_of_2(size));
1266         GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
1267         GEM_BUG_ON(timeline == &engine->timeline);
1268         lockdep_assert_held(&engine->i915->drm.struct_mutex);
1269
1270         ring = kzalloc(sizeof(*ring), GFP_KERNEL);
1271         if (!ring)
1272                 return ERR_PTR(-ENOMEM);
1273
1274         kref_init(&ring->ref);
1275         INIT_LIST_HEAD(&ring->request_list);
1276         ring->timeline = i915_timeline_get(timeline);
1277
1278         ring->size = size;
1279         /* Workaround an erratum on the i830 which causes a hang if
1280          * the TAIL pointer points to within the last 2 cachelines
1281          * of the buffer.
1282          */
1283         ring->effective_size = size;
1284         if (IS_I830(engine->i915) || IS_I845G(engine->i915))
1285                 ring->effective_size -= 2 * CACHELINE_BYTES;
1286
1287         intel_ring_update_space(ring);
1288
1289         vma = intel_ring_create_vma(engine->i915, size);
1290         if (IS_ERR(vma)) {
1291                 kfree(ring);
1292                 return ERR_CAST(vma);
1293         }
1294         ring->vma = vma;
1295
1296         return ring;
1297 }
1298
1299 void intel_ring_free(struct kref *ref)
1300 {
1301         struct intel_ring *ring = container_of(ref, typeof(*ring), ref);
1302         struct drm_i915_gem_object *obj = ring->vma->obj;
1303
1304         i915_vma_close(ring->vma);
1305         __i915_gem_object_release_unless_active(obj);
1306
1307         i915_timeline_put(ring->timeline);
1308         kfree(ring);
1309 }
1310
1311 static void __ring_context_fini(struct intel_context *ce)
1312 {
1313         GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
1314         i915_gem_object_put(ce->state->obj);
1315 }
1316
1317 static void ring_context_destroy(struct kref *ref)
1318 {
1319         struct intel_context *ce = container_of(ref, typeof(*ce), ref);
1320
1321         GEM_BUG_ON(intel_context_is_pinned(ce));
1322
1323         if (ce->state)
1324                 __ring_context_fini(ce);
1325
1326         intel_context_free(ce);
1327 }
1328
1329 static int __context_pin_ppgtt(struct i915_gem_context *ctx)
1330 {
1331         struct i915_hw_ppgtt *ppgtt;
1332         int err = 0;
1333
1334         ppgtt = ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt;
1335         if (ppgtt)
1336                 err = gen6_ppgtt_pin(ppgtt);
1337
1338         return err;
1339 }
1340
1341 static void __context_unpin_ppgtt(struct i915_gem_context *ctx)
1342 {
1343         struct i915_hw_ppgtt *ppgtt;
1344
1345         ppgtt = ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt;
1346         if (ppgtt)
1347                 gen6_ppgtt_unpin(ppgtt);
1348 }
1349
1350 static int __context_pin(struct intel_context *ce)
1351 {
1352         struct i915_vma *vma;
1353         int err;
1354
1355         vma = ce->state;
1356         if (!vma)
1357                 return 0;
1358
1359         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
1360         if (err)
1361                 return err;
1362
1363         /*
1364          * And mark it as a globally pinned object to let the shrinker know
1365          * it cannot reclaim the object until we release it.
1366          */
1367         vma->obj->pin_global++;
1368         vma->obj->mm.dirty = true;
1369
1370         return 0;
1371 }
1372
1373 static void __context_unpin(struct intel_context *ce)
1374 {
1375         struct i915_vma *vma;
1376
1377         vma = ce->state;
1378         if (!vma)
1379                 return;
1380
1381         vma->obj->pin_global--;
1382         i915_vma_unpin(vma);
1383 }
1384
1385 static void ring_context_unpin(struct intel_context *ce)
1386 {
1387         __context_unpin_ppgtt(ce->gem_context);
1388         __context_unpin(ce);
1389 }
1390
1391 static struct i915_vma *
1392 alloc_context_vma(struct intel_engine_cs *engine)
1393 {
1394         struct drm_i915_private *i915 = engine->i915;
1395         struct drm_i915_gem_object *obj;
1396         struct i915_vma *vma;
1397         int err;
1398
1399         obj = i915_gem_object_create_shmem(i915, engine->context_size);
1400         if (IS_ERR(obj))
1401                 return ERR_CAST(obj);
1402
1403         /*
1404          * Try to make the context utilize L3 as well as LLC.
1405          *
1406          * On VLV we don't have L3 controls in the PTEs so we
1407          * shouldn't touch the cache level, especially as that
1408          * would make the object snooped which might have a
1409          * negative performance impact.
1410          *
1411          * Snooping is required on non-llc platforms in execlist
1412          * mode, but since all GGTT accesses use PAT entry 0 we
1413          * get snooping anyway regardless of cache_level.
1414          *
1415          * This is only applicable for Ivy Bridge devices since
1416          * later platforms don't have L3 control bits in the PTE.
1417          */
1418         if (IS_IVYBRIDGE(i915))
1419                 i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC);
1420
1421         if (engine->default_state) {
1422                 void *defaults, *vaddr;
1423
1424                 vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
1425                 if (IS_ERR(vaddr)) {
1426                         err = PTR_ERR(vaddr);
1427                         goto err_obj;
1428                 }
1429
1430                 defaults = i915_gem_object_pin_map(engine->default_state,
1431                                                    I915_MAP_WB);
1432                 if (IS_ERR(defaults)) {
1433                         err = PTR_ERR(defaults);
1434                         goto err_map;
1435                 }
1436
1437                 memcpy(vaddr, defaults, engine->context_size);
1438                 i915_gem_object_unpin_map(engine->default_state);
1439
1440                 i915_gem_object_flush_map(obj);
1441                 i915_gem_object_unpin_map(obj);
1442         }
1443
1444         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
1445         if (IS_ERR(vma)) {
1446                 err = PTR_ERR(vma);
1447                 goto err_obj;
1448         }
1449
1450         return vma;
1451
1452 err_map:
1453         i915_gem_object_unpin_map(obj);
1454 err_obj:
1455         i915_gem_object_put(obj);
1456         return ERR_PTR(err);
1457 }
1458
1459 static int ring_context_pin(struct intel_context *ce)
1460 {
1461         struct intel_engine_cs *engine = ce->engine;
1462         int err;
1463
1464         /* One ringbuffer to rule them all */
1465         GEM_BUG_ON(!engine->buffer);
1466         ce->ring = engine->buffer;
1467
1468         if (!ce->state && engine->context_size) {
1469                 struct i915_vma *vma;
1470
1471                 vma = alloc_context_vma(engine);
1472                 if (IS_ERR(vma))
1473                         return PTR_ERR(vma);
1474
1475                 ce->state = vma;
1476         }
1477
1478         err = __context_pin(ce);
1479         if (err)
1480                 return err;
1481
1482         err = __context_pin_ppgtt(ce->gem_context);
1483         if (err)
1484                 goto err_unpin;
1485
1486         return 0;
1487
1488 err_unpin:
1489         __context_unpin(ce);
1490         return err;
1491 }
1492
1493 static void ring_context_reset(struct intel_context *ce)
1494 {
1495         intel_ring_reset(ce->ring, 0);
1496 }
1497
1498 static const struct intel_context_ops ring_context_ops = {
1499         .pin = ring_context_pin,
1500         .unpin = ring_context_unpin,
1501
1502         .enter = intel_context_enter_engine,
1503         .exit = intel_context_exit_engine,
1504
1505         .reset = ring_context_reset,
1506         .destroy = ring_context_destroy,
1507 };
1508
1509 static int load_pd_dir(struct i915_request *rq,
1510                        const struct i915_hw_ppgtt *ppgtt)
1511 {
1512         const struct intel_engine_cs * const engine = rq->engine;
1513         u32 *cs;
1514
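        /*
         * Point this engine's PP_DIR registers at the new page directory:
         * DCLV selects the full 2G range, then the directory base is
         * loaded. flush_pd_dir() below reads the register back through
         * the CS to make sure the load has landed before it is relied on.
         */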
1515         cs = intel_ring_begin(rq, 6);
1516         if (IS_ERR(cs))
1517                 return PTR_ERR(cs);
1518
1519         *cs++ = MI_LOAD_REGISTER_IMM(1);
1520         *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
1521         *cs++ = PP_DIR_DCLV_2G;
1522
1523         *cs++ = MI_LOAD_REGISTER_IMM(1);
1524         *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1525         *cs++ = ppgtt->pd.base.ggtt_offset << 10;
1526
1527         intel_ring_advance(rq, cs);
1528
1529         return 0;
1530 }
1531
1532 static int flush_pd_dir(struct i915_request *rq)
1533 {
1534         const struct intel_engine_cs * const engine = rq->engine;
1535         u32 *cs;
1536
1537         cs = intel_ring_begin(rq, 4);
1538         if (IS_ERR(cs))
1539                 return PTR_ERR(cs);
1540
1541         /* Stall until the page table load is complete */
1542         *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1543         *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1544         *cs++ = i915_scratch_offset(rq->i915);
1545         *cs++ = MI_NOOP;
1546
1547         intel_ring_advance(rq, cs);
1548         return 0;
1549 }
1550
1551 static inline int mi_set_context(struct i915_request *rq, u32 flags)
1552 {
1553         struct drm_i915_private *i915 = rq->i915;
1554         struct intel_engine_cs *engine = rq->engine;
1555         enum intel_engine_id id;
1556         const int num_engines =
1557                 IS_HSW_GT1(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0;
1558         bool force_restore = false;
1559         int len;
1560         u32 *cs;
1561
1562         flags |= MI_MM_SPACE_GTT;
1563         if (IS_HASWELL(i915))
1564                 /* These flags are for resource streamer on HSW+ */
1565                 flags |= HSW_MI_RS_SAVE_STATE_EN | HSW_MI_RS_RESTORE_STATE_EN;
1566         else
1567                 /* We need to save the extended state for powersaving modes */
1568                 flags |= MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN;
1569
1570         len = 4;
1571         if (IS_GEN(i915, 7))
1572                 len += 2 + (num_engines ? 4 * num_engines + 6 : 0);
1573         else if (IS_GEN(i915, 5))
1574                 len += 2;
1575         if (flags & MI_FORCE_RESTORE) {
1576                 GEM_BUG_ON(flags & MI_RESTORE_INHIBIT);
1577                 flags &= ~MI_FORCE_RESTORE;
1578                 force_restore = true;
1579                 len += 2;
1580         }
1581
1582         cs = intel_ring_begin(rq, len);
1583         if (IS_ERR(cs))
1584                 return PTR_ERR(cs);
1585
1586         /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
1587         if (IS_GEN(i915, 7)) {
1588                 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1589                 if (num_engines) {
1590                         struct intel_engine_cs *signaller;
1591
1592                         *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1593                         for_each_engine(signaller, i915, id) {
1594                                 if (signaller == engine)
1595                                         continue;
1596
1597                                 *cs++ = i915_mmio_reg_offset(
1598                                            RING_PSMI_CTL(signaller->mmio_base));
1599                                 *cs++ = _MASKED_BIT_ENABLE(
1600                                                 GEN6_PSMI_SLEEP_MSG_DISABLE);
1601                         }
1602                 }
1603         } else if (IS_GEN(i915, 5)) {
1604                 /*
1605                  * This w/a is only listed for pre-production ilk a/b steppings,
1606                  * but is also mentioned for programming the powerctx. To be
1607                  * safe, just apply the workaround; we do not use SyncFlush so
1608                  * this should never take effect and so be a no-op!
1609                  */
1610                 *cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN;
1611         }
1612
1613         if (force_restore) {
1614                 /*
1615                  * The HW doesn't handle being told to restore the current
1616                  * context very well. Quite often it likes to go off and
1617                  * sulk, especially when it is meant to be reloading PP_DIR.
1618                  * A very simple fix to force the reload is to simply switch
1619                  * away from the current context and back again.
1620                  *
1621                  * Note that the kernel_context will contain random state
1622                  * following the INHIBIT_RESTORE. We accept this since we
1623                  * never use the kernel_context state; it is merely a
1624                  * placeholder we use to flush other contexts.
1625                  */
1626                 *cs++ = MI_SET_CONTEXT;
1627                 *cs++ = i915_ggtt_offset(engine->kernel_context->state) |
1628                         MI_MM_SPACE_GTT |
1629                         MI_RESTORE_INHIBIT;
1630         }
1631
1632         *cs++ = MI_NOOP;
1633         *cs++ = MI_SET_CONTEXT;
1634         *cs++ = i915_ggtt_offset(rq->hw_context->state) | flags;
1635         /*
1636          * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
1637          * WaMiSetContext_Hang:snb,ivb,vlv
1638          */
1639         *cs++ = MI_NOOP;
1640
1641         if (IS_GEN(i915, 7)) {
1642                 if (num_engines) {
1643                         struct intel_engine_cs *signaller;
1644                         i915_reg_t last_reg = {}; /* keep gcc quiet */
1645
1646                         *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1647                         for_each_engine(signaller, i915, id) {
1648                                 if (signaller == engine)
1649                                         continue;
1650
1651                                 last_reg = RING_PSMI_CTL(signaller->mmio_base);
1652                                 *cs++ = i915_mmio_reg_offset(last_reg);
1653                                 *cs++ = _MASKED_BIT_DISABLE(
1654                                                 GEN6_PSMI_SLEEP_MSG_DISABLE);
1655                         }
1656
1657                         /* Insert a delay before the next switch! */
1658                         *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1659                         *cs++ = i915_mmio_reg_offset(last_reg);
1660                         *cs++ = i915_scratch_offset(rq->i915);
1661                         *cs++ = MI_NOOP;
1662                 }
1663                 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1664         } else if (IS_GEN(i915, 5)) {
1665                 *cs++ = MI_SUSPEND_FLUSH;
1666         }
1667
1668         intel_ring_advance(rq, cs);
1669
1670         return 0;
1671 }
1672
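/*
 * Re-emit the saved L3 remapping registers (GEN7_L3LOG) for one slice.
 * This is a no-op if no remap information was recorded for the slice.
 */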
1673 static int remap_l3(struct i915_request *rq, int slice)
1674 {
1675         u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice];
1676         int i;
1677
1678         if (!remap_info)
1679                 return 0;
1680
1681         cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2);
1682         if (IS_ERR(cs))
1683                 return PTR_ERR(cs);
1684
1685         /*
1686          * Note: We do not worry about the concurrent register cacheline hang
1687          * here because no other code should access these registers other than
1688          * at initialization time.
1689          */
1690         *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
1691         for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
1692                 *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
1693                 *cs++ = remap_info[i];
1694         }
1695         *cs++ = MI_NOOP;
1696         intel_ring_advance(rq, cs);
1697
1698         return 0;
1699 }
1700
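/*
 * Emit everything required to run this request in its own context:
 * reload the page directory (with extra persuasion on Baytrail), issue
 * MI_SET_CONTEXT where the engine carries a context image, flush and
 * invalidate around the mm switch, and replay any pending L3 remapping.
 */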
1701 static int switch_context(struct i915_request *rq)
1702 {
1703         struct intel_engine_cs *engine = rq->engine;
1704         struct i915_gem_context *ctx = rq->gem_context;
1705         struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
1706         unsigned int unwind_mm = 0;
1707         u32 hw_flags = 0;
1708         int ret, i;
1709
1710         GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
1711
1712         if (ppgtt) {
1713                 int loops;
1714
1715                 /*
1716                  * Baytrail takes a little more convincing that it really needs
1717                  * to reload the PD between contexts. It is not just a little
1718                  * longer, as adding more stalls after the load_pd_dir (i.e.
1719                  * adding a long loop around flush_pd_dir) is not as effective
1720                  * as reloading the PD umpteen times. 32 is derived from
1721                  * experimentation (gem_exec_parallel/fds) and has no good
1722                  * explanation.
1723                  */
1724                 loops = 1;
1725                 if (engine->id == BCS0 && IS_VALLEYVIEW(engine->i915))
1726                         loops = 32;
1727
1728                 do {
1729                         ret = load_pd_dir(rq, ppgtt);
1730                         if (ret)
1731                                 goto err;
1732                 } while (--loops);
1733
1734                 if (ppgtt->pd_dirty_engines & engine->mask) {
1735                         unwind_mm = engine->mask;
1736                         ppgtt->pd_dirty_engines &= ~unwind_mm;
1737                         hw_flags = MI_FORCE_RESTORE;
1738                 }
1739         }
1740
1741         if (rq->hw_context->state) {
1742                 GEM_BUG_ON(engine->id != RCS0);
1743
1744                 /*
1745                  * The kernel context(s) is treated as pure scratch and is not
1746                  * expected to retain any state (as we sacrifice it during
1747                  * suspend and on resume it may be corrupted). This is ok,
1748                  * as nothing actually executes using the kernel context; it
1749                  * is purely used for flushing user contexts.
1750                  */
1751                 if (i915_gem_context_is_kernel(ctx))
1752                         hw_flags = MI_RESTORE_INHIBIT;
1753
1754                 ret = mi_set_context(rq, hw_flags);
1755                 if (ret)
1756                         goto err_mm;
1757         }
1758
1759         if (ppgtt) {
1760                 ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1761                 if (ret)
1762                         goto err_mm;
1763
1764                 ret = flush_pd_dir(rq);
1765                 if (ret)
1766                         goto err_mm;
1767
1768                 /*
1769                  * Not only do we need a full barrier (post-sync write) after
1770                  * invalidating the TLBs, but we need to wait a little bit
1771                  * longer. Whether this is merely delaying us, or the
1772                  * subsequent flush is a key part of serialising with the
1773                  * post-sync op, this extra pass appears vital before a
1774                  * mm switch!
1775                  */
1776                 ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1777                 if (ret)
1778                         goto err_mm;
1779
1780                 ret = engine->emit_flush(rq, EMIT_FLUSH);
1781                 if (ret)
1782                         goto err_mm;
1783         }
1784
1785         if (ctx->remap_slice) {
1786                 for (i = 0; i < MAX_L3_SLICES; i++) {
1787                         if (!(ctx->remap_slice & BIT(i)))
1788                                 continue;
1789
1790                         ret = remap_l3(rq, i);
1791                         if (ret)
1792                                 goto err_mm;
1793                 }
1794
1795                 ctx->remap_slice = 0;
1796         }
1797
1798         return 0;
1799
1800 err_mm:
1801         if (unwind_mm)
1802                 ppgtt->pd_dirty_engines |= unwind_mm;
1803 err:
1804         return ret;
1805 }
1806
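/*
 * Per-request setup for the legacy backend: reserve ring space for
 * finalising the request, invalidate GPU caches and switch the hardware
 * onto the request's context before any user payload is emitted.
 */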
1807 static int ring_request_alloc(struct i915_request *request)
1808 {
1809         int ret;
1810
1811         GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1812         GEM_BUG_ON(request->timeline->has_initial_breadcrumb);
1813
1814         /*
1815          * Flush enough space to reduce the likelihood of waiting after
1816          * we start building the request - in which case we will just
1817          * have to repeat work.
1818          */
1819         request->reserved_space += LEGACY_REQUEST_SIZE;
1820
1821         /* Unconditionally invalidate GPU caches and TLBs. */
1822         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1823         if (ret)
1824                 return ret;
1825
1826         ret = switch_context(request);
1827         if (ret)
1828                 return ret;
1829
1830         request->reserved_space -= LEGACY_REQUEST_SIZE;
1831         return 0;
1832 }
1833
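/*
 * The ring is too full for this request: find the oldest in-flight
 * request whose retirement would free enough space, wait for it to
 * complete, then retire up to and including it.
 */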
1834 static noinline int wait_for_space(struct intel_ring *ring, unsigned int bytes)
1835 {
1836         struct i915_request *target;
1837         long timeout;
1838
1839         if (intel_ring_update_space(ring) >= bytes)
1840                 return 0;
1841
1842         GEM_BUG_ON(list_empty(&ring->request_list));
1843         list_for_each_entry(target, &ring->request_list, ring_link) {
1844                 /* Would completion of this request free enough space? */
1845                 if (bytes <= __intel_ring_space(target->postfix,
1846                                                 ring->emit, ring->size))
1847                         break;
1848         }
1849
1850         if (WARN_ON(&target->ring_link == &ring->request_list))
1851                 return -ENOSPC;
1852
1853         timeout = i915_request_wait(target,
1854                                     I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
1855                                     MAX_SCHEDULE_TIMEOUT);
1856         if (timeout < 0)
1857                 return timeout;
1858
1859         i915_request_retire_upto(target);
1860
1861         intel_ring_update_space(ring);
1862         GEM_BUG_ON(ring->space < bytes);
1863         return 0;
1864 }
1865
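/*
 * Reserve num_dwords of space in the request's ring, waiting for older
 * requests to retire and/or wrapping back to the start (padding the tail
 * with MI_NOOPs) as required. Returns a pointer to write the commands
 * into, or an ERR_PTR on failure. The usual pattern is:
 *
 *	cs = intel_ring_begin(rq, 2);
 *	if (IS_ERR(cs))
 *		return PTR_ERR(cs);
 *	*cs++ = MI_NOOP;
 *	*cs++ = MI_NOOP;
 *	intel_ring_advance(rq, cs);
 */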
1866 u32 *intel_ring_begin(struct i915_request *rq, unsigned int num_dwords)
1867 {
1868         struct intel_ring *ring = rq->ring;
1869         const unsigned int remain_usable = ring->effective_size - ring->emit;
1870         const unsigned int bytes = num_dwords * sizeof(u32);
1871         unsigned int need_wrap = 0;
1872         unsigned int total_bytes;
1873         u32 *cs;
1874
1875         /* Packets must be qword aligned. */
1876         GEM_BUG_ON(num_dwords & 1);
1877
1878         total_bytes = bytes + rq->reserved_space;
1879         GEM_BUG_ON(total_bytes > ring->effective_size);
1880
1881         if (unlikely(total_bytes > remain_usable)) {
1882                 const int remain_actual = ring->size - ring->emit;
1883
1884                 if (bytes > remain_usable) {
1885                         /*
1886                          * Not enough space for the basic request, so flush
1887                          * out the remainder and then wait for
1888                          * base + reserved.
1889                          */
1890                         total_bytes += remain_actual;
1891                         need_wrap = remain_actual | 1;
1892                 } else {
1893                         /*
1894                          * The base request will fit but the reserved space
1895                          * falls off the end. So we don't need an immediate
1896                          * wrap and only need to effectively wait for the
1897                          * reserved size from the start of the ringbuffer.
1898                          */
1899                         total_bytes = rq->reserved_space + remain_actual;
1900                 }
1901         }
1902
1903         if (unlikely(total_bytes > ring->space)) {
1904                 int ret;
1905
1906                 /*
1907                  * Space is reserved in the ringbuffer for finalising the
1908                  * request, as that cannot be allowed to fail. During request
1909                  * finalisation, reserved_space is set to 0 to stop the
1910                  * overallocation and the assumption is that then we never need
1911                  * to wait (which has the risk of failing with EINTR).
1912                  *
1913                  * See also i915_request_alloc() and i915_request_add().
1914                  */
1915                 GEM_BUG_ON(!rq->reserved_space);
1916
1917                 ret = wait_for_space(ring, total_bytes);
1918                 if (unlikely(ret))
1919                         return ERR_PTR(ret);
1920         }
1921
1922         if (unlikely(need_wrap)) {
1923                 need_wrap &= ~1;
1924                 GEM_BUG_ON(need_wrap > ring->space);
1925                 GEM_BUG_ON(ring->emit + need_wrap > ring->size);
1926                 GEM_BUG_ON(!IS_ALIGNED(need_wrap, sizeof(u64)));
1927
1928                 /* Fill the tail with MI_NOOP */
1929                 memset64(ring->vaddr + ring->emit, 0, need_wrap / sizeof(u64));
1930                 ring->space -= need_wrap;
1931                 ring->emit = 0;
1932         }
1933
1934         GEM_BUG_ON(ring->emit > ring->size - bytes);
1935         GEM_BUG_ON(ring->space < bytes);
1936         cs = ring->vaddr + ring->emit;
1937         GEM_DEBUG_EXEC(memset32(cs, POISON_INUSE, bytes / sizeof(*cs)));
1938         ring->emit += bytes;
1939         ring->space -= bytes;
1940
1941         return cs;
1942 }
1943
1944 /* Align the ring tail to a cacheline boundary */
1945 int intel_ring_cacheline_align(struct i915_request *rq)
1946 {
1947         int num_dwords;
1948         void *cs;
1949
1950         num_dwords = (rq->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(u32);
1951         if (num_dwords == 0)
1952                 return 0;
1953
1954         num_dwords = CACHELINE_DWORDS - num_dwords;
1955         GEM_BUG_ON(num_dwords & 1);
1956
1957         cs = intel_ring_begin(rq, num_dwords);
1958         if (IS_ERR(cs))
1959                 return PTR_ERR(cs);
1960
1961         memset64(cs, (u64)MI_NOOP << 32 | MI_NOOP, num_dwords / 2);
1962         intel_ring_advance(rq, cs);
1963
1964         GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1));
1965         return 0;
1966 }
1967
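/*
 * gen6 BSD requires the ring to be fully awake (out of rc6) before the
 * tail pointer may be moved, hence the forcewake and PSMI sleep-message
 * dance wrapped around the ordinary i9xx_submit_request() below.
 */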
1968 static void gen6_bsd_submit_request(struct i915_request *request)
1969 {
1970         struct intel_uncore *uncore = request->engine->uncore;
1971
1972         intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
1973
1974         /* Every tail move must follow the sequence below */
1975
1976         /* Disable notification that the ring is IDLE. The GT
1977          * will then assume that it is busy and bring it out of rc6.
1978          */
1979         intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1980                               _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1981
1982         /* Clear the context id. Here be magic! */
1983         intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0);
1984
1985         /* Wait for the ring not to be idle, i.e. for it to wake up. */
1986         if (__intel_wait_for_register_fw(uncore,
1987                                          GEN6_BSD_SLEEP_PSMI_CONTROL,
1988                                          GEN6_BSD_SLEEP_INDICATOR,
1989                                          0,
1990                                          1000, 0, NULL))
1991                 DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
1992
1993         /* Now that the ring is fully powered up, update the tail */
1994         i9xx_submit_request(request);
1995
1996         /* Let the ring send IDLE messages to the GT again,
1997          * and so let it sleep to conserve power when idle.
1998          */
1999         intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
2000                               _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2001
2002         intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
2003 }
2004
2005 static int mi_flush_dw(struct i915_request *rq, u32 flags)
2006 {
2007         u32 cmd, *cs;
2008
2009         cs = intel_ring_begin(rq, 4);
2010         if (IS_ERR(cs))
2011                 return PTR_ERR(cs);
2012
2013         cmd = MI_FLUSH_DW;
2014
2015         /*
2016          * We always require a command barrier so that subsequent
2017          * commands, such as breadcrumb interrupts, are strictly ordered
2018          * wrt the contents of the write cache being flushed to memory
2019          * (and thus being coherent from the CPU).
2020          */
2021         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2022
2023         /*
2024          * Bspec vol 1c.3 - blitter engine command streamer:
2025          * "If ENABLED, all TLBs will be invalidated once the flush
2026          * operation is complete. This bit is only valid when the
2027          * Post-Sync Operation field is a value of 1h or 3h."
2028          */
2029         cmd |= flags;
2030
2031         *cs++ = cmd;
2032         *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2033         *cs++ = 0;
2034         *cs++ = MI_NOOP;
2035
2036         intel_ring_advance(rq, cs);
2037
2038         return 0;
2039 }
2040
2041 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
2042 {
2043         return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
2044 }
2045
2046 static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
2047 {
2048         return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
2049 }
2050
2051 static int
2052 hsw_emit_bb_start(struct i915_request *rq,
2053                   u64 offset, u32 len,
2054                   unsigned int dispatch_flags)
2055 {
2056         u32 *cs;
2057
2058         cs = intel_ring_begin(rq, 2);
2059         if (IS_ERR(cs))
2060                 return PTR_ERR(cs);
2061
2062         *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2063                 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
2064         /* bits 0-7 are the length on GEN6+ */
2065         *cs++ = offset;
2066         intel_ring_advance(rq, cs);
2067
2068         return 0;
2069 }
2070
2071 static int
2072 gen6_emit_bb_start(struct i915_request *rq,
2073                    u64 offset, u32 len,
2074                    unsigned int dispatch_flags)
2075 {
2076         u32 *cs;
2077
2078         cs = intel_ring_begin(rq, 2);
2079         if (IS_ERR(cs))
2080                 return PTR_ERR(cs);
2081
2082         *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2083                 0 : MI_BATCH_NON_SECURE_I965);
2084         /* bits 0-7 are the length on GEN6+ */
2085         *cs++ = offset;
2086         intel_ring_advance(rq, cs);
2087
2088         return 0;
2089 }
2090
2091 /* Blitter support (SandyBridge+) */
2092
2093 static int gen6_ring_flush(struct i915_request *rq, u32 mode)
2094 {
2095         return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
2096 }
2097
2098 static void i9xx_set_default_submission(struct intel_engine_cs *engine)
2099 {
2100         engine->submit_request = i9xx_submit_request;
2101         engine->cancel_requests = cancel_requests;
2102
2103         engine->park = NULL;
2104         engine->unpark = NULL;
2105 }
2106
2107 static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
2108 {
2109         i9xx_set_default_submission(engine);
2110         engine->submit_request = gen6_bsd_submit_request;
2111 }
2112
2113 static void ring_destroy(struct intel_engine_cs *engine)
2114 {
2115         struct drm_i915_private *dev_priv = engine->i915;
2116
2117         WARN_ON(INTEL_GEN(dev_priv) > 2 &&
2118                 (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
2119
2120         intel_ring_unpin(engine->buffer);
2121         intel_ring_put(engine->buffer);
2122
2123         intel_engine_cleanup_common(engine);
2124         kfree(engine);
2125 }
2126
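/* Select the interrupt enable/disable hooks appropriate for this gen. */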
2127 static void setup_irq(struct intel_engine_cs *engine)
2128 {
2129         struct drm_i915_private *i915 = engine->i915;
2130
2131         if (INTEL_GEN(i915) >= 6) {
2132                 engine->irq_enable = gen6_irq_enable;
2133                 engine->irq_disable = gen6_irq_disable;
2134         } else if (INTEL_GEN(i915) >= 5) {
2135                 engine->irq_enable = gen5_irq_enable;
2136                 engine->irq_disable = gen5_irq_disable;
2137         } else if (INTEL_GEN(i915) >= 3) {
2138                 engine->irq_enable = i9xx_irq_enable;
2139                 engine->irq_disable = i9xx_irq_disable;
2140         } else {
2141                 engine->irq_enable = i8xx_irq_enable;
2142                 engine->irq_disable = i8xx_irq_disable;
2143         }
2144 }
2145
2146 static void setup_common(struct intel_engine_cs *engine)
2147 {
2148         struct drm_i915_private *i915 = engine->i915;
2149
2150         /* gen8+ are only supported with execlists */
2151         GEM_BUG_ON(INTEL_GEN(i915) >= 8);
2152
2153         setup_irq(engine);
2154
2155         engine->destroy = ring_destroy;
2156
2157         engine->resume = xcs_resume;
2158         engine->reset.prepare = reset_prepare;
2159         engine->reset.reset = reset_ring;
2160         engine->reset.finish = reset_finish;
2161
2162         engine->cops = &ring_context_ops;
2163         engine->request_alloc = ring_request_alloc;
2164
2165         /*
2166          * Using a global execution timeline; the previous final breadcrumb is
2167          * equivalent to our next initial breadcrumb so we can elide
2168          * engine->emit_init_breadcrumb().
2169          */
2170         engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
2171         if (IS_GEN(i915, 5))
2172                 engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
2173
2174         engine->set_default_submission = i9xx_set_default_submission;
2175
2176         if (INTEL_GEN(i915) >= 6)
2177                 engine->emit_bb_start = gen6_emit_bb_start;
2178         else if (INTEL_GEN(i915) >= 4)
2179                 engine->emit_bb_start = i965_emit_bb_start;
2180         else if (IS_I830(i915) || IS_I845G(i915))
2181                 engine->emit_bb_start = i830_emit_bb_start;
2182         else
2183                 engine->emit_bb_start = i915_emit_bb_start;
2184 }
2185
2186 static void setup_rcs(struct intel_engine_cs *engine)
2187 {
2188         struct drm_i915_private *i915 = engine->i915;
2189
2190         if (HAS_L3_DPF(i915))
2191                 engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2192
2193         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
2194
2195         if (INTEL_GEN(i915) >= 7) {
2196                 engine->init_context = intel_rcs_ctx_init;
2197                 engine->emit_flush = gen7_render_ring_flush;
2198                 engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
2199         } else if (IS_GEN(i915, 6)) {
2200                 engine->init_context = intel_rcs_ctx_init;
2201                 engine->emit_flush = gen6_render_ring_flush;
2202                 engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
2203         } else if (IS_GEN(i915, 5)) {
2204                 engine->emit_flush = gen4_render_ring_flush;
2205         } else {
2206                 if (INTEL_GEN(i915) < 4)
2207                         engine->emit_flush = gen2_render_ring_flush;
2208                 else
2209                         engine->emit_flush = gen4_render_ring_flush;
2210                 engine->irq_enable_mask = I915_USER_INTERRUPT;
2211         }
2212
2213         if (IS_HASWELL(i915))
2214                 engine->emit_bb_start = hsw_emit_bb_start;
2215
2216         engine->resume = rcs_resume;
2217 }
2218
2219 static void setup_vcs(struct intel_engine_cs *engine)
2220 {
2221         struct drm_i915_private *i915 = engine->i915;
2222
2223         if (INTEL_GEN(i915) >= 6) {
2224                 /* gen6 bsd needs a special wa for tail updates */
2225                 if (IS_GEN(i915, 6))
2226                         engine->set_default_submission = gen6_bsd_set_default_submission;
2227                 engine->emit_flush = gen6_bsd_ring_flush;
2228                 engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
2229
2230                 if (IS_GEN(i915, 6))
2231                         engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2232                 else
2233                         engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2234         } else {
2235                 engine->emit_flush = bsd_ring_flush;
2236                 if (IS_GEN(i915, 5))
2237                         engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
2238                 else
2239                         engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
2240         }
2241 }
2242
2243 static void setup_bcs(struct intel_engine_cs *engine)
2244 {
2245         struct drm_i915_private *i915 = engine->i915;
2246
2247         engine->emit_flush = gen6_ring_flush;
2248         engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
2249
2250         if (IS_GEN(i915, 6))
2251                 engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2252         else
2253                 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2254 }
2255
2256 static void setup_vecs(struct intel_engine_cs *engine)
2257 {
2258         struct drm_i915_private *i915 = engine->i915;
2259
2260         GEM_BUG_ON(INTEL_GEN(i915) < 7);
2261
2262         engine->emit_flush = gen6_ring_flush;
2263         engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
2264         engine->irq_enable = hsw_vebox_irq_enable;
2265         engine->irq_disable = hsw_vebox_irq_disable;
2266
2267         engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2268 }
2269
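/*
 * First phase of legacy engine setup: fill in the gen- and class-specific
 * vfuncs. Ring allocation and pinning is deferred to
 * intel_ring_submission_init().
 */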
2270 int intel_ring_submission_setup(struct intel_engine_cs *engine)
2271 {
2272         setup_common(engine);
2273
2274         switch (engine->class) {
2275         case RENDER_CLASS:
2276                 setup_rcs(engine);
2277                 break;
2278         case VIDEO_DECODE_CLASS:
2279                 setup_vcs(engine);
2280                 break;
2281         case COPY_ENGINE_CLASS:
2282                 setup_bcs(engine);
2283                 break;
2284         case VIDEO_ENHANCEMENT_CLASS:
2285                 setup_vecs(engine);
2286                 break;
2287         default:
2288                 MISSING_CASE(engine->class);
2289                 return -ENODEV;
2290         }
2291
2292         return 0;
2293 }
2294
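/*
 * Second phase: create the engine's timeline on its status page, allocate
 * and pin the 32 * PAGE_SIZE ring shared by every context, and run the
 * common engine init; everything is unwound in reverse on failure.
 */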
2295 int intel_ring_submission_init(struct intel_engine_cs *engine)
2296 {
2297         struct i915_timeline *timeline;
2298         struct intel_ring *ring;
2299         int err;
2300
2301         timeline = i915_timeline_create(engine->i915, engine->status_page.vma);
2302         if (IS_ERR(timeline)) {
2303                 err = PTR_ERR(timeline);
2304                 goto err;
2305         }
2306         GEM_BUG_ON(timeline->has_initial_breadcrumb);
2307
2308         ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
2309         i915_timeline_put(timeline);
2310         if (IS_ERR(ring)) {
2311                 err = PTR_ERR(ring);
2312                 goto err;
2313         }
2314
2315         err = intel_ring_pin(ring);
2316         if (err)
2317                 goto err_ring;
2318
2319         GEM_BUG_ON(engine->buffer);
2320         engine->buffer = ring;
2321
2322         err = intel_engine_init_common(engine);
2323         if (err)
2324                 goto err_unpin;
2325
2326         GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma);
2327
2328         return 0;
2329
2330 err_unpin:
2331         intel_ring_unpin(ring);
2332 err_ring:
2333         intel_ring_put(ring);
2334 err:
2335         intel_engine_cleanup_common(engine);
2336         return err;
2337 }