drivers/gpu/drm/i915/gt/selftest_hangcheck.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/kthread.h>
26
27 #include "gem/i915_gem_context.h"
28
29 #include "intel_gt.h"
30 #include "intel_engine_heartbeat.h"
31 #include "intel_engine_pm.h"
32
33 #include "i915_selftest.h"
34 #include "selftests/i915_random.h"
35 #include "selftests/igt_flush_test.h"
36 #include "selftests/igt_reset.h"
37 #include "selftests/igt_atomic.h"
38
39 #include "selftests/mock_drm.h"
40
41 #include "gem/selftests/mock_context.h"
42 #include "gem/selftests/igt_gem_utils.h"
43
44 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
45
46 struct hang {
47         struct intel_gt *gt;
48         struct drm_i915_gem_object *hws;
49         struct drm_i915_gem_object *obj;
50         struct i915_gem_context *ctx;
51         u32 *seqno;
52         u32 *batch;
53 };
54
55 static int hang_init(struct hang *h, struct intel_gt *gt)
56 {
57         void *vaddr;
58         int err;
59
60         memset(h, 0, sizeof(*h));
61         h->gt = gt;
62
63         h->ctx = kernel_context(gt->i915);
64         if (IS_ERR(h->ctx))
65                 return PTR_ERR(h->ctx);
66
67         GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
68
69         h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
70         if (IS_ERR(h->hws)) {
71                 err = PTR_ERR(h->hws);
72                 goto err_ctx;
73         }
74
75         h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
76         if (IS_ERR(h->obj)) {
77                 err = PTR_ERR(h->obj);
78                 goto err_hws;
79         }
80
81         i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
82         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
83         if (IS_ERR(vaddr)) {
84                 err = PTR_ERR(vaddr);
85                 goto err_obj;
86         }
87         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
88
89         vaddr = i915_gem_object_pin_map(h->obj,
90                                         i915_coherent_map_type(gt->i915));
91         if (IS_ERR(vaddr)) {
92                 err = PTR_ERR(vaddr);
93                 goto err_unpin_hws;
94         }
95         h->batch = vaddr;
96
97         return 0;
98
99 err_unpin_hws:
100         i915_gem_object_unpin_map(h->hws);
101 err_obj:
102         i915_gem_object_put(h->obj);
103 err_hws:
104         i915_gem_object_put(h->hws);
105 err_ctx:
106         kernel_context_close(h->ctx);
107         return err;
108 }
109
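/*
 * The spinning batch advertises that it has started by writing the
 * request's seqno into a per-fence-context slot of the HWS page;
 * hws_address() computes that slot's GPU address and hws_seqno() reads
 * it back from the CPU side.
 */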
110 static u64 hws_address(const struct i915_vma *hws,
111                        const struct i915_request *rq)
112 {
113         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
114 }
115
116 static int move_to_active(struct i915_vma *vma,
117                           struct i915_request *rq,
118                           unsigned int flags)
119 {
120         int err;
121
122         i915_vma_lock(vma);
123         err = i915_request_await_object(rq, vma->obj,
124                                         flags & EXEC_OBJECT_WRITE);
125         if (err == 0)
126                 err = i915_vma_move_to_active(vma, rq, flags);
127         i915_vma_unlock(vma);
128
129         return err;
130 }
131
132 static struct i915_request *
133 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
134 {
135         struct intel_gt *gt = h->gt;
136         struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
137         struct drm_i915_gem_object *obj;
138         struct i915_request *rq = NULL;
139         struct i915_vma *hws, *vma;
140         unsigned int flags;
141         void *vaddr;
142         u32 *batch;
143         int err;
144
145         obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
146         if (IS_ERR(obj)) {
147                 i915_vm_put(vm);
148                 return ERR_CAST(obj);
149         }
150
151         vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
152         if (IS_ERR(vaddr)) {
153                 i915_gem_object_put(obj);
154                 i915_vm_put(vm);
155                 return ERR_CAST(vaddr);
156         }
157
158         i915_gem_object_unpin_map(h->obj);
159         i915_gem_object_put(h->obj);
160
161         h->obj = obj;
162         h->batch = vaddr;
163
164         vma = i915_vma_instance(h->obj, vm, NULL);
165         if (IS_ERR(vma)) {
166                 i915_vm_put(vm);
167                 return ERR_CAST(vma);
168         }
169
170         hws = i915_vma_instance(h->hws, vm, NULL);
171         if (IS_ERR(hws)) {
172                 i915_vm_put(vm);
173                 return ERR_CAST(hws);
174         }
175
176         err = i915_vma_pin(vma, 0, 0, PIN_USER);
177         if (err) {
178                 i915_vm_put(vm);
179                 return ERR_PTR(err);
180         }
181
182         err = i915_vma_pin(hws, 0, 0, PIN_USER);
183         if (err)
184                 goto unpin_vma;
185
186         rq = igt_request_alloc(h->ctx, engine);
187         if (IS_ERR(rq)) {
188                 err = PTR_ERR(rq);
189                 goto unpin_hws;
190         }
191
192         err = move_to_active(vma, rq, 0);
193         if (err)
194                 goto cancel_rq;
195
196         err = move_to_active(hws, rq, 0);
197         if (err)
198                 goto cancel_rq;
199
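	/*
	 * Emit a batch that first stores the request's seqno to its HWS slot
	 * (so wait_until_running() can observe it executing) and then spins
	 * by branching back to the start of the batch, with MI_ARB_CHECK
	 * arbitration points, until the first dword is overwritten with
	 * MI_BATCH_BUFFER_END (see hang_fini()) or the request is reset.
	 * The exact encoding depends on the gen.
	 */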
200         batch = h->batch;
201         if (INTEL_GEN(gt->i915) >= 8) {
202                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
203                 *batch++ = lower_32_bits(hws_address(hws, rq));
204                 *batch++ = upper_32_bits(hws_address(hws, rq));
205                 *batch++ = rq->fence.seqno;
206                 *batch++ = MI_ARB_CHECK;
207
208                 memset(batch, 0, 1024);
209                 batch += 1024 / sizeof(*batch);
210
211                 *batch++ = MI_ARB_CHECK;
212                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
213                 *batch++ = lower_32_bits(vma->node.start);
214                 *batch++ = upper_32_bits(vma->node.start);
215         } else if (INTEL_GEN(gt->i915) >= 6) {
216                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
217                 *batch++ = 0;
218                 *batch++ = lower_32_bits(hws_address(hws, rq));
219                 *batch++ = rq->fence.seqno;
220                 *batch++ = MI_ARB_CHECK;
221
222                 memset(batch, 0, 1024);
223                 batch += 1024 / sizeof(*batch);
224
225                 *batch++ = MI_ARB_CHECK;
226                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
227                 *batch++ = lower_32_bits(vma->node.start);
228         } else if (INTEL_GEN(gt->i915) >= 4) {
229                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
230                 *batch++ = 0;
231                 *batch++ = lower_32_bits(hws_address(hws, rq));
232                 *batch++ = rq->fence.seqno;
233                 *batch++ = MI_ARB_CHECK;
234
235                 memset(batch, 0, 1024);
236                 batch += 1024 / sizeof(*batch);
237
238                 *batch++ = MI_ARB_CHECK;
239                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
240                 *batch++ = lower_32_bits(vma->node.start);
241         } else {
242                 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
243                 *batch++ = lower_32_bits(hws_address(hws, rq));
244                 *batch++ = rq->fence.seqno;
245                 *batch++ = MI_ARB_CHECK;
246
247                 memset(batch, 0, 1024);
248                 batch += 1024 / sizeof(*batch);
249
250                 *batch++ = MI_ARB_CHECK;
251                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
252                 *batch++ = lower_32_bits(vma->node.start);
253         }
254         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
255         intel_gt_chipset_flush(engine->gt);
256
257         if (rq->engine->emit_init_breadcrumb) {
258                 err = rq->engine->emit_init_breadcrumb(rq);
259                 if (err)
260                         goto cancel_rq;
261         }
262
263         flags = 0;
264         if (INTEL_GEN(gt->i915) <= 5)
265                 flags |= I915_DISPATCH_SECURE;
266
267         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
268
269 cancel_rq:
270         if (err) {
271                 i915_request_skip(rq, err);
272                 i915_request_add(rq);
273         }
274 unpin_hws:
275         i915_vma_unpin(hws);
276 unpin_vma:
277         i915_vma_unpin(vma);
278         i915_vm_put(vm);
279         return err ? ERR_PTR(err) : rq;
280 }
281
282 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
283 {
284         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
285 }
286
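/*
 * Convert any still-spinning batch into a terminating one by overwriting
 * its first dword with MI_BATCH_BUFFER_END, flush, then release the
 * objects and context used by the hang.
 */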
287 static void hang_fini(struct hang *h)
288 {
289         *h->batch = MI_BATCH_BUFFER_END;
290         intel_gt_chipset_flush(h->gt);
291
292         i915_gem_object_unpin_map(h->obj);
293         i915_gem_object_put(h->obj);
294
295         i915_gem_object_unpin_map(h->hws);
296         i915_gem_object_put(h->hws);
297
298         kernel_context_close(h->ctx);
299
300         igt_flush_test(h->gt->i915);
301 }
302
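/*
 * Returns true once the spinner has written its seqno to the HWS, i.e.
 * once the hanging request has actually started executing. Busy-wait
 * briefly first, then fall back to a sleeping wait of up to a second.
 */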
303 static bool wait_until_running(struct hang *h, struct i915_request *rq)
304 {
305         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
306                                                rq->fence.seqno),
307                              10) &&
308                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
309                                             rq->fence.seqno),
310                           1000));
311 }
312
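/*
 * The heartbeat would treat our deliberately hung requests as a real hang
 * and reset the engine behind the selftest's back, so park it (holding an
 * engine-pm reference) for the duration and restore the interval after.
 */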
313 static void engine_heartbeat_disable(struct intel_engine_cs *engine,
314                                      unsigned long *saved)
315 {
316         *saved = engine->props.heartbeat_interval_ms;
317         engine->props.heartbeat_interval_ms = 0;
318
319         intel_engine_pm_get(engine);
320         intel_engine_park_heartbeat(engine);
321 }
322
323 static void engine_heartbeat_enable(struct intel_engine_cs *engine,
324                                     unsigned long saved)
325 {
326         intel_engine_pm_put(engine);
327
328         engine->props.heartbeat_interval_ms = saved;
329 }
330
331 static int igt_hang_sanitycheck(void *arg)
332 {
333         struct intel_gt *gt = arg;
334         struct i915_request *rq;
335         struct intel_engine_cs *engine;
336         enum intel_engine_id id;
337         struct hang h;
338         int err;
339
340         /* Basic check that we can execute our hanging batch */
341
342         err = hang_init(&h, gt);
343         if (err)
344                 return err;
345
346         for_each_engine(engine, gt, id) {
347                 struct intel_wedge_me w;
348                 long timeout;
349
350                 if (!intel_engine_can_store_dword(engine))
351                         continue;
352
353                 rq = hang_create_request(&h, engine);
354                 if (IS_ERR(rq)) {
355                         err = PTR_ERR(rq);
356                         pr_err("Failed to create request for %s, err=%d\n",
357                                engine->name, err);
358                         goto fini;
359                 }
360
361                 i915_request_get(rq);
362
363                 *h.batch = MI_BATCH_BUFFER_END;
364                 intel_gt_chipset_flush(engine->gt);
365
366                 i915_request_add(rq);
367
368                 timeout = 0;
369                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
370                         timeout = i915_request_wait(rq, 0,
371                                                     MAX_SCHEDULE_TIMEOUT);
372                 if (intel_gt_is_wedged(gt))
373                         timeout = -EIO;
374
375                 i915_request_put(rq);
376
377                 if (timeout < 0) {
378                         err = timeout;
379                         pr_err("Wait for request failed on %s, err=%d\n",
380                                engine->name, err);
381                         goto fini;
382                 }
383         }
384
385 fini:
386         hang_fini(&h);
387         return err;
388 }
389
390 static bool wait_for_idle(struct intel_engine_cs *engine)
391 {
392         return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
393 }
394
395 static int igt_reset_nop(void *arg)
396 {
397         struct intel_gt *gt = arg;
398         struct i915_gpu_error *global = &gt->i915->gpu_error;
399         struct intel_engine_cs *engine;
400         unsigned int reset_count, count;
401         enum intel_engine_id id;
402         IGT_TIMEOUT(end_time);
403         int err = 0;
404
405         /* Check that we can reset during non-user portions of requests */
406
407         reset_count = i915_reset_count(global);
408         count = 0;
409         do {
410                 for_each_engine(engine, gt, id) {
411                         struct intel_context *ce;
412                         int i;
413
414                         ce = intel_context_create(engine);
415                         if (IS_ERR(ce)) {
416                                 err = PTR_ERR(ce);
417                                 break;
418                         }
419
420                         for (i = 0; i < 16; i++) {
421                                 struct i915_request *rq;
422
423                                 rq = intel_context_create_request(ce);
424                                 if (IS_ERR(rq)) {
425                                         err = PTR_ERR(rq);
426                                         break;
427                                 }
428
429                                 i915_request_add(rq);
430                         }
431
432                         intel_context_put(ce);
433                 }
434
435                 igt_global_reset_lock(gt);
436                 intel_gt_reset(gt, ALL_ENGINES, NULL);
437                 igt_global_reset_unlock(gt);
438
439                 if (intel_gt_is_wedged(gt)) {
440                         err = -EIO;
441                         break;
442                 }
443
444                 if (i915_reset_count(global) != reset_count + ++count) {
445                         pr_err("Full GPU reset not recorded!\n");
446                         err = -EINVAL;
447                         break;
448                 }
449
450                 err = igt_flush_test(gt->i915);
451                 if (err)
452                         break;
453         } while (time_before(jiffies, end_time));
454         pr_info("%s: %d resets\n", __func__, count);
455
456         if (igt_flush_test(gt->i915))
457                 err = -EIO;
458         return err;
459 }
460
461 static int igt_reset_nop_engine(void *arg)
462 {
463         struct intel_gt *gt = arg;
464         struct i915_gpu_error *global = &gt->i915->gpu_error;
465         struct intel_engine_cs *engine;
466         enum intel_engine_id id;
467
468         /* Check that we can engine-reset during non-user portions */
469
470         if (!intel_has_reset_engine(gt))
471                 return 0;
472
473         for_each_engine(engine, gt, id) {
474                 unsigned int reset_count, reset_engine_count, count;
475                 struct intel_context *ce;
476                 unsigned long heartbeat;
477                 IGT_TIMEOUT(end_time);
478                 int err;
479
480                 ce = intel_context_create(engine);
481                 if (IS_ERR(ce))
482                         return PTR_ERR(ce);
483
484                 reset_count = i915_reset_count(global);
485                 reset_engine_count = i915_reset_engine_count(global, engine);
486                 count = 0;
487
488                 engine_heartbeat_disable(engine, &heartbeat);
489                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
490                 do {
491                         int i;
492
493                         if (!wait_for_idle(engine)) {
494                                 pr_err("%s failed to idle before reset\n",
495                                        engine->name);
496                                 err = -EIO;
497                                 break;
498                         }
499
500                         for (i = 0; i < 16; i++) {
501                                 struct i915_request *rq;
502
503                                 rq = intel_context_create_request(ce);
504                                 if (IS_ERR(rq)) {
505                                         err = PTR_ERR(rq);
506                                         break;
507                                 }
508
509                                 i915_request_add(rq);
510                         }
511                         err = intel_engine_reset(engine, NULL);
512                         if (err) {
513                                 pr_err("i915_reset_engine failed\n");
514                                 break;
515                         }
516
517                         if (i915_reset_count(global) != reset_count) {
518                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
519                                 err = -EINVAL;
520                                 break;
521                         }
522
523                         if (i915_reset_engine_count(global, engine) !=
524                             reset_engine_count + ++count) {
525                                 pr_err("%s engine reset not recorded!\n",
526                                        engine->name);
527                                 err = -EINVAL;
528                                 break;
529                         }
530                 } while (time_before(jiffies, end_time));
531                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
532                 engine_heartbeat_enable(engine, heartbeat);
533
534                 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
535
536                 intel_context_put(ce);
537                 if (igt_flush_test(gt->i915))
538                         err = -EIO;
539                 if (err)
540                         return err;
541         }
542
543         return 0;
544 }
545
546 static int __igt_reset_engine(struct intel_gt *gt, bool active)
547 {
548         struct i915_gpu_error *global = &gt->i915->gpu_error;
549         struct intel_engine_cs *engine;
550         enum intel_engine_id id;
551         struct hang h;
552         int err = 0;
553
554         /* Check that we can issue an engine reset on an idle (no-op) or active engine */
555
556         if (!intel_has_reset_engine(gt))
557                 return 0;
558
559         if (active) {
560                 err = hang_init(&h, gt);
561                 if (err)
562                         return err;
563         }
564
565         for_each_engine(engine, gt, id) {
566                 unsigned int reset_count, reset_engine_count;
567                 unsigned long heartbeat;
568                 IGT_TIMEOUT(end_time);
569
570                 if (active && !intel_engine_can_store_dword(engine))
571                         continue;
572
573                 if (!wait_for_idle(engine)) {
574                         pr_err("%s failed to idle before reset\n",
575                                engine->name);
576                         err = -EIO;
577                         break;
578                 }
579
580                 reset_count = i915_reset_count(global);
581                 reset_engine_count = i915_reset_engine_count(global, engine);
582
583                 engine_heartbeat_disable(engine, &heartbeat);
584                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
585                 do {
586                         if (active) {
587                                 struct i915_request *rq;
588
589                                 rq = hang_create_request(&h, engine);
590                                 if (IS_ERR(rq)) {
591                                         err = PTR_ERR(rq);
592                                         break;
593                                 }
594
595                                 i915_request_get(rq);
596                                 i915_request_add(rq);
597
598                                 if (!wait_until_running(&h, rq)) {
599                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
600
601                                         pr_err("%s: Failed to start request %llx, at %x\n",
602                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
603                                         intel_engine_dump(engine, &p,
604                                                           "%s\n", engine->name);
605
606                                         i915_request_put(rq);
607                                         err = -EIO;
608                                         break;
609                                 }
610
611                                 i915_request_put(rq);
612                         }
613
614                         err = intel_engine_reset(engine, NULL);
615                         if (err) {
616                                 pr_err("i915_reset_engine failed\n");
617                                 break;
618                         }
619
620                         if (i915_reset_count(global) != reset_count) {
621                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
622                                 err = -EINVAL;
623                                 break;
624                         }
625
626                         if (i915_reset_engine_count(global, engine) !=
627                             ++reset_engine_count) {
628                                 pr_err("%s engine reset not recorded!\n",
629                                        engine->name);
630                                 err = -EINVAL;
631                                 break;
632                         }
633                 } while (time_before(jiffies, end_time));
634                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
635                 engine_heartbeat_enable(engine, heartbeat);
636
637                 if (err)
638                         break;
639
640                 err = igt_flush_test(gt->i915);
641                 if (err)
642                         break;
643         }
644
645         if (intel_gt_is_wedged(gt))
646                 err = -EIO;
647
648         if (active)
649                 hang_fini(&h);
650
651         return err;
652 }
653
654 static int igt_reset_idle_engine(void *arg)
655 {
656         return __igt_reset_engine(arg, false);
657 }
658
659 static int igt_reset_active_engine(void *arg)
660 {
661         return __igt_reset_engine(arg, true);
662 }
663
664 struct active_engine {
665         struct task_struct *task;
666         struct intel_engine_cs *engine;
667         unsigned long resets;
668         unsigned int flags;
669 };
670
671 #define TEST_ACTIVE     BIT(0)
672 #define TEST_OTHERS     BIT(1)
673 #define TEST_SELF       BIT(2)
674 #define TEST_PRIORITY   BIT(3)
675
676 static int active_request_put(struct i915_request *rq)
677 {
678         int err = 0;
679
680         if (!rq)
681                 return 0;
682
683         if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
684                 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
685                           rq->engine->name,
686                           rq->fence.context,
687                           rq->fence.seqno);
688                 GEM_TRACE_DUMP();
689
690                 intel_gt_set_wedged(rq->engine->gt);
691                 err = -EIO;
692         }
693
694         i915_request_put(rq);
695
696         return err;
697 }
698
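/*
 * Thread body used to keep an engine busy while another engine is reset:
 * maintain a rolling window of 8 in-flight requests across 8 contexts,
 * optionally with randomised priorities, until asked to stop.
 */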
699 static int active_engine(void *data)
700 {
701         I915_RND_STATE(prng);
702         struct active_engine *arg = data;
703         struct intel_engine_cs *engine = arg->engine;
704         struct i915_request *rq[8] = {};
705         struct intel_context *ce[ARRAY_SIZE(rq)];
706         unsigned long count;
707         int err = 0;
708
709         for (count = 0; count < ARRAY_SIZE(ce); count++) {
710                 ce[count] = intel_context_create(engine);
711                 if (IS_ERR(ce[count])) {
712                         err = PTR_ERR(ce[count]);
713                                 while (count--)
714                                 intel_context_put(ce[count]);
715                         return err;
716                 }
717         }
718
719         count = 0;
720         while (!kthread_should_stop()) {
721                 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
722                 struct i915_request *old = rq[idx];
723                 struct i915_request *new;
724
725                 new = intel_context_create_request(ce[idx]);
726                 if (IS_ERR(new)) {
727                         err = PTR_ERR(new);
728                         break;
729                 }
730
731                 rq[idx] = i915_request_get(new);
732                 i915_request_add(new);
733
734                 if (engine->schedule && arg->flags & TEST_PRIORITY) {
735                         struct i915_sched_attr attr = {
736                                 .priority =
737                                         i915_prandom_u32_max_state(512, &prng),
738                         };
739                         engine->schedule(rq[idx], &attr);
740                 }
741
742                 err = active_request_put(old);
743                 if (err)
744                         break;
745
746                 cond_resched();
747         }
748
749         for (count = 0; count < ARRAY_SIZE(rq); count++) {
750                 int err__ = active_request_put(rq[count]);
751
752                 /* Keep the first error */
753                 if (!err)
754                         err = err__;
755
756                 intel_context_put(ce[count]);
757         }
758
759         return err;
760 }
761
762 static int __igt_reset_engines(struct intel_gt *gt,
763                                const char *test_name,
764                                unsigned int flags)
765 {
766         struct i915_gpu_error *global = &gt->i915->gpu_error;
767         struct intel_engine_cs *engine, *other;
768         enum intel_engine_id id, tmp;
769         struct hang h;
770         int err = 0;
771
772         /* Check that issuing a reset on one engine does not interfere
773          * with any other engine.
774          */
775
776         if (!intel_has_reset_engine(gt))
777                 return 0;
778
779         if (flags & TEST_ACTIVE) {
780                 err = hang_init(&h, gt);
781                 if (err)
782                         return err;
783
784                 if (flags & TEST_PRIORITY)
785                         h.ctx->sched.priority = 1024;
786         }
787
788         for_each_engine(engine, gt, id) {
789                 struct active_engine threads[I915_NUM_ENGINES] = {};
790                 unsigned long device = i915_reset_count(global);
791                 unsigned long count = 0, reported;
792                 unsigned long heartbeat;
793                 IGT_TIMEOUT(end_time);
794
795                 if (flags & TEST_ACTIVE &&
796                     !intel_engine_can_store_dword(engine))
797                         continue;
798
799                 if (!wait_for_idle(engine)) {
800                         pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
801                                engine->name, test_name);
802                         err = -EIO;
803                         break;
804                 }
805
806                 memset(threads, 0, sizeof(threads));
807                 for_each_engine(other, gt, tmp) {
808                         struct task_struct *tsk;
809
810                         threads[tmp].resets =
811                                 i915_reset_engine_count(global, other);
812
813                         if (!(flags & TEST_OTHERS))
814                                 continue;
815
816                         if (other == engine && !(flags & TEST_SELF))
817                                 continue;
818
819                         threads[tmp].engine = other;
820                         threads[tmp].flags = flags;
821
822                         tsk = kthread_run(active_engine, &threads[tmp],
823                                           "igt/%s", other->name);
824                         if (IS_ERR(tsk)) {
825                                 err = PTR_ERR(tsk);
826                                 goto unwind;
827                         }
828
829                         threads[tmp].task = tsk;
830                         get_task_struct(tsk);
831                 }
832
833                 yield(); /* start all threads before we begin */
834
835                 engine_heartbeat_disable(engine, &heartbeat);
836                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
837                 do {
838                         struct i915_request *rq = NULL;
839
840                         if (flags & TEST_ACTIVE) {
841                                 rq = hang_create_request(&h, engine);
842                                 if (IS_ERR(rq)) {
843                                         err = PTR_ERR(rq);
844                                         break;
845                                 }
846
847                                 i915_request_get(rq);
848                                 i915_request_add(rq);
849
850                                 if (!wait_until_running(&h, rq)) {
851                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
852
853                                         pr_err("%s: Failed to start request %llx, at %x\n",
854                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
855                                         intel_engine_dump(engine, &p,
856                                                           "%s\n", engine->name);
857
858                                         i915_request_put(rq);
859                                         err = -EIO;
860                                         break;
861                                 }
862                         }
863
864                         err = intel_engine_reset(engine, NULL);
865                         if (err) {
866                                 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
867                                        engine->name, test_name, err);
868                                 break;
869                         }
870
871                         count++;
872
873                         if (rq) {
874                                 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
875                                         struct drm_printer p =
876                                                 drm_info_printer(gt->i915->drm.dev);
877
878                                         pr_err("i915_reset_engine(%s:%s):"
879                                                " failed to complete request after reset\n",
880                                                engine->name, test_name);
881                                         intel_engine_dump(engine, &p,
882                                                           "%s\n", engine->name);
883                                         i915_request_put(rq);
884
885                                         GEM_TRACE_DUMP();
886                                         intel_gt_set_wedged(gt);
887                                         err = -EIO;
888                                         break;
889                                 }
890
891                                 i915_request_put(rq);
892                         }
893
894                         if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
895                                 struct drm_printer p =
896                                         drm_info_printer(gt->i915->drm.dev);
897
898                                 pr_err("i915_reset_engine(%s:%s):"
899                                        " failed to idle after reset\n",
900                                        engine->name, test_name);
901                                 intel_engine_dump(engine, &p,
902                                                   "%s\n", engine->name);
903
904                                 err = -EIO;
905                                 break;
906                         }
907                 } while (time_before(jiffies, end_time));
908                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
909                 engine_heartbeat_enable(engine, heartbeat);
910
911                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
912                         engine->name, test_name, count);
913
914                 reported = i915_reset_engine_count(global, engine);
915                 reported -= threads[engine->id].resets;
916                 if (reported != count) {
917                         pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
918                                engine->name, test_name, count, reported);
919                         if (!err)
920                                 err = -EINVAL;
921                 }
922
923 unwind:
924                 for_each_engine(other, gt, tmp) {
925                         int ret;
926
927                         if (!threads[tmp].task)
928                                 continue;
929
930                         ret = kthread_stop(threads[tmp].task);
931                         if (ret) {
932                                 pr_err("kthread for other engine %s failed, err=%d\n",
933                                        other->name, ret);
934                                 if (!err)
935                                         err = ret;
936                         }
937                         put_task_struct(threads[tmp].task);
938
939                         if (other->uabi_class != engine->uabi_class &&
940                             threads[tmp].resets !=
941                             i915_reset_engine_count(global, other)) {
942                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
943                                        other->name,
944                                        i915_reset_engine_count(global, other) -
945                                        threads[tmp].resets);
946                                 if (!err)
947                                         err = -EINVAL;
948                         }
949                 }
950
951                 if (device != i915_reset_count(global)) {
952                         pr_err("Global reset (count=%ld)!\n",
953                                i915_reset_count(global) - device);
954                         if (!err)
955                                 err = -EINVAL;
956                 }
957
958                 if (err)
959                         break;
960
961                 err = igt_flush_test(gt->i915);
962                 if (err)
963                         break;
964         }
965
966         if (intel_gt_is_wedged(gt))
967                 err = -EIO;
968
969         if (flags & TEST_ACTIVE)
970                 hang_fini(&h);
971
972         return err;
973 }
974
975 static int igt_reset_engines(void *arg)
976 {
977         static const struct {
978                 const char *name;
979                 unsigned int flags;
980         } phases[] = {
981                 { "idle", 0 },
982                 { "active", TEST_ACTIVE },
983                 { "others-idle", TEST_OTHERS },
984                 { "others-active", TEST_OTHERS | TEST_ACTIVE },
985                 {
986                         "others-priority",
987                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
988                 },
989                 {
990                         "self-priority",
991                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
992                 },
993                 { }
994         };
995         struct intel_gt *gt = arg;
996         typeof(*phases) *p;
997         int err;
998
999         for (p = phases; p->name; p++) {
1000                 if (p->flags & TEST_PRIORITY) {
1001                         if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1002                                 continue;
1003                 }
1004
1005                 err = __igt_reset_engines(arg, p->name, p->flags);
1006                 if (err)
1007                         return err;
1008         }
1009
1010         return 0;
1011 }
1012
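/*
 * Stand in for hangcheck by performing an immediate reset of the given
 * engines; returns the global reset count sampled beforehand so callers
 * can verify the reset was recorded.
 */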
1013 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1014 {
1015         u32 count = i915_reset_count(&gt->i915->gpu_error);
1016
1017         intel_gt_reset(gt, mask, NULL);
1018
1019         return count;
1020 }
1021
1022 static int igt_reset_wait(void *arg)
1023 {
1024         struct intel_gt *gt = arg;
1025         struct i915_gpu_error *global = &gt->i915->gpu_error;
1026         struct intel_engine_cs *engine = gt->engine[RCS0];
1027         struct i915_request *rq;
1028         unsigned int reset_count;
1029         struct hang h;
1030         long timeout;
1031         int err;
1032
1033         if (!engine || !intel_engine_can_store_dword(engine))
1034                 return 0;
1035
1036         /* Check that we detect a stuck waiter and issue a reset */
1037
1038         igt_global_reset_lock(gt);
1039
1040         err = hang_init(&h, gt);
1041         if (err)
1042                 goto unlock;
1043
1044         rq = hang_create_request(&h, engine);
1045         if (IS_ERR(rq)) {
1046                 err = PTR_ERR(rq);
1047                 goto fini;
1048         }
1049
1050         i915_request_get(rq);
1051         i915_request_add(rq);
1052
1053         if (!wait_until_running(&h, rq)) {
1054                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1055
1056                 pr_err("%s: Failed to start request %llx, at %x\n",
1057                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1058                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1059
1060                 intel_gt_set_wedged(gt);
1061
1062                 err = -EIO;
1063                 goto out_rq;
1064         }
1065
1066         reset_count = fake_hangcheck(gt, ALL_ENGINES);
1067
1068         timeout = i915_request_wait(rq, 0, 10);
1069         if (timeout < 0) {
1070                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1071                        timeout);
1072                 err = timeout;
1073                 goto out_rq;
1074         }
1075
1076         if (i915_reset_count(global) == reset_count) {
1077                 pr_err("No GPU reset recorded!\n");
1078                 err = -EINVAL;
1079                 goto out_rq;
1080         }
1081
1082 out_rq:
1083         i915_request_put(rq);
1084 fini:
1085         hang_fini(&h);
1086 unlock:
1087         igt_global_reset_unlock(gt);
1088
1089         if (intel_gt_is_wedged(gt))
1090                 return -EIO;
1091
1092         return err;
1093 }
1094
1095 struct evict_vma {
1096         struct completion completion;
1097         struct i915_vma *vma;
1098 };
1099
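/*
 * Thread body that signals it has started and then attempts to evict the
 * node under the vm mutex; it is expected to block as a waiter on the
 * hanging request until the reset releases it.
 */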
1100 static int evict_vma(void *data)
1101 {
1102         struct evict_vma *arg = data;
1103         struct i915_address_space *vm = arg->vma->vm;
1104         struct drm_mm_node evict = arg->vma->node;
1105         int err;
1106
1107         complete(&arg->completion);
1108
1109         mutex_lock(&vm->mutex);
1110         err = i915_gem_evict_for_node(vm, &evict, 0);
1111         mutex_unlock(&vm->mutex);
1112
1113         return err;
1114 }
1115
1116 static int evict_fence(void *data)
1117 {
1118         struct evict_vma *arg = data;
1119         int err;
1120
1121         complete(&arg->completion);
1122
1123         /* Mark the fence register as dirty to force the mmio update. */
1124         err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1125         if (err) {
1126                 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1127                 return err;
1128         }
1129
1130         err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1131         if (err) {
1132                 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1133                 return err;
1134         }
1135
1136         err = i915_vma_pin_fence(arg->vma);
1137         i915_vma_unpin(arg->vma);
1138         if (err) {
1139                 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1140                 return err;
1141         }
1142
1143         i915_vma_unpin_fence(arg->vma);
1144
1145         return 0;
1146 }
1147
1148 static int __igt_reset_evict_vma(struct intel_gt *gt,
1149                                  struct i915_address_space *vm,
1150                                  int (*fn)(void *),
1151                                  unsigned int flags)
1152 {
1153         struct intel_engine_cs *engine = gt->engine[RCS0];
1154         struct drm_i915_gem_object *obj;
1155         struct task_struct *tsk = NULL;
1156         struct i915_request *rq;
1157         struct evict_vma arg;
1158         struct hang h;
1159         unsigned int pin_flags;
1160         int err;
1161
1162         if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1163                 return 0;
1164
1165         if (!engine || !intel_engine_can_store_dword(engine))
1166                 return 0;
1167
1168         /* Check that we can recover an unbind stuck on a hanging request */
1169
1170         err = hang_init(&h, gt);
1171         if (err)
1172                 return err;
1173
1174         obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1175         if (IS_ERR(obj)) {
1176                 err = PTR_ERR(obj);
1177                 goto fini;
1178         }
1179
1180         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1181                 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1182                 if (err) {
1183                         pr_err("Invalid X-tiling settings; err:%d\n", err);
1184                         goto out_obj;
1185                 }
1186         }
1187
1188         arg.vma = i915_vma_instance(obj, vm, NULL);
1189         if (IS_ERR(arg.vma)) {
1190                 err = PTR_ERR(arg.vma);
1191                 goto out_obj;
1192         }
1193
1194         rq = hang_create_request(&h, engine);
1195         if (IS_ERR(rq)) {
1196                 err = PTR_ERR(rq);
1197                 goto out_obj;
1198         }
1199
1200         pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1201
1202         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1203                 pin_flags |= PIN_MAPPABLE;
1204
1205         err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1206         if (err) {
1207                 i915_request_add(rq);
1208                 goto out_obj;
1209         }
1210
1211         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1212                 err = i915_vma_pin_fence(arg.vma);
1213                 if (err) {
1214                         pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1215                         i915_vma_unpin(arg.vma);
1216                         i915_request_add(rq);
1217                         goto out_obj;
1218                 }
1219         }
1220
1221         i915_vma_lock(arg.vma);
1222         err = i915_request_await_object(rq, arg.vma->obj,
1223                                         flags & EXEC_OBJECT_WRITE);
1224         if (err == 0)
1225                 err = i915_vma_move_to_active(arg.vma, rq, flags);
1226         i915_vma_unlock(arg.vma);
1227
1228         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1229                 i915_vma_unpin_fence(arg.vma);
1230         i915_vma_unpin(arg.vma);
1231
1232         i915_request_get(rq);
1233         i915_request_add(rq);
1234         if (err)
1235                 goto out_rq;
1236
1237         if (!wait_until_running(&h, rq)) {
1238                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1239
1240                 pr_err("%s: Failed to start request %llx, at %x\n",
1241                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1242                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1243
1244                 intel_gt_set_wedged(gt);
1245                 goto out_reset;
1246         }
1247
1248         init_completion(&arg.completion);
1249
1250         tsk = kthread_run(fn, &arg, "igt/evict_vma");
1251         if (IS_ERR(tsk)) {
1252                 err = PTR_ERR(tsk);
1253                 tsk = NULL;
1254                 goto out_reset;
1255         }
1256         get_task_struct(tsk);
1257
1258         wait_for_completion(&arg.completion);
1259
1260         if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1261                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1262
1263                 pr_err("igt/evict_vma kthread did not wait\n");
1264                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1265
1266                 intel_gt_set_wedged(gt);
1267                 goto out_reset;
1268         }
1269
1270 out_reset:
1271         igt_global_reset_lock(gt);
1272         fake_hangcheck(gt, rq->engine->mask);
1273         igt_global_reset_unlock(gt);
1274
1275         if (tsk) {
1276                 struct intel_wedge_me w;
1277
1278                 /* The reset, even indirectly, should take less than 10ms. */
1279                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1280                         err = kthread_stop(tsk);
1281
1282                 put_task_struct(tsk);
1283         }
1284
1285 out_rq:
1286         i915_request_put(rq);
1287 out_obj:
1288         i915_gem_object_put(obj);
1289 fini:
1290         hang_fini(&h);
1291         if (intel_gt_is_wedged(gt))
1292                 return -EIO;
1293
1294         return err;
1295 }
1296
1297 static int igt_reset_evict_ggtt(void *arg)
1298 {
1299         struct intel_gt *gt = arg;
1300
1301         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1302                                      evict_vma, EXEC_OBJECT_WRITE);
1303 }
1304
1305 static int igt_reset_evict_ppgtt(void *arg)
1306 {
1307         struct intel_gt *gt = arg;
1308         struct i915_ppgtt *ppgtt;
1309         int err;
1310
1311         /* aliasing == global gtt locking, covered above */
1312         if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1313                 return 0;
1314
1315         ppgtt = i915_ppgtt_create(gt);
1316         if (IS_ERR(ppgtt))
1317                 return PTR_ERR(ppgtt);
1318
1319         err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1320                                     evict_vma, EXEC_OBJECT_WRITE);
1321         i915_vm_put(&ppgtt->vm);
1322
1323         return err;
1324 }
1325
1326 static int igt_reset_evict_fence(void *arg)
1327 {
1328         struct intel_gt *gt = arg;
1329
1330         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1331                                      evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1332 }
1333
1334 static int wait_for_others(struct intel_gt *gt,
1335                            struct intel_engine_cs *exclude)
1336 {
1337         struct intel_engine_cs *engine;
1338         enum intel_engine_id id;
1339
1340         for_each_engine(engine, gt, id) {
1341                 if (engine == exclude)
1342                         continue;
1343
1344                 if (!wait_for_idle(engine))
1345                         return -EIO;
1346         }
1347
1348         return 0;
1349 }
1350
1351 static int igt_reset_queue(void *arg)
1352 {
1353         struct intel_gt *gt = arg;
1354         struct i915_gpu_error *global = &gt->i915->gpu_error;
1355         struct intel_engine_cs *engine;
1356         enum intel_engine_id id;
1357         struct hang h;
1358         int err;
1359
1360         /* Check that we replay pending requests following a hang */
1361
1362         igt_global_reset_lock(gt);
1363
1364         err = hang_init(&h, gt);
1365         if (err)
1366                 goto unlock;
1367
1368         for_each_engine(engine, gt, id) {
1369                 struct i915_request *prev;
1370                 IGT_TIMEOUT(end_time);
1371                 unsigned int count;
1372
1373                 if (!intel_engine_can_store_dword(engine))
1374                         continue;
1375
1376                 prev = hang_create_request(&h, engine);
1377                 if (IS_ERR(prev)) {
1378                         err = PTR_ERR(prev);
1379                         goto fini;
1380                 }
1381
1382                 i915_request_get(prev);
1383                 i915_request_add(prev);
1384
1385                 count = 0;
1386                 do {
1387                         struct i915_request *rq;
1388                         unsigned int reset_count;
1389
1390                         rq = hang_create_request(&h, engine);
1391                         if (IS_ERR(rq)) {
1392                                 err = PTR_ERR(rq);
1393                                 goto fini;
1394                         }
1395
1396                         i915_request_get(rq);
1397                         i915_request_add(rq);
1398
1399                         /*
1400                          * XXX We don't handle resetting the kernel context
1401                          * very well. If we trigger a device reset twice in
1402                          * quick succession while the kernel context is
1403                          * executing, we may end up skipping the breadcrumb.
1404                          * This is really only a problem for the selftest as
1405                          * normally there is a large interlude between resets
1406                          * (hangcheck), or we focus on resetting just one
1407                          * engine and so avoid repeatedly resetting innocents.
1408                          */
1409                         err = wait_for_others(gt, engine);
1410                         if (err) {
1411                                 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1412                                        __func__, engine->name);
1413                                 i915_request_put(rq);
1414                                 i915_request_put(prev);
1415
1416                                 GEM_TRACE_DUMP();
1417                                 intel_gt_set_wedged(gt);
1418                                 goto fini;
1419                         }
1420
1421                         if (!wait_until_running(&h, prev)) {
1422                                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1423
1424                                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1425                                        __func__, engine->name,
1426                                        prev->fence.seqno, hws_seqno(&h, prev));
1427                                 intel_engine_dump(engine, &p,
1428                                                   "%s\n", engine->name);
1429
1430                                 i915_request_put(rq);
1431                                 i915_request_put(prev);
1432
1433                                 intel_gt_set_wedged(gt);
1434
1435                                 err = -EIO;
1436                                 goto fini;
1437                         }
1438
1439                         reset_count = fake_hangcheck(gt, BIT(id));
1440
1441                         if (prev->fence.error != -EIO) {
1442                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1443                                        prev->fence.error);
1444                                 i915_request_put(rq);
1445                                 i915_request_put(prev);
1446                                 err = -EINVAL;
1447                                 goto fini;
1448                         }
1449
1450                         if (rq->fence.error) {
1451                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1452                                        rq->fence.error);
1453                                 i915_request_put(rq);
1454                                 i915_request_put(prev);
1455                                 err = -EINVAL;
1456                                 goto fini;
1457                         }
1458
1459                         if (i915_reset_count(global) == reset_count) {
1460                                 pr_err("No GPU reset recorded!\n");
1461                                 i915_request_put(rq);
1462                                 i915_request_put(prev);
1463                                 err = -EINVAL;
1464                                 goto fini;
1465                         }
1466
1467                         i915_request_put(prev);
1468                         prev = rq;
1469                         count++;
1470                 } while (time_before(jiffies, end_time));
1471                 pr_info("%s: Completed %d resets\n", engine->name, count);
1472
1473                 *h.batch = MI_BATCH_BUFFER_END;
1474                 intel_gt_chipset_flush(engine->gt);
1475
1476                 i915_request_put(prev);
1477
1478                 err = igt_flush_test(gt->i915);
1479                 if (err)
1480                         break;
1481         }
1482
1483 fini:
1484         hang_fini(&h);
1485 unlock:
1486         igt_global_reset_unlock(gt);
1487
1488         if (intel_gt_is_wedged(gt))
1489                 return -EIO;
1490
1491         return err;
1492 }
1493
1494 static int igt_handle_error(void *arg)
1495 {
1496         struct intel_gt *gt = arg;
1497         struct i915_gpu_error *global = &gt->i915->gpu_error;
1498         struct intel_engine_cs *engine = gt->engine[RCS0];
1499         struct hang h;
1500         struct i915_request *rq;
1501         struct i915_gpu_coredump *error;
1502         int err;
1503
1504         /* Check that we can issue a global GPU and engine reset */
1505
1506         if (!intel_has_reset_engine(gt))
1507                 return 0;
1508
1509         if (!engine || !intel_engine_can_store_dword(engine))
1510                 return 0;
1511
1512         err = hang_init(&h, gt);
1513         if (err)
1514                 return err;
1515
1516         rq = hang_create_request(&h, engine);
1517         if (IS_ERR(rq)) {
1518                 err = PTR_ERR(rq);
1519                 goto err_fini;
1520         }
1521
1522         i915_request_get(rq);
1523         i915_request_add(rq);
1524
1525         if (!wait_until_running(&h, rq)) {
1526                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1527
1528                 pr_err("%s: Failed to start request %llx, at %x\n",
1529                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1530                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1531
1532                 intel_gt_set_wedged(gt);
1533
1534                 err = -EIO;
1535                 goto err_request;
1536         }
1537
1538         /* Temporarily disable error capture */
1539         error = xchg(&global->first_error, (void *)-1);
1540
1541         intel_gt_handle_error(gt, engine->mask, 0, NULL);
1542
1543         xchg(&global->first_error, error);
1544
1545         if (rq->fence.error != -EIO) {
1546                 pr_err("Guilty request not identified!\n");
1547                 err = -EINVAL;
1548                 goto err_request;
1549         }
1550
1551 err_request:
1552         i915_request_put(rq);
1553 err_fini:
1554         hang_fini(&h);
1555         return err;
1556 }
1557
1558 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1559                                      const struct igt_atomic_section *p,
1560                                      const char *mode)
1561 {
1562         struct tasklet_struct * const t = &engine->execlists.tasklet;
1563         int err;
1564
1565         GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1566                   engine->name, mode, p->name);
1567
1568         tasklet_disable(t);
1569         p->critical_section_begin();
1570
1571         err = intel_engine_reset(engine, NULL);
1572
1573         p->critical_section_end();
1574         tasklet_enable(t);
1575
1576         if (err)
1577                 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1578                        engine->name, mode, p->name);
1579
1580         return err;
1581 }
1582
1583 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1584                                    const struct igt_atomic_section *p)
1585 {
1586         struct i915_request *rq;
1587         struct hang h;
1588         int err;
1589
1590         err = __igt_atomic_reset_engine(engine, p, "idle");
1591         if (err)
1592                 return err;
1593
1594         err = hang_init(&h, engine->gt);
1595         if (err)
1596                 return err;
1597
1598         rq = hang_create_request(&h, engine);
1599         if (IS_ERR(rq)) {
1600                 err = PTR_ERR(rq);
1601                 goto out;
1602         }
1603
1604         i915_request_get(rq);
1605         i915_request_add(rq);
1606
1607         if (wait_until_running(&h, rq)) {
1608                 err = __igt_atomic_reset_engine(engine, p, "active");
1609         } else {
1610                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1611                        __func__, engine->name,
1612                        rq->fence.seqno, hws_seqno(&h, rq));
1613                 intel_gt_set_wedged(engine->gt);
1614                 err = -EIO;
1615         }
1616
1617         if (err == 0) {
1618                 struct intel_wedge_me w;
1619
1620                 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1621                         i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1622                 if (intel_gt_is_wedged(engine->gt))
1623                         err = -EIO;
1624         }
1625
1626         i915_request_put(rq);
1627 out:
1628         hang_fini(&h);
1629         return err;
1630 }
1631
1632 static int igt_reset_engines_atomic(void *arg)
1633 {
1634         struct intel_gt *gt = arg;
1635         const typeof(*igt_atomic_phases) *p;
1636         int err = 0;
1637
1638         /* Check that engine resets are usable from atomic context */
1639
1640         if (!intel_has_reset_engine(gt))
1641                 return 0;
1642
1643         if (intel_uc_uses_guc_submission(&gt->uc))
1644                 return 0;
1645
1646         igt_global_reset_lock(gt);
1647
1648         /* Flush any requests before we get started and check basics */
1649         if (!igt_force_reset(gt))
1650                 goto unlock;
1651
1652         for (p = igt_atomic_phases; p->name; p++) {
1653                 struct intel_engine_cs *engine;
1654                 enum intel_engine_id id;
1655
1656                 for_each_engine(engine, gt, id) {
1657                         err = igt_atomic_reset_engine(engine, p);
1658                         if (err)
1659                                 goto out;
1660                 }
1661         }
1662
1663 out:
1664         /* As we poke around the guts, do a full reset before continuing. */
1665         igt_force_reset(gt);
1666 unlock:
1667         igt_global_reset_unlock(gt);
1668
1669         return err;
1670 }
1671
1672 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1673 {
1674         static const struct i915_subtest tests[] = {
1675                 SUBTEST(igt_hang_sanitycheck),
1676                 SUBTEST(igt_reset_nop),
1677                 SUBTEST(igt_reset_nop_engine),
1678                 SUBTEST(igt_reset_idle_engine),
1679                 SUBTEST(igt_reset_active_engine),
1680                 SUBTEST(igt_reset_engines),
1681                 SUBTEST(igt_reset_engines_atomic),
1682                 SUBTEST(igt_reset_queue),
1683                 SUBTEST(igt_reset_wait),
1684                 SUBTEST(igt_reset_evict_ggtt),
1685                 SUBTEST(igt_reset_evict_ppgtt),
1686                 SUBTEST(igt_reset_evict_fence),
1687                 SUBTEST(igt_handle_error),
1688         };
1689         struct intel_gt *gt = &i915->gt;
1690         intel_wakeref_t wakeref;
1691         int err;
1692
1693         if (!intel_has_gpu_reset(gt))
1694                 return 0;
1695
1696         if (intel_gt_is_wedged(gt))
1697                 return -EIO; /* we're long past hope of a successful reset */
1698
1699         wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1700
1701         err = intel_gt_live_subtests(tests, gt);
1702
1703         intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1704
1705         return err;
1706 }