// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

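/*
 * Test fixture shared by the hangcheck selftests: a non-bannable kernel
 * context plus two internal objects. @hws is a page of per-request seqno
 * slots that the hanging batch writes to (so we can tell it has started
 * executing), and @obj/@batch hold the spinning batch buffer itself.
 */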
struct hang {
        struct intel_gt *gt;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->gt = gt;

        h->ctx = kernel_context(gt->i915, NULL);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

        h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map_unlocked(h->obj,
                                                 i915_coherent_map_type(gt->i915, h->obj, false));
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

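/*
 * Each request gets its own seqno slot in the HWS page, indexed by the
 * request's fence context (reduced modulo the page size in hws_seqno()).
 */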
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

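/*
 * Track @vma in @rq, first waiting on any async binds or pending writes,
 * so the object stays alive and resident until the request completes.
 */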
static int move_to_active(struct i915_vma *vma,
                          struct i915_request *rq,
                          unsigned int flags)
{
        int err;

        i915_vma_lock(vma);
        err = i915_request_await_object(rq, vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0)
                err = i915_vma_move_to_active(vma, rq, flags);
        i915_vma_unlock(vma);

        return err;
}

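/*
 * Build a request whose batch stores its seqno into the HWS page and then
 * spins forever, using the right MI instruction encodings for each
 * hardware generation. A fresh batch object is allocated per request so
 * that a still-spinning batch from a previous request cannot scribble
 * over the one we are about to submit.
 */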
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct intel_gt *gt = h->gt;
        struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
        struct drm_i915_gem_object *obj;
        struct i915_request *rq = NULL;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        void *vaddr;
        u32 *batch;
        int err;

        obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(obj)) {
                i915_vm_put(vm);
                return ERR_CAST(obj);
        }

        vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
        if (IS_ERR(vaddr)) {
                i915_gem_object_put(obj);
                i915_vm_put(vm);
                return ERR_CAST(vaddr);
        }

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        h->obj = obj;
        h->batch = vaddr;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma)) {
                i915_vm_put(vm);
                return ERR_CAST(vma);
        }

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws)) {
                i915_vm_put(vm);
                return ERR_CAST(hws);
        }

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err) {
                i915_vm_put(vm);
                return ERR_PTR(err);
        }

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        rq = igt_request_alloc(h->ctx, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto unpin_hws;
        }

        err = move_to_active(vma, rq, 0);
        if (err)
                goto cancel_rq;

        err = move_to_active(hws, rq, 0);
        if (err)
                goto cancel_rq;

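        /*
         * The batch is, in effect:
         *
         *   STORE_DWORD(hws + slot(rq)) <- rq->fence.seqno
         *   <zero padding>
         *   BATCH_BUFFER_START -> start of this batch
         *
         * i.e. it advertises that it is running and then loops on itself
         * until the engine is reset or MI_BATCH_BUFFER_END is written
         * over the loop (see hang_fini()).
         */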
        batch = h->batch;
        if (GRAPHICS_VER(gt->i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        intel_gt_chipset_flush(engine->gt);

        if (rq->engine->emit_init_breadcrumb) {
                err = rq->engine->emit_init_breadcrumb(rq);
                if (err)
                        goto cancel_rq;
        }

        flags = 0;
        if (GRAPHICS_VER(gt->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
        if (err) {
                i915_request_set_error_once(rq, err);
                i915_request_add(rq);
        }
unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        i915_vm_put(vm);
        return err ? ERR_PTR(err) : rq;
}

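/* Read back the seqno the hanging batch last stored for this request. */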
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        intel_gt_chipset_flush(h->gt);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->gt->i915);
}

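/*
 * Poll the HWS slot until the batch reports that it is executing: first
 * a short 10us busy-wait, then up to a second of sleeping waits.
 */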
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        err = hang_init(&h, gt);
        if (err)
                return err;

        for_each_engine(engine, gt, id) {
                struct intel_wedge_me w;
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                intel_gt_chipset_flush(engine->gt);

                i915_request_add(rq);

                timeout = 0;
                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
                        timeout = i915_request_wait(rq, 0,
                                                    MAX_SCHEDULE_TIMEOUT);
                if (intel_gt_is_wedged(gt))
                        timeout = -EIO;

                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
        return err;
}

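/* Wait up to IGT_IDLE_TIMEOUT (50ms) for the engine to go idle. */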
static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        unsigned int reset_count, count;
        enum intel_engine_id id;
        IGT_TIMEOUT(end_time);
        int err = 0;

        /* Check that we can reset during non-user portions of requests */

        reset_count = i915_reset_count(global);
        count = 0;
        do {
                for_each_engine(engine, gt, id) {
                        struct intel_context *ce;
                        int i;

                        ce = intel_context_create(engine);
                        if (IS_ERR(ce)) {
                                err = PTR_ERR(ce);
                                pr_err("[%s] Create context failed: %d!\n", engine->name, err);
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create request failed: %d!\n",
                                               engine->name, err);
                                        break;
                                }

                                i915_request_add(rq);
                        }

                        intel_context_put(ce);
                }

                igt_global_reset_lock(gt);
                intel_gt_reset(gt, ALL_ENGINES, NULL);
                igt_global_reset_unlock(gt);

                if (intel_gt_is_wedged(gt)) {
                        pr_err("[%s] GT is wedged!\n", engine->name);
                        err = -EIO;
                        break;
                }

                if (i915_reset_count(global) != reset_count + ++count) {
                        pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
                               engine->name, i915_reset_count(global), reset_count, count);
                        err = -EINVAL;
                        break;
                }

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        } while (time_before(jiffies, end_time));
        pr_info("%s: %d resets\n", __func__, count);

        if (igt_flush_test(gt->i915)) {
                pr_err("Post flush failed: %d!\n", err);
                err = -EIO;
        }

        return err;
}

static int igt_reset_nop_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can engine-reset during non-user portions */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count, count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                if (intel_engine_uses_guc(engine)) {
                        /* Engine level resets are triggered by GuC when a hang
                         * is detected. They can't be triggered by the KMD any
                         * more. Thus a nop batch cannot be used as a reset test
                         */
                        continue;
                }

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);
                count = 0;

                st_engine_heartbeat_disable(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));
                do {
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);

                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            reset_engine_count + ++count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

                intel_context_put(ce);
                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

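/*
 * Rig the engine's reset fault injection so that (almost) every attempted
 * reset times out, letting us exercise the failure paths on demand;
 * cancel_reset_timeout() restores normal behaviour.
 */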
static void force_reset_timeout(struct intel_engine_cs *engine)
{
        engine->reset_timeout.probability = 999;
        atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
        memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can recover from engine-reset failures */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                /* Can't manually break the reset if i915 doesn't perform it */
                if (intel_engine_uses_guc(engine))
                        continue;

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                st_engine_heartbeat_disable(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));

                force_reset_timeout(engine);
                err = intel_engine_reset(engine, NULL);
                cancel_reset_timeout(engine);
                if (err == 0) /* timeouts only generated on gen8+ */
                        goto skip;

                count = 0;
                do {
                        struct i915_request *last = NULL;
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < count % 15; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);
                                        if (last)
                                                i915_request_put(last);

                                        err = PTR_ERR(rq);
                                        goto out;
                                }

                                if (last)
                                        i915_request_put(last);
                                last = i915_request_get(rq);
                                i915_request_add(rq);
                        }

                        if (count & 1) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
                                                      engine->name, err);
                                        GEM_TRACE_DUMP();
                                        i915_request_put(last);
                                        break;
                                }
                        } else {
                                force_reset_timeout(engine);
                                err = intel_engine_reset(engine, NULL);
                                cancel_reset_timeout(engine);
                                if (err != -ETIMEDOUT) {
                                        pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
                                               engine->name, err);
                                        i915_request_put(last);
                                        break;
                                }
                        }

                        err = 0;
                        if (last) {
                                if (i915_request_wait(last, 0, HZ / 2) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to complete request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to complete request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        err = -EIO;
                                }
                                i915_request_put(last);
                        }
                        count++;
                } while (err == 0 && time_before(jiffies, end_time));
out:
                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                intel_context_put(ce);

                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle engine (no-op) */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (active) {
                err = hang_init(&h, gt);
                if (err)
                        return err;
        }

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned long count;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (using_guc && !active)
                        continue;

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);

                st_engine_heartbeat_disable(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));
                count = 0;
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (active) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                               engine->name, err);
                                        goto skip;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id.id, err);
                        }

skip:
                        if (rq)
                                i915_request_put(rq);

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                goto restore;
                        }

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (i915_reset_engine_count(global, engine) !=
                                    ++reset_engine_count) {
                                        pr_err("%s engine reset not recorded!\n",
                                               engine->name);
                                        err = -EINVAL;
                                        goto restore;
                                }
                        }

                        count++;

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                pr_info("%s: Completed %lu %s resets\n",
                        engine->name, count, active ? "active" : "idle");

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt)) {
                pr_err("GT is wedged!\n");
                err = -EIO;
        }

        if (active)
                hang_fini(&h);

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)

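/*
 * Wait for a background request to complete (up to 10s); if it does not,
 * declare the GT wedged since the reset evidently failed to recover it.
 */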
static int active_request_put(struct i915_request *rq)
{
        int err = 0;

        if (!rq)
                return 0;

        if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
                          rq->engine->name,
                          rq->fence.context,
                          rq->fence.seqno);
                GEM_TRACE_DUMP();

                intel_gt_set_wedged(rq->engine->gt);
                err = -EIO;
        }

        i915_request_put(rq);

        return err;
}

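/*
 * kthread body: keep a ring of 8 contexts busy on one engine, recycling
 * the oldest request each iteration (and optionally reshuffling request
 * priorities) to provide background traffic while another engine is reset.
 */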
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct intel_context *ce[ARRAY_SIZE(rq)];
        unsigned long count;
        int err = 0;

        for (count = 0; count < ARRAY_SIZE(ce); count++) {
                ce[count] = intel_context_create(engine);
                if (IS_ERR(ce[count])) {
                        err = PTR_ERR(ce[count]);
                        pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
                        while (count--)
                                intel_context_put(ce[count]);
                        return err;
                }
        }

        count = 0;
        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                new = intel_context_create_request(ce[idx]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
                        break;
                }

                rq[idx] = i915_request_get(new);
                i915_request_add(new);

                if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
                        struct i915_sched_attr attr = {
                                .priority =
                                        i915_prandom_u32_max_state(512, &prng),
                        };
                        engine->sched_engine->schedule(rq[idx], &attr);
                }

                err = active_request_put(old);
                if (err) {
                        pr_err("[%s] Request put failed: %d!\n", engine->name, err);
                        break;
                }

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++) {
                int err__ = active_request_put(rq[count]);

                if (err__)
                        pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

                /* Keep the first error */
                if (!err)
                        err = err__;

                intel_context_put(ce[count]);
        }

        return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
                               const char *test_name,
                               unsigned int flags)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (flags & TEST_ACTIVE) {
                err = hang_init(&h, gt);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, gt, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long device = i915_reset_count(global);
                unsigned long count = 0, reported;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE) {
                        if (!intel_engine_can_store_dword(engine))
                                continue;
                } else if (using_guc)
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, gt, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(global, other);

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        if (other != engine && !(flags & TEST_OTHERS))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                yield(); /* start all threads before we begin */

                st_engine_heartbeat_disable_no_pm(engine);
                GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
                                            &gt->reset.flags));
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (flags & TEST_ACTIVE) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        } else {
                                intel_engine_pm_get(engine);
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                               engine->name, test_name, err);
                                        goto restore;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id.id, err);
                        }

                        count++;

                        if (rq) {
                                if (rq->fence.error != -EIO) {
                                        pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id.id);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to complete request %llx:%lld after reset\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_ACTIVE))
                                intel_engine_pm_put(engine);

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(gt->i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s):"
                                       " failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                goto restore;
                        }

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable_no_pm(engine);

                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                /* GuC based resets are not logged per engine */
                if (!using_guc) {
                        reported = i915_reset_engine_count(global, engine);
                        reported -= threads[engine->id].resets;
                        if (reported != count) {
                                pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                                       engine->name, test_name, count, reported);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

unwind:
                for_each_engine(other, gt, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (other->uabi_class != engine->uabi_class &&
                                    threads[tmp].resets !=
                                    i915_reset_engine_count(global, other)) {
                                        pr_err("Innocent engine %s was reset (count=%ld)\n",
                                               other->name,
                                               i915_reset_engine_count(global, other) -
                                               threads[tmp].resets);
                                        if (!err)
                                                err = -EINVAL;
                                }
                        }
                }

                if (device != i915_reset_count(global)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(global) - device);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (flags & TEST_ACTIVE)
                hang_fini(&h);

        return err;
}

static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct intel_gt *gt = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}

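/*
 * Pretend hangcheck fired: reset the @mask engines directly and return
 * the global reset count sampled beforehand for later comparison.
 */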
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
        u32 count = i915_reset_count(&gt->i915->gpu_error);

        intel_gt_reset(gt, mask, NULL);

        return count;
}

static int igt_reset_wait(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        igt_global_reset_lock(gt);

        err = hang_init(&h, gt);
        if (err) {
                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
                goto unlock;
        }

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
                goto fini;
        }

        i915_request_get(rq);
        i915_request_add(rq);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(gt, ALL_ENGINES);

        timeout = i915_request_wait(rq, 0, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        if (i915_reset_count(global) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        igt_global_reset_unlock(gt);

        if (intel_gt_is_wedged(gt))
                return -EIO;

        return err;
}

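/*
 * Arguments handed to the eviction kthreads below: the vma under test
 * plus a completion used to signal that the thread has started (the
 * eviction itself is expected to block behind the hanging request).
 */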
struct evict_vma {
        struct completion completion;
        struct i915_vma *vma;
};

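/* kthread body: try to evict the vma's range from its address space. */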
static int evict_vma(void *data)
{
        struct evict_vma *arg = data;
        struct i915_address_space *vm = arg->vma->vm;
        struct drm_mm_node evict = arg->vma->node;
        int err;

        complete(&arg->completion);

        mutex_lock(&vm->mutex);
        err = i915_gem_evict_for_node(vm, &evict, 0);
        mutex_unlock(&vm->mutex);

        return err;
}

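/*
 * kthread body: steal the vma's fence register rather than its GTT range,
 * by dirtying the tiling state and re-pinning a fence on it.
 */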
static int evict_fence(void *data)
{
        struct evict_vma *arg = data;
        int err;

        complete(&arg->completion);

        /* Mark the fence register as dirty to force the mmio update. */
        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
        if (err) {
                pr_err("Invalid Y-tiling settings; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
        if (err) {
                pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin_fence(arg->vma);
        i915_vma_unpin(arg->vma);
        if (err) {
                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
                return err;
        }

        i915_vma_unpin_fence(arg->vma);

        return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
                                 struct i915_address_space *vm,
                                 int (*fn)(void *),
                                 unsigned int flags)
{
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct drm_i915_gem_object *obj;
        struct task_struct *tsk = NULL;
        struct i915_request *rq;
        struct evict_vma arg;
        struct hang h;
        unsigned int pin_flags;
        int err;

        if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
                return 0;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we can recover an unbind stuck on a hanging request */

        err = hang_init(&h, gt);
        if (err) {
                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
                return err;
        }

        obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
        if (IS_ERR(obj)) {
                err = PTR_ERR(obj);
                pr_err("[%s] Create object failed: %d!\n", engine->name, err);
                goto fini;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
                if (err) {
                        pr_err("Invalid X-tiling settings; err:%d\n", err);
                        goto out_obj;
                }
        }

        arg.vma = i915_vma_instance(obj, vm, NULL);
        if (IS_ERR(arg.vma)) {
                err = PTR_ERR(arg.vma);
                pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
                goto out_obj;
        }

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
                goto out_obj;
        }

        pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

        if (flags & EXEC_OBJECT_NEEDS_FENCE)
                pin_flags |= PIN_MAPPABLE;

        err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
        if (err) {
                i915_request_add(rq);
                pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
                goto out_obj;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_vma_pin_fence(arg.vma);
                if (err) {
                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
                        i915_vma_unpin(arg.vma);
                        i915_request_add(rq);
                        goto out_obj;
                }
        }

        i915_vma_lock(arg.vma);
        err = i915_request_await_object(rq, arg.vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0) {
                err = i915_vma_move_to_active(arg.vma, rq, flags);
1508                 if (err)
1509                         pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1510         } else {
1511                 pr_err("[%s] Request await failed: %d!\n", engine->name, err);
1512         }
1513
1514         i915_vma_unlock(arg.vma);
1515
1516         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1517                 i915_vma_unpin_fence(arg.vma);
1518         i915_vma_unpin(arg.vma);
1519
1520         i915_request_get(rq);
1521         i915_request_add(rq);
1522         if (err)
1523                 goto out_rq;
1524
1525         if (!wait_until_running(&h, rq)) {
1526                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1527
1528                 pr_err("%s: Failed to start request %llx, at %x\n",
1529                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1530                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1531
1532                 intel_gt_set_wedged(gt);
1533                 goto out_reset;
1534         }
1535
1536         init_completion(&arg.completion);
1537
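        /* Run the blocker (evict_vma or evict_fence) from a kthread. */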
1538         tsk = kthread_run(fn, &arg, "igt/evict_vma");
1539         if (IS_ERR(tsk)) {
1540                 err = PTR_ERR(tsk);
1541                 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1542                 tsk = NULL;
1543                 goto out_reset;
1544         }
1545         get_task_struct(tsk);
1546
1547         wait_for_completion(&arg.completion);
1548
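        /*
         * The kthread should by now be blocked on our hanging request;
         * its wait attaches a callback to the request's fence, so an
         * empty cb_list means it never actually waited.
         */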
1549         if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1550                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1551
1552                 pr_err("igt/evict_vma kthread did not wait\n");
1553                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1554
1555                 intel_gt_set_wedged(gt);
1556                 goto out_reset;
1557         }
1558
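        /* Trigger the reset; this should unblock the stuck kthread. */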
1559 out_reset:
1560         igt_global_reset_lock(gt);
1561         fake_hangcheck(gt, rq->engine->mask);
1562         igt_global_reset_unlock(gt);
1563
1564         if (tsk) {
1565                 struct intel_wedge_me w;
1566
1567                 /* The reset, even indirectly, should take less than 10ms; allow a 10x margin. */
1568                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1569                         err = kthread_stop(tsk);
1570
1571                 put_task_struct(tsk);
1572         }
1573
1574 out_rq:
1575         i915_request_put(rq);
1576 out_obj:
1577         i915_gem_object_put(obj);
1578 fini:
1579         hang_fini(&h);
1580         if (intel_gt_is_wedged(gt))
1581                 return -EIO;
1582
1583         return err;
1584 }
1585
1586 static int igt_reset_evict_ggtt(void *arg)
1587 {
1588         struct intel_gt *gt = arg;
1589
1590         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1591                                      evict_vma, EXEC_OBJECT_WRITE);
1592 }
1593
1594 static int igt_reset_evict_ppgtt(void *arg)
1595 {
1596         struct intel_gt *gt = arg;
1597         struct i915_ppgtt *ppgtt;
1598         int err;
1599
1600         /* Aliasing ppgtt shares the global GTT's locking, covered by the ggtt test above */
1601         if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1602                 return 0;
1603
1604         ppgtt = i915_ppgtt_create(gt, 0);
1605         if (IS_ERR(ppgtt))
1606                 return PTR_ERR(ppgtt);
1607
1608         err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1609                                     evict_vma, EXEC_OBJECT_WRITE);
1610         i915_vm_put(&ppgtt->vm);
1611
1612         return err;
1613 }
1614
1615 static int igt_reset_evict_fence(void *arg)
1616 {
1617         struct intel_gt *gt = arg;
1618
1619         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1620                                      evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1621 }
1622
1623 static int wait_for_others(struct intel_gt *gt,
1624                            struct intel_engine_cs *exclude)
1625 {
1626         struct intel_engine_cs *engine;
1627         enum intel_engine_id id;
1628
1629         for_each_engine(engine, gt, id) {
1630                 if (engine == exclude)
1631                         continue;
1632
1633                 if (!wait_for_idle(engine))
1634                         return -EIO;
1635         }
1636
1637         return 0;
1638 }
1639
1640 static int igt_reset_queue(void *arg)
1641 {
1642         struct intel_gt *gt = arg;
1643         struct i915_gpu_error *global = &gt->i915->gpu_error;
1644         struct intel_engine_cs *engine;
1645         enum intel_engine_id id;
1646         struct hang h;
1647         int err;
1648
1649         /* Check that we replay pending requests following a hang */
1650
1651         igt_global_reset_lock(gt);
1652
1653         err = hang_init(&h, gt);
1654         if (err)
1655                 goto unlock;
1656
1657         for_each_engine(engine, gt, id) {
1658                 struct intel_selftest_saved_policy saved;
1659                 struct i915_request *prev;
1660                 IGT_TIMEOUT(end_time);
1661                 unsigned int count;
1662                 bool using_guc = intel_engine_uses_guc(engine);
1663
1664                 if (!intel_engine_can_store_dword(engine))
1665                         continue;
1666
1667                 if (using_guc) {
1668                         err = intel_selftest_modify_policy(engine, &saved,
1669                                                            SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1670                         if (err) {
1671                                 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1672                                 goto fini;
1673                         }
1674                 }
1675
1676                 prev = hang_create_request(&h, engine);
1677                 if (IS_ERR(prev)) {
1678                         err = PTR_ERR(prev);
1679                         pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1680                         goto restore;
1681                 }
1682
1683                 i915_request_get(prev);
1684                 i915_request_add(prev);
1685
1686                 count = 0;
1687                 do {
1688                         struct i915_request *rq;
1689                         unsigned int reset_count;
1690
1691                         rq = hang_create_request(&h, engine);
1692                         if (IS_ERR(rq)) {
1693                                 err = PTR_ERR(rq);
1694                                 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1695                                 goto restore;
1696                         }
1697
1698                         i915_request_get(rq);
1699                         i915_request_add(rq);
1700
1701                         /*
1702                          * XXX We don't handle resetting the kernel context
1703                          * very well. If we trigger a device reset twice in
1704                          * quick succession while the kernel context is
1705                          * executing, we may end up skipping the breadcrumb.
1706                          * This is really only a problem for the selftest as
1707                          * normally there is a large interlude between resets
1708                          * (hangcheck), or we focus on resetting just one
1709                          * engine and so avoid repeatedly resetting innocents.
1710                          */
1711                         err = wait_for_others(gt, engine);
1712                         if (err) {
1713                                 pr_err("%s(%s): Failed to idle other engines after device reset\n",
1714                                        __func__, engine->name);
1715                                 i915_request_put(rq);
1716                                 i915_request_put(prev);
1717
1718                                 GEM_TRACE_DUMP();
1719                                 intel_gt_set_wedged(gt);
1720                                 goto restore;
1721                         }
1722
1723                         if (!wait_until_running(&h, prev)) {
1724                                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1725
1726                                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1727                                        __func__, engine->name,
1728                                        prev->fence.seqno, hws_seqno(&h, prev));
1729                                 intel_engine_dump(engine, &p,
1730                                                   "%s\n", engine->name);
1731
1732                                 i915_request_put(rq);
1733                                 i915_request_put(prev);
1734
1735                                 intel_gt_set_wedged(gt);
1736
1737                                 err = -EIO;
1738                                 goto restore;
1739                         }
1740
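                        /*
                         * Reset just this engine: the hanging request
                         * (prev) must be marked guilty, while the queued
                         * innocent request (rq) should survive to be
                         * replayed.
                         */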
1741                         reset_count = fake_hangcheck(gt, BIT(id));
1742
1743                         if (prev->fence.error != -EIO) {
1744                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1745                                        prev->fence.error);
1746                                 i915_request_put(rq);
1747                                 i915_request_put(prev);
1748                                 err = -EINVAL;
1749                                 goto restore;
1750                         }
1751
1752                         if (rq->fence.error) {
1753                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1754                                        rq->fence.error);
1755                                 i915_request_put(rq);
1756                                 i915_request_put(prev);
1757                                 err = -EINVAL;
1758                                 goto restore;
1759                         }
1760
1761                         if (i915_reset_count(global) == reset_count) {
1762                                 pr_err("No GPU reset recorded!\n");
1763                                 i915_request_put(rq);
1764                                 i915_request_put(prev);
1765                                 err = -EINVAL;
1766                                 goto restore;
1767                         }
1768
1769                         i915_request_put(prev);
1770                         prev = rq;
1771                         count++;
1772                 } while (time_before(jiffies, end_time));
1773                 pr_info("%s: Completed %d queued resets\n",
1774                         engine->name, count);
1775
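                /* Terminate the hanging batch so the final request can retire. */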
1776                 *h.batch = MI_BATCH_BUFFER_END;
1777                 intel_gt_chipset_flush(engine->gt);
1778
1779                 i915_request_put(prev);
1780
1781 restore:
1782                 if (using_guc) {
1783                         int err2 = intel_selftest_restore_policy(engine, &saved);
1784
1785                         if (err2)
1786                                 pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1787                                        __func__, __LINE__, engine->name, err2);
1788                         if (err == 0)
1789                                 err = err2;
1790                 }
1791                 if (err)
1792                         goto fini;
1793
1794                 err = igt_flush_test(gt->i915);
1795                 if (err) {
1796                         pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1797                         break;
1798                 }
1799         }
1800
1801 fini:
1802         hang_fini(&h);
1803 unlock:
1804         igt_global_reset_unlock(gt);
1805
1806         if (intel_gt_is_wedged(gt))
1807                 return -EIO;
1808
1809         return err;
1810 }
1811
1812 static int igt_handle_error(void *arg)
1813 {
1814         struct intel_gt *gt = arg;
1815         struct i915_gpu_error *global = &gt->i915->gpu_error;
1816         struct intel_engine_cs *engine = gt->engine[RCS0];
1817         struct hang h;
1818         struct i915_request *rq;
1819         struct i915_gpu_coredump *error;
1820         int err;
1821
1822         /* Check that we can issue a global GPU and engine reset */
1823
1824         if (!intel_has_reset_engine(gt))
1825                 return 0;
1826
1827         if (!engine || !intel_engine_can_store_dword(engine))
1828                 return 0;
1829
1830         err = hang_init(&h, gt);
1831         if (err) {
1832                 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1833                 return err;
1834         }
1835
1836         rq = hang_create_request(&h, engine);
1837         if (IS_ERR(rq)) {
1838                 err = PTR_ERR(rq);
1839                 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1840                 goto err_fini;
1841         }
1842
1843         i915_request_get(rq);
1844         i915_request_add(rq);
1845
1846         if (!wait_until_running(&h, rq)) {
1847                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1848
1849                 pr_err("%s: Failed to start request %llx, at %x\n",
1850                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1851                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1852
1853                 intel_gt_set_wedged(gt);
1854
1855                 err = -EIO;
1856                 goto err_request;
1857         }
1858
1859         /* Temporarily disable error capture by parking a sentinel in first_error */
1860         error = xchg(&global->first_error, (void *)-1);
1861
1862         intel_gt_handle_error(gt, engine->mask, 0, NULL);
1863
1864         xchg(&global->first_error, error);
1865
1866         if (rq->fence.error != -EIO) {
1867                 pr_err("Guilty request not identified!\n");
1868                 err = -EINVAL;
1869                 goto err_request;
1870         }
1871
1872 err_request:
1873         i915_request_put(rq);
1874 err_fini:
1875         hang_fini(&h);
1876         return err;
1877 }
1878
1879 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1880                                      const struct igt_atomic_section *p,
1881                                      const char *mode)
1882 {
1883         struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1884         int err;
1885
1886         GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1887                   engine->name, mode, p->name);
1888
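        /*
         * Enter the atomic context described by @p: quiesce the
         * submission tasklet so it cannot run concurrently with the
         * reset, and mask softirqs unless the phase itself provides
         * softirq context.
         */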
1889         if (t->func)
1890                 tasklet_disable(t);
1891         if (strcmp(p->name, "softirq"))
1892                 local_bh_disable();
1893         p->critical_section_begin();
1894
1895         err = __intel_engine_reset_bh(engine, NULL);
1896
1897         p->critical_section_end();
1898         if (strcmp(p->name, "softirq"))
1899                 local_bh_enable();
1900         if (t->func) {
1901                 tasklet_enable(t);
1902                 tasklet_hi_schedule(t);
1903         }
1904
1905         if (err)
1906                 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1907                        engine->name, mode, p->name);
1908
1909         return err;
1910 }
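
#if 0
/*
 * Reference only (not compiled): the phase descriptor consumed above
 * comes from selftests/igt_atomic.h; this sketch assumes it has roughly
 * the following shape, each entry naming an atomic context and providing
 * an enter/exit pair that brackets the reset.
 */
struct igt_atomic_section {
        const char *name;
        void (*critical_section_begin)(void);
        void (*critical_section_end)(void);
};
#endif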
1911
1912 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1913                                    const struct igt_atomic_section *p)
1914 {
1915         struct i915_request *rq;
1916         struct hang h;
1917         int err;
1918
1919         err = __igt_atomic_reset_engine(engine, p, "idle");
1920         if (err)
1921                 return err;
1922
1923         err = hang_init(&h, engine->gt);
1924         if (err) {
1925                 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1926                 return err;
1927         }
1928
1929         rq = hang_create_request(&h, engine);
1930         if (IS_ERR(rq)) {
1931                 err = PTR_ERR(rq);
1932                 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1933                 goto out;
1934         }
1935
1936         i915_request_get(rq);
1937         i915_request_add(rq);
1938
1939         if (wait_until_running(&h, rq)) {
1940                 err = __igt_atomic_reset_engine(engine, p, "active");
1941         } else {
1942                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1943                        __func__, engine->name,
1944                        rq->fence.seqno, hws_seqno(&h, rq));
1945                 intel_gt_set_wedged(engine->gt);
1946                 err = -EIO;
1947         }
1948
1949         if (err == 0) {
1950                 struct intel_wedge_me w;
1951
1952                 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1953                         i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1954                 if (intel_gt_is_wedged(engine->gt))
1955                         err = -EIO;
1956         }
1957
1958         i915_request_put(rq);
1959 out:
1960         hang_fini(&h);
1961         return err;
1962 }
1963
1964 static int igt_reset_engines_atomic(void *arg)
1965 {
1966         struct intel_gt *gt = arg;
1967         const typeof(*igt_atomic_phases) *p;
1968         int err = 0;
1969
1970         /* Check that engine resets are usable from atomic context */
1971
1972         if (!intel_has_reset_engine(gt))
1973                 return 0;
1974
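        /* Engine resets under GuC submission go via the GuC and are not atomic-safe. */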
1975         if (intel_uc_uses_guc_submission(&gt->uc))
1976                 return 0;
1977
1978         igt_global_reset_lock(gt);
1979
1980         /* Flush any requests before we get started and check basics */
1981         if (!igt_force_reset(gt))
1982                 goto unlock;
1983
1984         for (p = igt_atomic_phases; p->name; p++) {
1985                 struct intel_engine_cs *engine;
1986                 enum intel_engine_id id;
1987
1988                 for_each_engine(engine, gt, id) {
1989                         err = igt_atomic_reset_engine(engine, p);
1990                         if (err)
1991                                 goto out;
1992                 }
1993         }
1994
1995 out:
1996         /* As we poke around the guts, do a full reset before continuing. */
1997         igt_force_reset(gt);
1998 unlock:
1999         igt_global_reset_unlock(gt);
2000
2001         return err;
2002 }
2003
2004 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2005 {
2006         static const struct i915_subtest tests[] = {
2007                 SUBTEST(igt_hang_sanitycheck),
2008                 SUBTEST(igt_reset_nop),
2009                 SUBTEST(igt_reset_nop_engine),
2010                 SUBTEST(igt_reset_idle_engine),
2011                 SUBTEST(igt_reset_active_engine),
2012                 SUBTEST(igt_reset_fail_engine),
2013                 SUBTEST(igt_reset_engines),
2014                 SUBTEST(igt_reset_engines_atomic),
2015                 SUBTEST(igt_reset_queue),
2016                 SUBTEST(igt_reset_wait),
2017                 SUBTEST(igt_reset_evict_ggtt),
2018                 SUBTEST(igt_reset_evict_ppgtt),
2019                 SUBTEST(igt_reset_evict_fence),
2020                 SUBTEST(igt_handle_error),
2021         };
2022         struct intel_gt *gt = to_gt(i915);
2023         intel_wakeref_t wakeref;
2024         int err;
2025
2026         if (!intel_has_gpu_reset(gt))
2027                 return 0;
2028
2029         if (intel_gt_is_wedged(gt))
2030                 return -EIO; /* we're long past hope of a successful reset */
2031
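        /* Hold the device awake across all of the reset subtests. */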
2032         wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2033
2034         err = intel_gt_live_subtests(tests, gt);
2035
2036         intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2037
2038         return err;
2039 }