// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
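
/*
 * Test fixture: a shared "hardware status" page (hws) into which each
 * hanging batch reports its seqno, and a batch object (obj) containing
 * an infinite loop, used to emulate a GPU hang on demand.
 */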
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}
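
/*
 * Each fence context is given its own dword slot in the HWS page, so
 * spinners from different contexts can be tracked independently.
 */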
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}
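
/*
 * The hanging batch first writes its seqno to the HWS page (so we can
 * tell it has started running) and then spins forever by jumping back
 * to its own start with MI_BATCH_BUFFER_START; the trailing
 * MI_BATCH_BUFFER_END is only reached if a test rewrites the loop.
 */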
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}
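
/*
 * First poll quickly (10us) for the spinner's seqno write, then fall
 * back to a slow 1s wait; returns true once the request is on the GPU.
 */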
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}
static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}
static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}
static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}
static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}
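
/*
 * Reset-failure injection: the selftest-only reset_timeout knobs force
 * the next engine reset(s) to time out, so we can exercise recovery
 * from a failed reset.
 */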
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}
static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			/* Alternate between unforced and forced-timeout resets */
			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (using_guc && !active)
			continue;

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		count = 0;
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (active) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("intel_engine_reset(%s) failed, err:%d\n",
					       engine->name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq)
				i915_request_put(rq);

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}
static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}
struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)
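
/*
 * The TEST_* flags choose which engines carry background load around a
 * reset: the target engine itself (TEST_ACTIVE/TEST_SELF), the other
 * engines (TEST_OTHERS), optionally at random priorities (TEST_PRIORITY).
 */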
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}
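
/*
 * Background-load kthread: keeps a ring of eight requests in flight on
 * its engine until told to stop, so that resets elsewhere race against
 * live submission.
 */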
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
			while (--count)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err__)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

		/* Keep the first error */
		if (err__ && !err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}
static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable_no_pm(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}
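
/*
 * Stand-in for hang detection: trigger the reset directly and return
 * the prior global reset count so callers can check that it advanced.
 */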
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}
static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};
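
/*
 * The evict workers deliberately block on a node still busy with the
 * hanging request; only a successful reset can release them, which is
 * precisely what the evict tests verify.
 */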
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}
static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0) {
		err = i915_vma_move_to_active(arg.vma, rq, flags);
		if (err)
			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
	} else {
		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
	}

	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}
static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);

				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);

				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);

				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}
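
/*
 * Each igt_atomic_section brackets the engine reset with a critical
 * section (e.g. irqs or preemption disabled) to prove the reset path
 * is safe to invoke from atomic context.
 */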
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}
static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}