drivers/gpu/drm/i915/gt/intel_lrc.c

   1 // SPDX-License-Identifier: MIT
   2 /*
   3  * Copyright © 2014 Intel Corporation
   4  */
   5
   6 #include "gem/i915_gem_lmem.h"
   7
   8 #include "gen8_engine_cs.h"
   9 #include "i915_drv.h"
  10 #include "i915_perf.h"
  11 #include "intel_engine.h"
  12 #include "intel_engine_regs.h"
  13 #include "intel_gpu_commands.h"
  14 #include "intel_gt.h"
  15 #include "intel_lrc.h"
  16 #include "intel_lrc_reg.h"
  17 #include "intel_ring.h"
  18 #include "shmem_utils.h"
  19
  20 static void set_offsets(u32 *regs,
  21                         const u8 *data,
  22                         const struct intel_engine_cs *engine,
  23                         bool close)
  24 #define NOP(x) (BIT(7) | (x))
  25 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
  26 #define POSTED BIT(0)
  27 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
  28 #define REG16(x) \
  29         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
  30         (((x) >> 2) & 0x7f)
  31 #define END 0
  32 {
  33         const u32 base = engine->mmio_base;
  34
  35         while (*data) {
  36                 u8 count, flags;
  37
  38                 if (*data & BIT(7)) { /* skip */
  39                         count = *data++ & ~BIT(7);
  40                         regs += count;
  41                         continue;
  42                 }
  43
  44                 count = *data & 0x3f;
  45                 flags = *data >> 6;
  46                 data++;
  47
  48                 *regs = MI_LOAD_REGISTER_IMM(count);
  49                 if (flags & POSTED)
  50                         *regs |= MI_LRI_FORCE_POSTED;
  51                 if (GRAPHICS_VER(engine->i915) >= 11)
  52                         *regs |= MI_LRI_LRM_CS_MMIO;
  53                 regs++;
  54
  55                 GEM_BUG_ON(!count);
  56                 do {
  57                         u32 offset = 0;
  58                         u8 v;
  59
  60                         do {
  61                                 v = *data++;
  62                                 offset <<= 7;
  63                                 offset |= v & ~BIT(7);
  64                         } while (v & BIT(7));
  65
  66                         regs[0] = base + (offset << 2);
  67                         regs += 2;
  68                 } while (--count);
  69         }
  70
  71         if (close) {
  72                 /* Close the batch; used mainly by live_lrc_layout() */
  73                 *regs = MI_BATCH_BUFFER_END;
  74                 if (GRAPHICS_VER(engine->i915) >= 11)
  75                         *regs |= BIT(0);
  76         }
  77 }
  78
  79 static const u8 gen8_xcs_offsets[] = {
  80         NOP(1),
  81         LRI(11, 0),
  82         REG16(0x244),
  83         REG(0x034),
  84         REG(0x030),
  85         REG(0x038),
  86         REG(0x03c),
  87         REG(0x168),
  88         REG(0x140),
  89         REG(0x110),
  90         REG(0x11c),
  91         REG(0x114),
  92         REG(0x118),
  93
  94         NOP(9),
  95         LRI(9, 0),
  96         REG16(0x3a8),
  97         REG16(0x28c),
  98         REG16(0x288),
  99         REG16(0x284),
 100         REG16(0x280),
 101         REG16(0x27c),
 102         REG16(0x278),
 103         REG16(0x274),
 104         REG16(0x270),
 105
 106         NOP(13),
 107         LRI(2, 0),
 108         REG16(0x200),
 109         REG(0x028),
 110
 111         END
 112 };
 113
 114 static const u8 gen9_xcs_offsets[] = {
 115         NOP(1),
 116         LRI(14, POSTED),
 117         REG16(0x244),
 118         REG(0x034),
 119         REG(0x030),
 120         REG(0x038),
 121         REG(0x03c),
 122         REG(0x168),
 123         REG(0x140),
 124         REG(0x110),
 125         REG(0x11c),
 126         REG(0x114),
 127         REG(0x118),
 128         REG(0x1c0),
 129         REG(0x1c4),
 130         REG(0x1c8),
 131
 132         NOP(3),
 133         LRI(9, POSTED),
 134         REG16(0x3a8),
 135         REG16(0x28c),
 136         REG16(0x288),
 137         REG16(0x284),
 138         REG16(0x280),
 139         REG16(0x27c),
 140         REG16(0x278),
 141         REG16(0x274),
 142         REG16(0x270),
 143
 144         NOP(13),
 145         LRI(1, POSTED),
 146         REG16(0x200),
 147
 148         NOP(13),
 149         LRI(44, POSTED),
 150         REG(0x028),
 151         REG(0x09c),
 152         REG(0x0c0),
 153         REG(0x178),
 154         REG(0x17c),
 155         REG16(0x358),
 156         REG(0x170),
 157         REG(0x150),
 158         REG(0x154),
 159         REG(0x158),
 160         REG16(0x41c),
 161         REG16(0x600),
 162         REG16(0x604),
 163         REG16(0x608),
 164         REG16(0x60c),
 165         REG16(0x610),
 166         REG16(0x614),
 167         REG16(0x618),
 168         REG16(0x61c),
 169         REG16(0x620),
 170         REG16(0x624),
 171         REG16(0x628),
 172         REG16(0x62c),
 173         REG16(0x630),
 174         REG16(0x634),
 175         REG16(0x638),
 176         REG16(0x63c),
 177         REG16(0x640),
 178         REG16(0x644),
 179         REG16(0x648),
 180         REG16(0x64c),
 181         REG16(0x650),
 182         REG16(0x654),
 183         REG16(0x658),
 184         REG16(0x65c),
 185         REG16(0x660),
 186         REG16(0x664),
 187         REG16(0x668),
 188         REG16(0x66c),
 189         REG16(0x670),
 190         REG16(0x674),
 191         REG16(0x678),
 192         REG16(0x67c),
 193         REG(0x068),
 194
 195         END
 196 };
 197
 198 static const u8 gen12_xcs_offsets[] = {
 199         NOP(1),
 200         LRI(13, POSTED),
 201         REG16(0x244),
 202         REG(0x034),
 203         REG(0x030),
 204         REG(0x038),
 205         REG(0x03c),
 206         REG(0x168),
 207         REG(0x140),
 208         REG(0x110),
 209         REG(0x1c0),
 210         REG(0x1c4),
 211         REG(0x1c8),
 212         REG(0x180),
 213         REG16(0x2b4),
 214
 215         NOP(5),
 216         LRI(9, POSTED),
 217         REG16(0x3a8),
 218         REG16(0x28c),
 219         REG16(0x288),
 220         REG16(0x284),
 221         REG16(0x280),
 222         REG16(0x27c),
 223         REG16(0x278),
 224         REG16(0x274),
 225         REG16(0x270),
 226
 227         END
 228 };
 229
 230 static const u8 dg2_xcs_offsets[] = {
 231         NOP(1),
 232         LRI(15, POSTED),
 233         REG16(0x244),
 234         REG(0x034),
 235         REG(0x030),
 236         REG(0x038),
 237         REG(0x03c),
 238         REG(0x168),
 239         REG(0x140),
 240         REG(0x110),
 241         REG(0x1c0),
 242         REG(0x1c4),
 243         REG(0x1c8),
 244         REG(0x180),
 245         REG16(0x2b4),
 246         REG(0x120),
 247         REG(0x124),
 248
 249         NOP(1),
 250         LRI(9, POSTED),
 251         REG16(0x3a8),
 252         REG16(0x28c),
 253         REG16(0x288),
 254         REG16(0x284),
 255         REG16(0x280),
 256         REG16(0x27c),
 257         REG16(0x278),
 258         REG16(0x274),
 259         REG16(0x270),
 260
 261         END
 262 };
 263
 264 static const u8 gen8_rcs_offsets[] = {
 265         NOP(1),
 266         LRI(14, POSTED),
 267         REG16(0x244),
 268         REG(0x034),
 269         REG(0x030),
 270         REG(0x038),
 271         REG(0x03c),
 272         REG(0x168),
 273         REG(0x140),
 274         REG(0x110),
 275         REG(0x11c),
 276         REG(0x114),
 277         REG(0x118),
 278         REG(0x1c0),
 279         REG(0x1c4),
 280         REG(0x1c8),
 281
 282         NOP(3),
 283         LRI(9, POSTED),
 284         REG16(0x3a8),
 285         REG16(0x28c),
 286         REG16(0x288),
 287         REG16(0x284),
 288         REG16(0x280),
 289         REG16(0x27c),
 290         REG16(0x278),
 291         REG16(0x274),
 292         REG16(0x270),
 293
 294         NOP(13),
 295         LRI(1, 0),
 296         REG(0x0c8),
 297
 298         END
 299 };
 300
 301 static const u8 gen9_rcs_offsets[] = {
 302         NOP(1),
 303         LRI(14, POSTED),
 304         REG16(0x244),
 305         REG(0x34),
 306         REG(0x30),
 307         REG(0x38),
 308         REG(0x3c),
 309         REG(0x168),
 310         REG(0x140),
 311         REG(0x110),
 312         REG(0x11c),
 313         REG(0x114),
 314         REG(0x118),
 315         REG(0x1c0),
 316         REG(0x1c4),
 317         REG(0x1c8),
 318
 319         NOP(3),
 320         LRI(9, POSTED),
 321         REG16(0x3a8),
 322         REG16(0x28c),
 323         REG16(0x288),
 324         REG16(0x284),
 325         REG16(0x280),
 326         REG16(0x27c),
 327         REG16(0x278),
 328         REG16(0x274),
 329         REG16(0x270),
 330
 331         NOP(13),
 332         LRI(1, 0),
 333         REG(0xc8),
 334
 335         NOP(13),
 336         LRI(44, POSTED),
 337         REG(0x28),
 338         REG(0x9c),
 339         REG(0xc0),
 340         REG(0x178),
 341         REG(0x17c),
 342         REG16(0x358),
 343         REG(0x170),
 344         REG(0x150),
 345         REG(0x154),
 346         REG(0x158),
 347         REG16(0x41c),
 348         REG16(0x600),
 349         REG16(0x604),
 350         REG16(0x608),
 351         REG16(0x60c),
 352         REG16(0x610),
 353         REG16(0x614),
 354         REG16(0x618),
 355         REG16(0x61c),
 356         REG16(0x620),
 357         REG16(0x624),
 358         REG16(0x628),
 359         REG16(0x62c),
 360         REG16(0x630),
 361         REG16(0x634),
 362         REG16(0x638),
 363         REG16(0x63c),
 364         REG16(0x640),
 365         REG16(0x644),
 366         REG16(0x648),
 367         REG16(0x64c),
 368         REG16(0x650),
 369         REG16(0x654),
 370         REG16(0x658),
 371         REG16(0x65c),
 372         REG16(0x660),
 373         REG16(0x664),
 374         REG16(0x668),
 375         REG16(0x66c),
 376         REG16(0x670),
 377         REG16(0x674),
 378         REG16(0x678),
 379         REG16(0x67c),
 380         REG(0x68),
 381
 382         END
 383 };
 384
 385 static const u8 gen11_rcs_offsets[] = {
 386         NOP(1),
 387         LRI(15, POSTED),
 388         REG16(0x244),
 389         REG(0x034),
 390         REG(0x030),
 391         REG(0x038),
 392         REG(0x03c),
 393         REG(0x168),
 394         REG(0x140),
 395         REG(0x110),
 396         REG(0x11c),
 397         REG(0x114),
 398         REG(0x118),
 399         REG(0x1c0),
 400         REG(0x1c4),
 401         REG(0x1c8),
 402         REG(0x180),
 403
 404         NOP(1),
 405         LRI(9, POSTED),
 406         REG16(0x3a8),
 407         REG16(0x28c),
 408         REG16(0x288),
 409         REG16(0x284),
 410         REG16(0x280),
 411         REG16(0x27c),
 412         REG16(0x278),
 413         REG16(0x274),
 414         REG16(0x270),
 415
 416         LRI(1, POSTED),
 417         REG(0x1b0),
 418
 419         NOP(10),
 420         LRI(1, 0),
 421         REG(0x0c8),
 422
 423         END
 424 };
 425
 426 static const u8 gen12_rcs_offsets[] = {
 427         NOP(1),
 428         LRI(13, POSTED),
 429         REG16(0x244),
 430         REG(0x034),
 431         REG(0x030),
 432         REG(0x038),
 433         REG(0x03c),
 434         REG(0x168),
 435         REG(0x140),
 436         REG(0x110),
 437         REG(0x1c0),
 438         REG(0x1c4),
 439         REG(0x1c8),
 440         REG(0x180),
 441         REG16(0x2b4),
 442
 443         NOP(5),
 444         LRI(9, POSTED),
 445         REG16(0x3a8),
 446         REG16(0x28c),
 447         REG16(0x288),
 448         REG16(0x284),
 449         REG16(0x280),
 450         REG16(0x27c),
 451         REG16(0x278),
 452         REG16(0x274),
 453         REG16(0x270),
 454
 455         LRI(3, POSTED),
 456         REG(0x1b0),
 457         REG16(0x5a8),
 458         REG16(0x5ac),
 459
 460         NOP(6),
 461         LRI(1, 0),
 462         REG(0x0c8),
 463         NOP(3 + 9 + 1),
 464
 465         LRI(51, POSTED),
 466         REG16(0x588),
 467         REG16(0x588),
 468         REG16(0x588),
 469         REG16(0x588),
 470         REG16(0x588),
 471         REG16(0x588),
 472         REG(0x028),
 473         REG(0x09c),
 474         REG(0x0c0),
 475         REG(0x178),
 476         REG(0x17c),
 477         REG16(0x358),
 478         REG(0x170),
 479         REG(0x150),
 480         REG(0x154),
 481         REG(0x158),
 482         REG16(0x41c),
 483         REG16(0x600),
 484         REG16(0x604),
 485         REG16(0x608),
 486         REG16(0x60c),
 487         REG16(0x610),
 488         REG16(0x614),
 489         REG16(0x618),
 490         REG16(0x61c),
 491         REG16(0x620),
 492         REG16(0x624),
 493         REG16(0x628),
 494         REG16(0x62c),
 495         REG16(0x630),
 496         REG16(0x634),
 497         REG16(0x638),
 498         REG16(0x63c),
 499         REG16(0x640),
 500         REG16(0x644),
 501         REG16(0x648),
 502         REG16(0x64c),
 503         REG16(0x650),
 504         REG16(0x654),
 505         REG16(0x658),
 506         REG16(0x65c),
 507         REG16(0x660),
 508         REG16(0x664),
 509         REG16(0x668),
 510         REG16(0x66c),
 511         REG16(0x670),
 512         REG16(0x674),
 513         REG16(0x678),
 514         REG16(0x67c),
 515         REG(0x068),
 516         REG(0x084),
 517         NOP(1),
 518
 519         END
 520 };
 521
 522 static const u8 xehp_rcs_offsets[] = {
 523         NOP(1),
 524         LRI(13, POSTED),
 525         REG16(0x244),
 526         REG(0x034),
 527         REG(0x030),
 528         REG(0x038),
 529         REG(0x03c),
 530         REG(0x168),
 531         REG(0x140),
 532         REG(0x110),
 533         REG(0x1c0),
 534         REG(0x1c4),
 535         REG(0x1c8),
 536         REG(0x180),
 537         REG16(0x2b4),
 538
 539         NOP(5),
 540         LRI(9, POSTED),
 541         REG16(0x3a8),
 542         REG16(0x28c),
 543         REG16(0x288),
 544         REG16(0x284),
 545         REG16(0x280),
 546         REG16(0x27c),
 547         REG16(0x278),
 548         REG16(0x274),
 549         REG16(0x270),
 550
 551         LRI(3, POSTED),
 552         REG(0x1b0),
 553         REG16(0x5a8),
 554         REG16(0x5ac),
 555
 556         NOP(6),
 557         LRI(1, 0),
 558         REG(0x0c8),
 559
 560         END
 561 };
 562
 563 static const u8 dg2_rcs_offsets[] = {
 564         NOP(1),
 565         LRI(15, POSTED),
 566         REG16(0x244),
 567         REG(0x034),
 568         REG(0x030),
 569         REG(0x038),
 570         REG(0x03c),
 571         REG(0x168),
 572         REG(0x140),
 573         REG(0x110),
 574         REG(0x1c0),
 575         REG(0x1c4),
 576         REG(0x1c8),
 577         REG(0x180),
 578         REG16(0x2b4),
 579         REG(0x120),
 580         REG(0x124),
 581
 582         NOP(1),
 583         LRI(9, POSTED),
 584         REG16(0x3a8),
 585         REG16(0x28c),
 586         REG16(0x288),
 587         REG16(0x284),
 588         REG16(0x280),
 589         REG16(0x27c),
 590         REG16(0x278),
 591         REG16(0x274),
 592         REG16(0x270),
 593
 594         LRI(3, POSTED),
 595         REG(0x1b0),
 596         REG16(0x5a8),
 597         REG16(0x5ac),
 598
 599         NOP(6),
 600         LRI(1, 0),
 601         REG(0x0c8),
 602
 603         END
 604 };
 605
 606 #undef END
 607 #undef REG16
 608 #undef REG
 609 #undef LRI
 610 #undef NOP
 611
 612 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 613 {
 614         /*
 615          * The gen12+ lists only have the registers we program in the basic
 616          * default state. We rely on the context image using relative
 617          * addressing to automatic fixup the register state between the
 618          * physical engines for virtual engine.
 619          */
 620         GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
 621                    !intel_engine_has_relative_mmio(engine));
 622
 623         if (engine->class == RENDER_CLASS) {
 624                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 625                         return dg2_rcs_offsets;
 626                 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 627                         return xehp_rcs_offsets;
 628                 else if (GRAPHICS_VER(engine->i915) >= 12)
 629                         return gen12_rcs_offsets;
 630                 else if (GRAPHICS_VER(engine->i915) >= 11)
 631                         return gen11_rcs_offsets;
 632                 else if (GRAPHICS_VER(engine->i915) >= 9)
 633                         return gen9_rcs_offsets;
 634                 else
 635                         return gen8_rcs_offsets;
 636         } else {
 637                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 638                         return dg2_xcs_offsets;
 639                 else if (GRAPHICS_VER(engine->i915) >= 12)
 640                         return gen12_xcs_offsets;
 641                 else if (GRAPHICS_VER(engine->i915) >= 9)
 642                         return gen9_xcs_offsets;
 643                 else
 644                         return gen8_xcs_offsets;
 645         }
 646 }
 647
 648 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 649 {
 650         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 651                 return 0x70;
 652         else if (GRAPHICS_VER(engine->i915) >= 12)
 653                 return 0x60;
 654         else if (GRAPHICS_VER(engine->i915) >= 9)
 655                 return 0x54;
 656         else if (engine->class == RENDER_CLASS)
 657                 return 0x58;
 658         else
 659                 return -1;
 660 }
 661
 662 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 663 {
 664         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 665                 return 0x84;
 666         else if (GRAPHICS_VER(engine->i915) >= 12)
 667                 return 0x74;
 668         else if (GRAPHICS_VER(engine->i915) >= 9)
 669                 return 0x68;
 670         else if (engine->class == RENDER_CLASS)
 671                 return 0xd8;
 672         else
 673                 return -1;
 674 }
 675
 676 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 677 {
 678         if (GRAPHICS_VER(engine->i915) >= 12)
 679                 return 0x12;
 680         else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 681                 return 0x18;
 682         else
 683                 return -1;
 684 }
 685
 686 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 687 {
 688         int x;
 689
 690         x = lrc_ring_wa_bb_per_ctx(engine);
 691         if (x < 0)
 692                 return x;
 693
 694         return x + 2;
 695 }
 696
 697 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 698 {
 699         int x;
 700
 701         x = lrc_ring_indirect_ptr(engine);
 702         if (x < 0)
 703                 return x;
 704
 705         return x + 2;
 706 }
 707
 708 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 709 {
 710
 711         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 712                 /*
 713                  * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
 714                  * simply to match the RCS context image layout.
 715                  */
 716                 return 0xc6;
 717         else if (engine->class != RENDER_CLASS)
 718                 return -1;
 719         else if (GRAPHICS_VER(engine->i915) >= 12)
 720                 return 0xb6;
 721         else if (GRAPHICS_VER(engine->i915) >= 11)
 722                 return 0xaa;
 723         else
 724                 return -1;
 725 }
 726
 727 static u32
 728 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 729 {
 730         switch (GRAPHICS_VER(engine->i915)) {
 731         default:
 732                 MISSING_CASE(GRAPHICS_VER(engine->i915));
 733                 fallthrough;
 734         case 12:
 735                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 736         case 11:
 737                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 738         case 9:
 739                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 740         case 8:
 741                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 742         }
 743 }
 744
 745 static void
 746 lrc_setup_indirect_ctx(u32 *regs,
 747                        const struct intel_engine_cs *engine,
 748                        u32 ctx_bb_ggtt_addr,
 749                        u32 size)
 750 {
 751         GEM_BUG_ON(!size);
 752         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 753         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 754         regs[lrc_ring_indirect_ptr(engine) + 1] =
 755                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 756
 757         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 758         regs[lrc_ring_indirect_offset(engine) + 1] =
 759                 lrc_ring_indirect_offset_default(engine) << 6;
 760 }
 761
 762 static void init_common_regs(u32 * const regs,
 763                              const struct intel_context *ce,
 764                              const struct intel_engine_cs *engine,
 765                              bool inhibit)
 766 {
 767         u32 ctl;
 768
 769         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
 770         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 771         if (inhibit)
 772                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
 773         if (GRAPHICS_VER(engine->i915) < 11)
 774                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
 775                                            CTX_CTRL_RS_CTX_ENABLE);
 776         regs[CTX_CONTEXT_CONTROL] = ctl;
 777
 778         regs[CTX_TIMESTAMP] = ce->runtime.last;
 779 }
 780
 781 static void init_wa_bb_regs(u32 * const regs,
 782                             const struct intel_engine_cs *engine)
 783 {
 784         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 785
 786         if (wa_ctx->per_ctx.size) {
 787                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 788
 789                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
 790                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
 791                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
 792         }
 793
 794         if (wa_ctx->indirect_ctx.size) {
 795                 lrc_setup_indirect_ctx(regs, engine,
 796                                        i915_ggtt_offset(wa_ctx->vma) +
 797                                        wa_ctx->indirect_ctx.offset,
 798                                        wa_ctx->indirect_ctx.size);
 799         }
 800 }
 801
 802 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
 803 {
 804         if (i915_vm_is_4lvl(&ppgtt->vm)) {
 805                 /* 64b PPGTT (48bit canonical)
 806                  * PDP0_DESCRIPTOR contains the base address to PML4 and
 807                  * other PDP Descriptors are ignored.
 808                  */
 809                 ASSIGN_CTX_PML4(ppgtt, regs);
 810         } else {
 811                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
 812                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
 813                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
 814                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
 815         }
 816 }
 817
 818 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
 819 {
 820         if (i915_is_ggtt(vm))
 821                 return i915_vm_to_ggtt(vm)->alias;
 822         else
 823                 return i915_vm_to_ppgtt(vm);
 824 }
 825
 826 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 827 {
 828         int x;
 829
 830         x = lrc_ring_mi_mode(engine);
 831         if (x != -1) {
 832                 regs[x + 1] &= ~STOP_RING;
 833                 regs[x + 1] |= STOP_RING << 16;
 834         }
 835 }
 836
 837 static void __lrc_init_regs(u32 *regs,
 838                             const struct intel_context *ce,
 839                             const struct intel_engine_cs *engine,
 840                             bool inhibit)
 841 {
 842         /*
 843          * A context is actually a big batch buffer with several
 844          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
 845          * values we are setting here are only for the first context restore:
 846          * on a subsequent save, the GPU will recreate this batchbuffer with new
 847          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
 848          * we are not initializing here).
 849          *
 850          * Must keep consistent with virtual_update_register_offsets().
 851          */
 852
 853         if (inhibit)
 854                 memset(regs, 0, PAGE_SIZE);
 855
 856         set_offsets(regs, reg_offsets(engine), engine, inhibit);
 857
 858         init_common_regs(regs, ce, engine, inhibit);
 859         init_ppgtt_regs(regs, vm_alias(ce->vm));
 860
 861         init_wa_bb_regs(regs, engine);
 862
 863         __reset_stop_ring(regs, engine);
 864 }
 865
 866 void lrc_init_regs(const struct intel_context *ce,
 867                    const struct intel_engine_cs *engine,
 868                    bool inhibit)
 869 {
 870         __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
 871 }
 872
 873 void lrc_reset_regs(const struct intel_context *ce,
 874                     const struct intel_engine_cs *engine)
 875 {
 876         __reset_stop_ring(ce->lrc_reg_state, engine);
 877 }
 878
 879 static void
 880 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
 881 {
 882         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 883                 return;
 884
 885         vaddr += engine->context_size;
 886
 887         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
 888 }
 889
 890 static void
 891 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 892 {
 893         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 894                 return;
 895
 896         vaddr += engine->context_size;
 897
 898         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
 899                 drm_err_once(&engine->i915->drm,
 900                              "%s context redzone overwritten!\n",
 901                              engine->name);
 902 }
 903
 904 void lrc_init_state(struct intel_context *ce,
 905                     struct intel_engine_cs *engine,
 906                     void *state)
 907 {
 908         bool inhibit = true;
 909
 910         set_redzone(state, engine);
 911
 912         if (engine->default_state) {
 913                 shmem_read(engine->default_state, 0,
 914                            state, engine->context_size);
 915                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
 916                 inhibit = false;
 917         }
 918
 919         /* Clear the ppHWSP (inc. per-context counters) */
 920         memset(state, 0, PAGE_SIZE);
 921
 922         /*
 923          * The second page of the context object contains some registers which
 924          * must be set up prior to the first execution.
 925          */
 926         __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
 927 }
 928
 929 static struct i915_vma *
 930 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
 931 {
 932         struct drm_i915_gem_object *obj;
 933         struct i915_vma *vma;
 934         u32 context_size;
 935
 936         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
 937
 938         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 939                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 940
 941         if (GRAPHICS_VER(engine->i915) == 12) {
 942                 ce->wa_bb_page = context_size / PAGE_SIZE;
 943                 context_size += PAGE_SIZE;
 944         }
 945
 946         if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
 947                 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
 948                 context_size += PARENT_SCRATCH_SIZE;
 949         }
 950
 951         obj = i915_gem_object_create_lmem(engine->i915, context_size,
 952                                           I915_BO_ALLOC_PM_VOLATILE);
 953         if (IS_ERR(obj))
 954                 obj = i915_gem_object_create_shmem(engine->i915, context_size);
 955         if (IS_ERR(obj))
 956                 return ERR_CAST(obj);
 957
 958         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
 959         if (IS_ERR(vma)) {
 960                 i915_gem_object_put(obj);
 961                 return vma;
 962         }
 963
 964         return vma;
 965 }
 966
 967 static struct intel_timeline *
 968 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
 969 {
 970         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
 971
 972         return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
 973 }
 974
 975 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
 976 {
 977         struct intel_ring *ring;
 978         struct i915_vma *vma;
 979         int err;
 980
 981         GEM_BUG_ON(ce->state);
 982
 983         vma = __lrc_alloc_state(ce, engine);
 984         if (IS_ERR(vma))
 985                 return PTR_ERR(vma);
 986
 987         ring = intel_engine_create_ring(engine, ce->ring_size);
 988         if (IS_ERR(ring)) {
 989                 err = PTR_ERR(ring);
 990                 goto err_vma;
 991         }
 992
 993         if (!page_mask_bits(ce->timeline)) {
 994                 struct intel_timeline *tl;
 995
 996                 /*
 997                  * Use the static global HWSP for the kernel context, and
 998                  * a dynamically allocated cacheline for everyone else.
 999                  */
1000                 if (unlikely(ce->timeline))
1001                         tl = pinned_timeline(ce, engine);
1002                 else
1003                         tl = intel_timeline_create(engine->gt);
1004                 if (IS_ERR(tl)) {
1005                         err = PTR_ERR(tl);
1006                         goto err_ring;
1007                 }
1008
1009                 ce->timeline = tl;
1010         }
1011
1012         ce->ring = ring;
1013         ce->state = vma;
1014
1015         return 0;
1016
1017 err_ring:
1018         intel_ring_put(ring);
1019 err_vma:
1020         i915_vma_put(vma);
1021         return err;
1022 }
1023
1024 void lrc_reset(struct intel_context *ce)
1025 {
1026         GEM_BUG_ON(!intel_context_is_pinned(ce));
1027
1028         intel_ring_reset(ce->ring, ce->ring->emit);
1029
1030         /* Scrub away the garbage */
1031         lrc_init_regs(ce, ce->engine, true);
1032         ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1033 }
1034
1035 int
1036 lrc_pre_pin(struct intel_context *ce,
1037             struct intel_engine_cs *engine,
1038             struct i915_gem_ww_ctx *ww,
1039             void **vaddr)
1040 {
1041         GEM_BUG_ON(!ce->state);
1042         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1043
1044         *vaddr = i915_gem_object_pin_map(ce->state->obj,
1045                                          i915_coherent_map_type(ce->engine->i915,
1046                                                                 ce->state->obj,
1047                                                                 false) |
1048                                          I915_MAP_OVERRIDE);
1049
1050         return PTR_ERR_OR_ZERO(*vaddr);
1051 }
1052
1053 int
1054 lrc_pin(struct intel_context *ce,
1055         struct intel_engine_cs *engine,
1056         void *vaddr)
1057 {
1058         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1059
1060         if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1061                 lrc_init_state(ce, engine, vaddr);
1062
1063         ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1064         return 0;
1065 }
1066
1067 void lrc_unpin(struct intel_context *ce)
1068 {
1069         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1070                       ce->engine);
1071 }
1072
1073 void lrc_post_unpin(struct intel_context *ce)
1074 {
1075         i915_gem_object_unpin_map(ce->state->obj);
1076 }
1077
1078 void lrc_fini(struct intel_context *ce)
1079 {
1080         if (!ce->state)
1081                 return;
1082
1083         intel_ring_put(fetch_and_zero(&ce->ring));
1084         i915_vma_put(fetch_and_zero(&ce->state));
1085 }
1086
1087 void lrc_destroy(struct kref *kref)
1088 {
1089         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1090
1091         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1092         GEM_BUG_ON(intel_context_is_pinned(ce));
1093
1094         lrc_fini(ce);
1095
1096         intel_context_fini(ce);
1097         intel_context_free(ce);
1098 }
1099
1100 static u32 *
1101 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1102 {
1103         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1104                 MI_SRM_LRM_GLOBAL_GTT |
1105                 MI_LRI_LRM_CS_MMIO;
1106         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1107         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1108                 CTX_TIMESTAMP * sizeof(u32);
1109         *cs++ = 0;
1110
1111         *cs++ = MI_LOAD_REGISTER_REG |
1112                 MI_LRR_SOURCE_CS_MMIO |
1113                 MI_LRI_LRM_CS_MMIO;
1114         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1115         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1116
1117         *cs++ = MI_LOAD_REGISTER_REG |
1118                 MI_LRR_SOURCE_CS_MMIO |
1119                 MI_LRI_LRM_CS_MMIO;
1120         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1121         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1122
1123         return cs;
1124 }
1125
1126 static u32 *
1127 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1128 {
1129         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1130
1131         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1132                 MI_SRM_LRM_GLOBAL_GTT |
1133                 MI_LRI_LRM_CS_MMIO;
1134         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1135         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1136                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1137         *cs++ = 0;
1138
1139         return cs;
1140 }
1141
1142 static u32 *
1143 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1144 {
1145         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1146
1147         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1148                 MI_SRM_LRM_GLOBAL_GTT |
1149                 MI_LRI_LRM_CS_MMIO;
1150         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1151         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1152                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1153         *cs++ = 0;
1154
1155         *cs++ = MI_LOAD_REGISTER_REG |
1156                 MI_LRR_SOURCE_CS_MMIO |
1157                 MI_LRI_LRM_CS_MMIO;
1158         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1159         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1160
1161         return cs;
1162 }
1163
1164 static u32 *
1165 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1166 {
1167         cs = gen12_emit_timestamp_wa(ce, cs);
1168         cs = gen12_emit_cmd_buf_wa(ce, cs);
1169         cs = gen12_emit_restore_scratch(ce, cs);
1170
1171         /* Wa_16013000631:dg2 */
1172         if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1173             IS_DG2_G11(ce->engine->i915))
1174                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1175
1176         return cs;
1177 }
1178
1179 static u32 *
1180 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1181 {
1182         cs = gen12_emit_timestamp_wa(ce, cs);
1183         cs = gen12_emit_restore_scratch(ce, cs);
1184
1185         return cs;
1186 }
1187
1188 static u32 context_wa_bb_offset(const struct intel_context *ce)
1189 {
1190         return PAGE_SIZE * ce->wa_bb_page;
1191 }
1192
1193 static u32 *context_indirect_bb(const struct intel_context *ce)
1194 {
1195         void *ptr;
1196
1197         GEM_BUG_ON(!ce->wa_bb_page);
1198
1199         ptr = ce->lrc_reg_state;
1200         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1201         ptr += context_wa_bb_offset(ce);
1202
1203         return ptr;
1204 }
1205
1206 static void
1207 setup_indirect_ctx_bb(const struct intel_context *ce,
1208                       const struct intel_engine_cs *engine,
1209                       u32 *(*emit)(const struct intel_context *, u32 *))
1210 {
1211         u32 * const start = context_indirect_bb(ce);
1212         u32 *cs;
1213
1214         cs = emit(ce, start);
1215         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1216         while ((unsigned long)cs % CACHELINE_BYTES)
1217                 *cs++ = MI_NOOP;
1218
1219         lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1220                                i915_ggtt_offset(ce->state) +
1221                                context_wa_bb_offset(ce),
1222                                (cs - start) * sizeof(*cs));
1223 }
1224
1225 /*
1226  * The context descriptor encodes various attributes of a context,
1227  * including its GTT address and some flags. Because it's fairly
1228  * expensive to calculate, we'll just do it once and cache the result,
1229  * which remains valid until the context is unpinned.
1230  *
1231  * This is what a descriptor looks like, from LSB to MSB::
1232  *
1233  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1234  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1235  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1236  *      bits 53-54:    mbz, reserved for use by hardware
1237  *      bits 55-63:    group ID, currently unused and set to 0
1238  *
1239  * Starting from Gen11, the upper dword of the descriptor has a new format:
1240  *
1241  *      bits 32-36:    reserved
1242  *      bits 37-47:    SW context ID
1243  *      bits 48:53:    engine instance
1244  *      bit 54:        mbz, reserved for use by hardware
1245  *      bits 55-60:    SW counter
1246  *      bits 61-63:    engine class
1247  *
1248  * On Xe_HP, the upper dword of the descriptor has a new format:
1249  *
1250  *      bits 32-37:    virtual function number
1251  *      bit 38:        mbz, reserved for use by hardware
1252  *      bits 39-54:    SW context ID
1253  *      bits 55-57:    reserved
1254  *      bits 58-63:    SW counter
1255  *
1256  * engine info, SW context ID and SW counter need to form a unique number
1257  * (Context ID) per lrc.
1258  */
1259 static u32 lrc_descriptor(const struct intel_context *ce)
1260 {
1261         u32 desc;
1262
1263         desc = INTEL_LEGACY_32B_CONTEXT;
1264         if (i915_vm_is_4lvl(ce->vm))
1265                 desc = INTEL_LEGACY_64B_CONTEXT;
1266         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1267
1268         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1269         if (GRAPHICS_VER(ce->vm->i915) == 8)
1270                 desc |= GEN8_CTX_L3LLC_COHERENT;
1271
1272         return i915_ggtt_offset(ce->state) | desc;
1273 }
1274
1275 u32 lrc_update_regs(const struct intel_context *ce,
1276                     const struct intel_engine_cs *engine,
1277                     u32 head)
1278 {
1279         struct intel_ring *ring = ce->ring;
1280         u32 *regs = ce->lrc_reg_state;
1281
1282         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1283         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1284
1285         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1286         regs[CTX_RING_HEAD] = head;
1287         regs[CTX_RING_TAIL] = ring->tail;
1288         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1289
1290         /* RPCS */
1291         if (engine->class == RENDER_CLASS) {
1292                 regs[CTX_R_PWR_CLK_STATE] =
1293                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1294
1295                 i915_oa_init_reg_state(ce, engine);
1296         }
1297
1298         if (ce->wa_bb_page) {
1299                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1300
1301                 fn = gen12_emit_indirect_ctx_xcs;
1302                 if (ce->engine->class == RENDER_CLASS)
1303                         fn = gen12_emit_indirect_ctx_rcs;
1304
1305                 /* Mutually exclusive wrt to global indirect bb */
1306                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1307                 setup_indirect_ctx_bb(ce, engine, fn);
1308         }
1309
1310         return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1311 }
1312
1313 void lrc_update_offsets(struct intel_context *ce,
1314                         struct intel_engine_cs *engine)
1315 {
1316         set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1317 }
1318
1319 void lrc_check_regs(const struct intel_context *ce,
1320                     const struct intel_engine_cs *engine,
1321                     const char *when)
1322 {
1323         const struct intel_ring *ring = ce->ring;
1324         u32 *regs = ce->lrc_reg_state;
1325         bool valid = true;
1326         int x;
1327
1328         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1329                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1330                        engine->name,
1331                        regs[CTX_RING_START],
1332                        i915_ggtt_offset(ring->vma));
1333                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1334                 valid = false;
1335         }
1336
1337         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1338             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1339                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1340                        engine->name,
1341                        regs[CTX_RING_CTL],
1342                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1343                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1344                 valid = false;
1345         }
1346
1347         x = lrc_ring_mi_mode(engine);
1348         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1349                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1350                        engine->name, regs[x + 1]);
1351                 regs[x + 1] &= ~STOP_RING;
1352                 regs[x + 1] |= STOP_RING << 16;
1353                 valid = false;
1354         }
1355
1356         WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1357 }
1358
1359 /*
1360  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1361  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1362  * but there is a slight complication as this is applied in WA batch where the
1363  * values are only initialized once so we cannot take register value at the
1364  * beginning and reuse it further; hence we save its value to memory, upload a
1365  * constant value with bit21 set and then we restore it back with the saved value.
1366  * To simplify the WA, a constant value is formed by using the default value
1367  * of this register. This shouldn't be a problem because we are only modifying
1368  * it for a short period and this batch in non-premptible. We can ofcourse
1369  * use additional instructions that read the actual value of the register
1370  * at that time and set our bit of interest but it makes the WA complicated.
1371  *
1372  * This WA is also required for Gen9 so extracting as a function avoids
1373  * code duplication.
1374  */
1375 static u32 *
1376 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1377 {
1378         /* NB no one else is allowed to scribble over scratch + 256! */
1379         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1380         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1381         *batch++ = intel_gt_scratch_offset(engine->gt,
1382                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1383         *batch++ = 0;
1384
1385         *batch++ = MI_LOAD_REGISTER_IMM(1);
1386         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1387         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1388
1389         batch = gen8_emit_pipe_control(batch,
1390                                        PIPE_CONTROL_CS_STALL |
1391                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1392                                        0);
1393
1394         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1395         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1396         *batch++ = intel_gt_scratch_offset(engine->gt,
1397                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1398         *batch++ = 0;
1399
1400         return batch;
1401 }
1402
1403 /*
1404  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1405  * initialized at the beginning and shared across all contexts but this field
1406  * helps us to have multiple batches at different offsets and select them based
1407  * on a criteria. At the moment this batch always start at the beginning of the page
1408  * and at this point we don't have multiple wa_ctx batch buffers.
1409  *
1410  * The number of WA applied are not known at the beginning; we use this field
1411  * to return the no of DWORDS written.
1412  *
1413  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1414  * so it adds NOOPs as padding to make it cacheline aligned.
1415  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1416  * makes a complete batch buffer.
1417  */
1418 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1419 {
1420         /* WaDisableCtxRestoreArbitration:bdw,chv */
1421         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1422
1423         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1424         if (IS_BROADWELL(engine->i915))
1425                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1426
1427         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1428         /* Actual scratch location is at 128 bytes offset */
1429         batch = gen8_emit_pipe_control(batch,
1430                                        PIPE_CONTROL_FLUSH_L3 |
1431                                        PIPE_CONTROL_STORE_DATA_INDEX |
1432                                        PIPE_CONTROL_CS_STALL |
1433                                        PIPE_CONTROL_QW_WRITE,
1434                                        LRC_PPHWSP_SCRATCH_ADDR);
1435
1436         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1437
1438         /* Pad to end of cacheline */
1439         while ((unsigned long)batch % CACHELINE_BYTES)
1440                 *batch++ = MI_NOOP;
1441
1442         /*
1443          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1444          * execution depends on the length specified in terms of cache lines
1445          * in the register CTX_RCS_INDIRECT_CTX
1446          */
1447
1448         return batch;
1449 }
1450
1451 struct lri {
1452         i915_reg_t reg;
1453         u32 value;
1454 };
1455
1456 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1457 {
1458         GEM_BUG_ON(!count || count > 63);
1459
1460         *batch++ = MI_LOAD_REGISTER_IMM(count);
1461         do {
1462                 *batch++ = i915_mmio_reg_offset(lri->reg);
1463                 *batch++ = lri->value;
1464         } while (lri++, --count);
1465         *batch++ = MI_NOOP;
1466
1467         return batch;
1468 }
1469
1470 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1471 {
1472         static const struct lri lri[] = {
1473                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1474                 {
1475                         COMMON_SLICE_CHICKEN2,
1476                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1477                                        0),
1478                 },
1479
1480                 /* BSpec: 11391 */
1481                 {
1482                         FF_SLICE_CHICKEN,
1483                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1484                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1485                 },
1486
1487                 /* BSpec: 11299 */
1488                 {
1489                         _3D_CHICKEN3,
1490                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1491                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1492                 }
1493         };
1494
1495         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1496
1497         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1498         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1499
1500         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1501         batch = gen8_emit_pipe_control(batch,
1502                                        PIPE_CONTROL_FLUSH_L3 |
1503                                        PIPE_CONTROL_STORE_DATA_INDEX |
1504                                        PIPE_CONTROL_CS_STALL |
1505                                        PIPE_CONTROL_QW_WRITE,
1506                                        LRC_PPHWSP_SCRATCH_ADDR);
1507
1508         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1509
1510         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1511         if (HAS_POOLED_EU(engine->i915)) {
1512                 /*
1513                  * EU pool configuration is setup along with golden context
1514                  * during context initialization. This value depends on
1515                  * device type (2x6 or 3x6) and needs to be updated based
1516                  * on which subslice is disabled especially for 2x6
1517                  * devices, however it is safe to load default
1518                  * configuration of 3x6 device instead of masking off
1519                  * corresponding bits because HW ignores bits of a disabled
1520                  * subslice and drops down to appropriate config. Please
1521                  * see render_state_setup() in i915_gem_render_state.c for
1522                  * possible configurations, to avoid duplication they are
1523                  * not shown here again.
1524                  */
1525                 *batch++ = GEN9_MEDIA_POOL_STATE;
1526                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1527                 *batch++ = 0x00777000;
1528                 *batch++ = 0;
1529                 *batch++ = 0;
1530                 *batch++ = 0;
1531         }
1532
1533         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1534
1535         /* Pad to end of cacheline */
1536         while ((unsigned long)batch % CACHELINE_BYTES)
1537                 *batch++ = MI_NOOP;
1538
1539         return batch;
1540 }
1541
1542 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1543
1544 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1545 {
1546         struct drm_i915_gem_object *obj;
1547         struct i915_vma *vma;
1548         int err;
1549
1550         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1551         if (IS_ERR(obj))
1552                 return PTR_ERR(obj);
1553
1554         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1555         if (IS_ERR(vma)) {
1556                 err = PTR_ERR(vma);
1557                 goto err;
1558         }
1559
1560         engine->wa_ctx.vma = vma;
1561         return 0;
1562
1563 err:
1564         i915_gem_object_put(obj);
1565         return err;
1566 }
1567
1568 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1569 {
1570         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1571 }
1572
1573 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1574
1575 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1576 {
1577         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1578         struct i915_wa_ctx_bb *wa_bb[] = {
1579                 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1580         };
1581         wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1582         struct i915_gem_ww_ctx ww;
1583         void *batch, *batch_ptr;
1584         unsigned int i;
1585         int err;
1586
1587         if (engine->class != RENDER_CLASS)
1588                 return;
1589
1590         switch (GRAPHICS_VER(engine->i915)) {
1591         case 12:
1592         case 11:
1593                 return;
1594         case 9:
1595                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1596                 wa_bb_fn[1] = NULL;
1597                 break;
1598         case 8:
1599                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1600                 wa_bb_fn[1] = NULL;
1601                 break;
1602         default:
1603                 MISSING_CASE(GRAPHICS_VER(engine->i915));
1604                 return;
1605         }
1606
1607         err = lrc_create_wa_ctx(engine);
1608         if (err) {
1609                 /*
1610                  * We continue even if we fail to initialize WA batch
1611                  * because we only expect rare glitches but nothing
1612                  * critical to prevent us from using GPU
1613                  */
1614                 drm_err(&engine->i915->drm,
1615                         "Ignoring context switch w/a allocation error:%d\n",
1616                         err);
1617                 return;
1618         }
1619
1620         if (!engine->wa_ctx.vma)
1621                 return;
1622
1623         i915_gem_ww_ctx_init(&ww, true);
1624 retry:
1625         err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1626         if (!err)
1627                 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1628         if (err)
1629                 goto err;
1630
1631         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1632         if (IS_ERR(batch)) {
1633                 err = PTR_ERR(batch);
1634                 goto err_unpin;
1635         }
1636
1637         /*
1638          * Emit the two workaround batch buffers, recording the offset from the
1639          * start of the workaround batch buffer object for each and their
1640          * respective sizes.
1641          */
1642         batch_ptr = batch;
1643         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1644                 wa_bb[i]->offset = batch_ptr - batch;
1645                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1646                                                   CACHELINE_BYTES))) {
1647                         err = -EINVAL;
1648                         break;
1649                 }
1650                 if (wa_bb_fn[i])
1651                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1652                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1653         }
1654         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1655
1656         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1657         __i915_gem_object_release_map(wa_ctx->vma->obj);
1658
1659         /* Verify that we can handle failure to setup the wa_ctx */
1660         if (!err)
1661                 err = i915_inject_probe_error(engine->i915, -ENODEV);
1662
1663 err_unpin:
1664         if (err)
1665                 i915_vma_unpin(wa_ctx->vma);
1666 err:
1667         if (err == -EDEADLK) {
1668                 err = i915_gem_ww_ctx_backoff(&ww);
1669                 if (!err)
1670                         goto retry;
1671         }
1672         i915_gem_ww_ctx_fini(&ww);
1673
1674         if (err) {
1675                 i915_vma_put(engine->wa_ctx.vma);
1676
1677                 /* Clear all flags to prevent further use */
1678                 memset(wa_ctx, 0, sizeof(*wa_ctx));
1679         }
1680 }
1681
1682 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1683 {
1684 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1685         ce->runtime.num_underflow++;
1686         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1687 #endif
1688 }
1689
1690 void lrc_update_runtime(struct intel_context *ce)
1691 {
1692         u32 old;
1693         s32 dt;
1694
1695         if (intel_context_is_barrier(ce))
1696                 return;
1697
1698         old = ce->runtime.last;
1699         ce->runtime.last = lrc_get_runtime(ce);
1700         dt = ce->runtime.last - old;
1701
1702         if (unlikely(dt < 0)) {
1703                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1704                          old, ce->runtime.last, dt);
1705                 st_update_runtime_underflow(ce, dt);
1706                 return;
1707         }
1708
1709         ewma_runtime_add(&ce->runtime.avg, dt);
1710         ce->runtime.total += dt;
1711 }
1712
1713 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1714 #include "selftest_lrc.c"
1715 #endif