// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>
#include <linux/panic.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_ring_ops.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"

/*
 * Fields of the LRC (context) descriptor. Consumers of these bits live
 * outside this file/chunk.
 */
#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

/* Fixed-size sub-regions of the LRC BO, see the layout diagram below */
#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

#define LRC_PRIORITY				GENMASK_ULL(10, 9)
#define LRC_PRIORITY_LOW			0
#define LRC_PRIORITY_NORMAL			1
#define LRC_PRIORITY_HIGH			2

/*
 * Layout of the LRC and associated data allocated as
 * lrc->bo:
 *
 * Region                       Size
 * +============================+=================================+ <- __xe_lrc_ring_offset()
 * | Ring                       | ring_size, see                  |
 * |                            | xe_lrc_init()                   |
 * +============================+=================================+ <- __xe_lrc_pphwsp_offset()
 * | PPHWSP (includes SW state) | 4K                              |
 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
 * | Engine Context Image       | n * 4K, see                     |
 * |                            | xe_gt_lrc_size()                |
 * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
 * | Indirect Ring State Page   | 0 or 4k, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
 * | Indirect Context Page      | 0 or 4k, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_CTX        |
 * +============================+=================================+ <- __xe_lrc_wa_bb_offset()
 * | WA BB Per Ctx              | 4k                              |
 * +============================+=================================+ <- xe_bo_size(lrc->bo)
 */

/* Resolve the owning xe_device through the fence context's GT */
static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

/*
 * An INDIRECT_CTX (mid-context-restore) batch is needed when:
 *  - WA 16010904313 applies and the engine is RCS or CCS, or
 *  - the user provided a mid-ctx-restore batch via configfs, or
 *  - the ring ops for this class emit an aux table invalidation.
 */
static bool
gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);

	if (XE_GT_WA(gt, 16010904313) &&
	    (class == XE_ENGINE_CLASS_RENDER ||
	     class == XE_ENGINE_CLASS_COMPUTE))
		return true;

	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
					       class, NULL))
		return true;

	if (gt->ring_ops[class]->emit_aux_table_inv)
		return true;

	return false;
}

/**
 * xe_gt_lrc_hang_replay_size() - Hang replay size
 * @gt: The GT
 * @class: Hardware engine class
 *
 * Determine size of GPU hang replay state for a GT and hardware engine class.
 *
 * Return: Size of GPU hang replay size
 */
size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size = 0;

	/*
	 * Engine context image size is platform- and class-dependent;
	 * per-platform page counts below are hardware defined.
	 */
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VERx100(xe) >= 3510)
			size += 7 * SZ_4K;
		else if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		if (GRAPHICS_VERx100(xe) >= 3510)
			size += 5 * SZ_4K;
		else if (GRAPHICS_VER(xe) >= 20)
			size += 2 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size += 1 * SZ_4K;
	}

	return size;
}

/* Total LRC size: PPHWSP + context image + optional indirect ring state page */
size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	size_t size = xe_gt_lrc_hang_replay_size(gt, class);

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size + LRC_PPHWSP_SIZE;
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers.
 * They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/* Variable-length offset: high bit set means more bytes follow */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* Only the register address dword is written; value dword is skipped */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

/*
 * Per-platform u8-encoded register offset tables consumed by set_offsets().
 * See the encoding description above; do not reorder entries.
 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/* Common prefix shared by all Xe2 offset tables below */
#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(6),			/* [0x41] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

/* Select the offset table matching this platform and engine class */
static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

/* Program CTX_CONTEXT_CONTROL in the register state image */
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
							    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}

/*
 * Fill the memory-based-interrupt LRM/LRI commands into the register state.
 * No-op unless the device uses memirq (e.g. VF / MSI-X configurations).
 */
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}

/* Dword index of the MI_MODE register pair within the register state image */
static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

/*
 * Clear STOP_RING in the MI_MODE value dword (regs[x] is the register
 * offset, regs[x + 1] the value); STOP_RING << 16 is the masked-write enable.
 */
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

/* The ring is the first region in the LRC BO, see the layout diagram */
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

/* PPHWSP immediately follows the ring */
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

/* Driver-defined offsets inside the PPHWSP page */
#define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

/* Offsets inside the seqno BO */
#define LRC_SEQNO_OFFSET 0
#define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)

/* Register state image follows the PPHWSP */
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

/**
 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
 * @xe: the &xe_device struct instance
 *
 * Returns: Size of the LRC registers area for current platform
 */
size_t xe_lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_SEQNO_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_START_SEQNO_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}

/*
 * Tail regions are located back from the end of the BO: WA BB is last,
 * preceded by the optional indirect ctx page, preceded by indirect ring state.
 */
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
		     LRC_INDIRECT_RING_STATE_SIZE;

	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
		offset -= LRC_INDIRECT_CTX_BO_SIZE;

	return offset;
}

static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}

static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}

/*
 * For each LRC sub-region, generate an iosys_map accessor and a GGTT
 * address accessor. @bo_expr selects which BO backs the region.
 */
#define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
	struct iosys_map map = bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
\
	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)

#undef DECL_MAP_ADDR_HELPERS

/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;
	u32 ldw, udw = 0;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	ldw = xe_map_read32(xe, &map);

	if (xe->info.has_64bit_timestamp) {
		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
		udw = xe_map_read32(xe, &map);
	}

	return (u64)udw << 32 | ldw;
}

/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

/* GGTT address of the LRC as seen by hardware starts at the PPHWSP */
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

/* Read a dword from the indirect ring state page by register index */
static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

/* Write a dword to the indirect ring state page by register index */
static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

/* Read a dword from the register state image by register index */
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

/* Write a dword to the register state image by register index */
void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

/*
 * Build a zero-filled default LRC image in kernel memory: register state
 * is decoded at PPHWSP_SIZE, plus the indirect ring state at the tail when
 * the GT supports it. Caller owns (kfrees) the returned buffer.
 */
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

/**
 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
 * of given engine.
 * @hwe: the &xe_hw_engine struct instance
 */
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	u32 *regs;

	if (!gt->default_lrc[hwe->class])
		return;

	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
	set_memory_based_intr(regs, hwe);
}

/**
 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
 * for given LRC.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @regs: scratch buffer to be used as temporary storage
 */
void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
					    u32 *regs)
{
	struct xe_gt *gt = hwe->gt;
	struct iosys_map map;
	size_t regs_len;

	if (!xe_device_uses_memirq(gt_to_xe(gt)))
		return;

	/* Copy regs out, patch the memirq pointers, copy back */
	map = __xe_lrc_regs_map(lrc);
	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
	set_memory_based_intr(regs, hwe);
	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
}

/* Program the PPGTT top-level page directory pointer into the LRC regs */
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

/* Tear down fence context and release both backing BOs */
static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
}

/*
 * wa_bb_setup_utilization() - Write commands to wa bb to assist
 * in calculating active context run ticks.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active.
If the context switched out, it would overwrite this 1063 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as 1064 * the last part of context restore, so reusing this LRC location will not 1065 * clobber anything. 1066 * 1067 * (2) Calculate the time that the context has been active for: 1068 * The CTX_TIMESTAMP ticks only when the context is active. If a context is 1069 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. 1070 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific 1071 * engine instance. Since we do not know which instance the context is running 1072 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and 1073 * store it in the PPHSWP. 1074 */ 1075 #define CONTEXT_ACTIVE 1ULL 1076 static ssize_t setup_utilization_wa(struct xe_lrc *lrc, 1077 struct xe_hw_engine *hwe, 1078 u32 *batch, 1079 size_t max_len) 1080 { 1081 u32 *cmd = batch; 1082 1083 if (IS_SRIOV_VF(gt_to_xe(lrc->gt))) 1084 return 0; 1085 1086 if (xe_gt_WARN_ON(lrc->gt, max_len < 12)) 1087 return -ENOSPC; 1088 1089 *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; 1090 *cmd++ = ENGINE_ID(0).addr; 1091 *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc); 1092 *cmd++ = 0; 1093 1094 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1095 *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1096 *cmd++ = 0; 1097 *cmd++ = lower_32_bits(CONTEXT_ACTIVE); 1098 1099 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) { 1100 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1101 *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 1102 *cmd++ = 0; 1103 *cmd++ = upper_32_bits(CONTEXT_ACTIVE); 1104 } 1105 1106 return cmd - batch; 1107 } 1108 1109 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1110 u32 *batch, size_t max_len) 1111 { 1112 const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1113 u32 *cmd = batch; 1114 1115 if (!XE_GT_WA(lrc->gt, 
16010904313) || 1116 !(hwe->class == XE_ENGINE_CLASS_RENDER || 1117 hwe->class == XE_ENGINE_CLASS_COMPUTE || 1118 hwe->class == XE_ENGINE_CLASS_COPY || 1119 hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE || 1120 hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE)) 1121 return 0; 1122 1123 if (xe_gt_WARN_ON(lrc->gt, max_len < 12)) 1124 return -ENOSPC; 1125 1126 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO | 1127 MI_LRM_ASYNC; 1128 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1129 *cmd++ = ts_addr; 1130 *cmd++ = 0; 1131 1132 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO | 1133 MI_LRM_ASYNC; 1134 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1135 *cmd++ = ts_addr; 1136 *cmd++ = 0; 1137 1138 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO; 1139 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1140 *cmd++ = ts_addr; 1141 *cmd++ = 0; 1142 1143 return cmd - batch; 1144 } 1145 1146 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc, 1147 struct xe_hw_engine *hwe, 1148 u32 *batch, size_t max_len) 1149 { 1150 struct xe_device *xe = gt_to_xe(lrc->gt); 1151 const u32 *user_batch; 1152 u32 *cmd = batch; 1153 u32 count; 1154 1155 count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev), 1156 hwe->class, &user_batch); 1157 if (!count) 1158 return 0; 1159 1160 if (count > max_len) 1161 return -ENOSPC; 1162 1163 /* 1164 * This should be used only for tests and validation. 
Taint the kernel 1165 * as anything could be submitted directly in context switches 1166 */ 1167 add_taint(TAINT_TEST, LOCKDEP_STILL_OK); 1168 1169 memcpy(cmd, user_batch, count * sizeof(u32)); 1170 cmd += count; 1171 1172 return cmd - batch; 1173 } 1174 1175 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc, 1176 struct xe_hw_engine *hwe, 1177 u32 *batch, size_t max_len) 1178 { 1179 struct xe_device *xe = gt_to_xe(lrc->gt); 1180 const u32 *user_batch; 1181 u32 *cmd = batch; 1182 u32 count; 1183 1184 count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev), 1185 hwe->class, &user_batch); 1186 if (!count) 1187 return 0; 1188 1189 if (count > max_len) 1190 return -ENOSPC; 1191 1192 /* 1193 * This should be used only for tests and validation. Taint the kernel 1194 * as anything could be submitted directly in context switches 1195 */ 1196 add_taint(TAINT_TEST, LOCKDEP_STILL_OK); 1197 1198 memcpy(cmd, user_batch, count * sizeof(u32)); 1199 cmd += count; 1200 1201 return cmd - batch; 1202 } 1203 1204 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc, 1205 struct xe_hw_engine *hwe, 1206 u32 *batch, size_t max_len) 1207 { 1208 u32 *cmd = batch; 1209 1210 if (!XE_GT_WA(lrc->gt, 18022495364) || 1211 hwe->class != XE_ENGINE_CLASS_RENDER) 1212 return 0; 1213 1214 if (xe_gt_WARN_ON(lrc->gt, max_len < 3)) 1215 return -ENOSPC; 1216 1217 *cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1); 1218 *cmd++ = CS_DEBUG_MODE2(0).addr; 1219 *cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE); 1220 1221 return cmd - batch; 1222 } 1223 1224 static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc, 1225 struct xe_hw_engine *hwe, 1226 u32 *batch, size_t max_len) 1227 { 1228 struct xe_gt *gt = lrc->gt; 1229 u32 *(*emit)(struct xe_gt *gt, u32 *cmd) = 1230 gt->ring_ops[hwe->class]->emit_aux_table_inv; 1231 1232 if (!emit) 1233 return 0; 1234 1235 if (xe_gt_WARN_ON(gt, max_len < 8)) 1236 return -ENOSPC; 1237 1238 return emit(gt, 
		    batch) - batch;
}

/* One wa bb build step: returns dwords written or a negative errno. */
struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};

/* Accumulated state while running a chain of bo_setup callbacks. */
struct bo_setup_state {
	/* Input: */
	struct xe_lrc *lrc;
	struct xe_hw_engine *hwe;
	size_t max_size;
	size_t reserve_dw;
	unsigned int offset;
	const struct bo_setup *funcs;
	unsigned int num_funcs;

	/* State: */
	u32 *buffer;		/* scratch copy when the BO vmap is iomem */
	u32 *ptr;		/* current write position */
	unsigned int written;	/* dwords emitted so far */
};

/*
 * Run every setup callback in order, writing either directly into the BO
 * vmap or into the caller-provided scratch buffer when the mapping is
 * iomem (finish_bo() then copies the scratch out).
 *
 * Return: 0 on success, -ENOSPC if a callback failed or space ran out.
 */
static int setup_bo(struct bo_setup_state *state)
{
	ssize_t remain;

	if (state->lrc->bo->vmap.is_iomem) {
		xe_gt_assert(state->hwe->gt, state->buffer);
		state->ptr = state->buffer;
	} else {
		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
	}

	remain = state->max_size / sizeof(u32);

	for (size_t i = 0; i < state->num_funcs; i++) {
		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
						    state->ptr, remain);

		remain -= len;

		/*
		 * Caller has asked for at least reserve_dw to remain unused.
		 */
		if (len < 0 ||
		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
			goto fail;

		state->ptr += len;
		state->written += len;
	}

	return 0;

fail:
	return -ENOSPC;
}

/* Flush the scratch buffer into the (iomem) BO; no-op for CPU mappings. */
static void finish_bo(struct bo_setup_state *state)
{
	if (!state->lrc->bo->vmap.is_iomem)
		return;

	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
			 state->offset, state->buffer,
			 state->written * sizeof(u32));
}

/**
 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @scratch: preallocated scratch buffer for temporary storage
 * Return: 0 on success, negative error code on failure
 */
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
	static const struct bo_setup funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_state_cache_wa },
		{ .setup = setup_utilization_wa },
		{ .setup = setup_configfs_post_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = LRC_WA_BB_SIZE,
		.buffer = scratch,
		.reserve_dw = 1,	/* keep one dword for MI_BATCH_BUFFER_END */
		.offset = __xe_lrc_wa_bb_offset(lrc),
		.funcs = funcs,
		.num_funcs = ARRAY_SIZE(funcs),
	};
	int ret;

	ret = setup_bo(&state);
	if (ret)
		return ret;

	*state.ptr++ = MI_BATCH_BUFFER_END;
	state.written++;

	finish_bo(&state);

	/* Low bit set in the pointer marks the per-ctx bb as valid. */
	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);

	return 0;
}

/*
 * Build the wa bb, allocating a temporary scratch buffer only when the LRC
 * BO is mapped as iomem.
 */
static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	u32 *buf = NULL;
	int ret;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);

	kfree(buf);

	return ret;
}

/*
 * Build the INDIRECT_CTX page (mid-context-restore batch) and enable it in
 * the context image; no-op unless XE_LRC_FLAG_INDIRECT_CTX is set.
 */
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}

/* Map a multi-queue priority onto the LRC_PRIORITY descriptor field. */
static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);

	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));

	/* xe_multi_queue_priority is directly mapped to LRC priority values */
	return priority;
}

/**
 * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
 * @lrc: Logical Ring Context
 * @priority: Multi queue priority of the exec queue
 *
 * Convert @priority to LRC multi queue priority and update the @lrc descriptor
 */
void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	lrc->desc &= ~LRC_PRIORITY;
	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
}

/*
 * Populate a freshly-allocated LRC: context image contents, ring registers,
 * descriptor bits, seqno slots, wa bb and indirect-ctx page.
 */
static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
			   void *replay_state, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 state_cache_perf_fix[3];
	int err;

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values. If there's already a primed default_lrc, just copy it, otherwise
	 * it's the early submission to record the lrc: build a new empty one from
	 * scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (gt->default_lrc[hwe->class] || replay_state) {
		/*
		 * NOTE(review): this branch assumes gt->default_lrc[hwe->class]
		 * is non-NULL whenever replay_state is set — confirm with
		 * callers; otherwise the memcpy below reads through NULL.
		 */
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc->size - LRC_PPHWSP_SIZE);
		/* Hang replay state overlays the start of the context image. */
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data) {
			return -ENOMEM;
		}

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
		kfree(init_data);
	}

	if (vm)
		xe_lrc_set_ppgtt(lrc, vm);

	/* MSI-X platforms need the memirq status/source pointers in the image. */
	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);

		/* Match head and tail pointers */
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));

		/* Match head and tail pointers */
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	/* Pre-Xe_HP descriptors carry the engine class/instance directly. */
	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
	}

	/* Seed both seqno slots to "previous" so nothing appears signaled. */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		return err;

	err = setup_indirect_ctx(lrc, hwe);

	return err;
}

/*
 * Allocate and initialize the backing BOs and bookkeeping for an LRC, then
 * delegate context-image setup to xe_lrc_ctx_init().
 */
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_bo *bo;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if
(xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	/* Main BO: ring + context image + optional pages, see layout comment. */
	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
				       ttm_bo_type_kernel,
				       bo_flags, false);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	lrc->bo = bo;

	/* Separate system-memory page for the seqno slots. */
	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE |
				       XE_BO_FLAG_SYSTEM, false);
	if (IS_ERR(bo)) {
		err = PTR_ERR(bo);
		goto err_lrc_finish;
	}
	lrc->seqno_bo = bo;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
	if (err)
		goto err_lrc_finish;

	if (vm && vm->xef)
		xe_drm_client_add_bo(vm->xef->client, lrc->bo);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @replay_state: GPU hang replay state
 * @ring_size: LRC ring size
 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
 * @flags: LRC initialization flags
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return pointer to created LRC upon success and an error pointer
 * upon failure.
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc_obj(*lrc);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}

/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0, release resources held by the Logical Ring Context
 * (LRC) and free the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

/**
 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
 * @lrc: the &xe_lrc struct instance
 */
void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	}
}

/* Write the ring tail register, wherever this LRC keeps its ring state. */
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

/* Read the ring tail register, masked to the tail address field. */
u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

static u32
xe_lrc_ring_start(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}

/* Write the ring head register, wherever this LRC keeps its ring state. */
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

/* Read the ring head register, masked to the head address field. */
u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

/*
 * Free space in the ring, in bytes; size is a power of two, so the mask
 * arithmetic wraps correctly for any head/tail relationship.
 */
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}

/* Copy @size bytes at the current tail and advance it (with wrap). */
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

/*
 * Write @data into the ring, splitting the copy when it would run past the
 * end of the ring, and padding with one MI_NOOP to keep the tail 8-byte
 * aligned.
 */
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

/* Full HW context descriptor: flag bits ORed with the GGTT address. */
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

/* Read the current (completed) seqno from the LRC's seqno slot. */
s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

/* Read the start seqno (seqno of the job that last began executing). */
s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: engine id value
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}

/* Decode the dword length of an instruction from its header. */
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

/*
 * Pretty-print one MI_* command at @dw for the LRC dump.
 *
 * Return: the number of dwords consumed.
 */
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *start,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		/* Collapse runs of consecutive NOOPs into one line. */
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_NOOP (%d dwords)\n",
			   dw - num_noop - start, inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_TOPOLOGY_FILTER\n",
			   dw - start, inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_BATCH_BUFFER_END\n",
			   dw - start, inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   dw - start, inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, "LRC[%#5tx] = - %#6x = %#010x\n",
				   &dw[i] - start, dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   dw - start, inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, "LRC[%#5tx] = - %#6x = %#010llx\n",
				   dw - start,
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, "LRC[%#5tx] = - %*ph (%s)\n",
				   dw - start, (int)sizeof(u32) * (numdw - 1),
				   dw + 1, numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_FORCE_WAKEUP\n",
			   dw - start, inst_header);
		return numdw;

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   dw - start, inst_header, opcode, numdw);
		return numdw;
	}
}

/*
 * Pretty-print one GFXPIPE command at @dw for the LRC dump.
 *
 * Return: the number of dwords consumed.
 */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *start,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \
			   dw - start, *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \
			   dw - start, *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
	MATCH3D(3DSTATE_COARSE_PIXEL);
	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_URB_MEMORY);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   dw - start, *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

/*
 * Pretty-print one GFX_STATE command at @dw for the LRC dump.
 *
 * Return: the number of dwords consumed.
 */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *start,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   dw - start, *dw, opcode, numdw);
		return numdw;
	}
}

/* Walk and pretty-print the default LRC image for an engine class. */
void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw, *start;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	start = dw;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		/* Dispatch on the top-level command type bits. */
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, start, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "LRC[%#5tx] = [%#10x] Unknown instruction of type %#x, likely %d dwords\n",
				   dw - start,
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

/*
 * Lookup the value of a register within the offset/value pairs of an
 * MI_LOAD_REGISTER_IMM instruction.
 *
 * Return -ENOENT if the register is not present in the MI_LRI instruction.
 */
static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
				const u32 *dword_pair, int num_regs)
{
	for (int i = 0; i < num_regs; i++) {
		if (dword_pair[2 * i] == offset) {
			*value = dword_pair[2 * i + 1];
			return 0;
		}
	}

	return -ENOENT;
}

/*
 * Lookup the value of a register in a specific engine type's default LRC.
 *
 * Return -EINVAL if the default LRC doesn't exist, or ENOENT if the register
 * cannot be found in the default LRC.
 */
int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
				    enum xe_engine_class hwe_class,
				    u32 offset,
				    u32 *value)
{
	u32 *dw;
	int remaining_dw, ret;

	if (!gt->default_lrc[hwe_class])
		return -EINVAL;

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		u32 num_dw = instr_dw(*dw);

		if (num_dw > remaining_dw)
			num_dw = remaining_dw;

		switch (*dw & XE_INSTR_CMD_TYPE) {
		case XE_INSTR_MI:
			switch (*dw & MI_OPCODE) {
			case MI_BATCH_BUFFER_END:
				/* End of LRC; register not found */
				return -ENOENT;

			case MI_NOOP:
			case MI_TOPOLOGY_FILTER:
				/*
				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
				 * a length field and are always 1-dword
				 * instructions.
				 */
				remaining_dw--;
				dw++;
				break;

			case MI_LOAD_REGISTER_IMM:
				ret = lookup_reg_in_mi_lri(offset, value,
							   dw + 1, (num_dw - 1) / 2);
				if (ret == 0)
					return 0;

				fallthrough;

			default:
				/*
				 * Jump to next instruction based on length
				 * field.
				 */
				remaining_dw -= num_dw;
				dw += num_dw;
				break;
			}
			break;

		default:
			/* Jump to next instruction based on length field. */
			remaining_dw -= num_dw;
			dw += num_dw;
		}
	}

	/* Walked the whole image without finding the register. */
	return -ENOENT;
}

/* One SVG state instruction: header dword plus its total length in dwords. */
struct instr_state {
	u32 instr;	/* GFXPIPE instruction header */
	u16 num_dw;	/* total length in dwords, including the header */
};

/*
 * SVG (vertex/geometry pipeline) state instructions emitted into render
 * contexts; see the Wa_14019789679 explanation in
 * xe_lrc_emit_hwe_state_instructions() below.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{
 .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

/*
 * Emit non-register (SVG) state instructions into a context image.
 * Returns the advanced command-stream pointer; the interior of the emitted
 * instructions is left as zeros (noops) so only the headers matter here.
 */
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers.
	 * Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited. However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	/* Nothing to emit for this platform/engine combination. */
	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		/* Sanity-check table entries: GFXPIPE type, non-zero length,
		 * and single-dw flag consistent with the declared length.
		 */
		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		/* Multi-dword GFXPIPE commands encode length as (num_dw - 2). */
		*cs = instr;
		if (!is_single_dw)
			*cs |= (num_dw - 2);

		/* Body dwords stay zero (noops); just skip over them. */
		cs += num_dw;
	}

	return cs;
}

/*
 * Capture a lightweight snapshot of the LRC state.  Runs in atomic-ish
 * context (GFP_NOWAIT); the heavyweight copy of the context image is
 * deferred to xe_lrc_snapshot_capture_delayed().  Takes a reference on
 * lrc->bo which is released by the delayed capture or snapshot_free.
 */
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	snapshot->lrc_snapshot = NULL;	/* filled in by the delayed capture */
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

/*
 * Second, sleepable stage of snapshot capture: copy the context image out
 * of the BO into kernel memory and drop the BO reference taken above.
 */
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	/* Transfer BO ownership out of the snapshot; always released below. */
	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		/* vmap failed; free the buffer so print skips the image dump */
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}

/*
 * Print a previously captured LRC snapshot.  The raw PPHWSP and context
 * image data are emitted ascii85-encoded; the image dump is skipped when
 * the delayed capture did not (or could not) copy it.
 */
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);

	drm_puts(p, "\t[HWCTX].data: ");
	/* i continues from the end of the PPHWSP through the context image. */
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

void
xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	/* BO ref is still held if the delayed capture never ran. */
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}

/*
 * Read the RING_CTX_TIMESTAMP register of the engine encoded in @engine_id
 * into @reg_ctx_ts.  Returns 0 on success, -1 if the engine lookup fails.
 */
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe;
	u64 val;

	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    class, instance))
		return -1;

	/* Wider platforms expose a 64-bit timestamp via two 32-bit halves. */
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
					  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		val = xe_mmio_read32(&hwe->gt->mmio,
				     RING_CTX_TIMESTAMP(hwe->mmio_base));

	*reg_ctx_ts = val;

	return 0;
}

/**
 * xe_lrc_timestamp() - Current ctx timestamp
 * @lrc: Pointer to the lrc.
 *
 * Return latest ctx timestamp. With support for active contexts, the
 * calculation may be slightly racy, so follow a read-again logic to ensure that
 * the context is still active before returning the right timestamp.
2604 * 2605 * Returns: New ctx timestamp value 2606 */ 2607 u64 xe_lrc_timestamp(struct xe_lrc *lrc) 2608 { 2609 u64 lrc_ts, reg_ts, new_ts = lrc->ctx_timestamp; 2610 u32 engine_id; 2611 2612 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2613 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ 2614 if (IS_SRIOV_VF(lrc_to_xe(lrc))) { 2615 new_ts = lrc_ts; 2616 goto done; 2617 } 2618 2619 if (lrc_ts == CONTEXT_ACTIVE) { 2620 engine_id = xe_lrc_engine_id(lrc); 2621 if (!get_ctx_timestamp(lrc, engine_id, ®_ts)) 2622 new_ts = reg_ts; 2623 2624 /* read lrc again to ensure context is still active */ 2625 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2626 } 2627 2628 /* 2629 * If context switched out, just use the lrc_ts. Note that this needs to 2630 * be a separate if condition. 2631 */ 2632 if (lrc_ts != CONTEXT_ACTIVE) 2633 new_ts = lrc_ts; 2634 2635 done: 2636 return new_ts; 2637 } 2638 2639 /** 2640 * xe_lrc_update_timestamp() - Update ctx timestamp 2641 * @lrc: Pointer to the lrc. 2642 * @old_ts: Old timestamp value 2643 * 2644 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and 2645 * update saved value. 2646 * 2647 * Returns: New ctx timestamp value 2648 */ 2649 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) 2650 { 2651 *old_ts = lrc->ctx_timestamp; 2652 lrc->ctx_timestamp = xe_lrc_timestamp(lrc); 2653 2654 trace_xe_lrc_update_timestamp(lrc, *old_ts); 2655 2656 return lrc->ctx_timestamp; 2657 } 2658 2659 /** 2660 * xe_lrc_ring_is_idle() - LRC is idle 2661 * @lrc: Pointer to the lrc. 2662 * 2663 * Compare LRC ring head and tail to determine if idle. 2664 * 2665 * Return: True is ring is idle, False otherwise 2666 */ 2667 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc) 2668 { 2669 return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc); 2670 } 2671