// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"

#define LRC_VALID BIT_ULL(0)
#define LRC_PRIVILEGE BIT_ULL(8)
#define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT 3

#define LRC_ENGINE_CLASS GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)

#define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
42
43 static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
45 {
46 return gt_to_xe(lrc->fence_ctx.gt);
47 }
48
size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
50 {
51 struct xe_device *xe = gt_to_xe(gt);
52 size_t size;
53
54 switch (class) {
55 case XE_ENGINE_CLASS_RENDER:
56 if (GRAPHICS_VER(xe) >= 20)
57 size = 4 * SZ_4K;
58 else
59 size = 14 * SZ_4K;
60 break;
61 case XE_ENGINE_CLASS_COMPUTE:
62 /* 14 pages since graphics_ver == 11 */
63 if (GRAPHICS_VER(xe) >= 20)
64 size = 3 * SZ_4K;
65 else
66 size = 14 * SZ_4K;
67 break;
68 default:
69 WARN(1, "Unknown engine class: %d", class);
70 fallthrough;
71 case XE_ENGINE_CLASS_COPY:
72 case XE_ENGINE_CLASS_VIDEO_DECODE:
73 case XE_ENGINE_CLASS_VIDEO_ENHANCE:
74 case XE_ENGINE_CLASS_OTHER:
75 size = 2 * SZ_4K;
76 }
77
78 /* Add indirect ring state page */
79 if (xe_gt_has_indirect_ring_state(gt))
80 size += LRC_INDIRECT_RING_STATE_SIZE;
81
82 return size;
83 }
84
85 /*
86 * The per-platform tables are u8-encoded in @data. Decode @data and set the
87 * addresses' offset and commands in @regs. The following encoding is used
88 * for each byte. There are 2 steps: decoding commands and decoding addresses.
89 *
90 * Commands:
91 * [7]: create NOPs - number of NOPs are set in lower bits
92 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
93 * MI_LRI_FORCE_POSTED
94 * [5:0]: Number of NOPs or registers to set values to in case of
95 * MI_LOAD_REGISTER_IMM
96 *
97 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
98 * number of registers. They are set by using the REG/REG16 macros: the former
99 * is used for offsets smaller than 0x200 while the latter is for values bigger
100 * than that. Those macros already set all the bits documented below correctly:
101 *
102 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
103 * follow, for the lower bits
104 * [6:0]: Register offset, without considering the engine base.
105 *
106 * This function only tweaks the commands and register offsets. Values are not
107 * filled out.
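 *
 * For example, in the tables below LRI(13, POSTED) encodes an
 * MI_LOAD_REGISTER_IMM header covering 13 registers with MI_LRI_FORCE_POSTED
 * set, REG(0x034) encodes the RING_BUFFER_HEAD offset in a single byte, and
 * REG16(0x2b4) uses the two-byte form for the larger SEMAPHORE_TOKEN offset.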
108 */
static void set_offsets(u32 *regs,
110 const u8 *data,
111 const struct xe_hw_engine *hwe)
112 #define NOP(x) (BIT(7) | (x))
113 #define LRI(count, flags) ((flags) << 6 | (count) | \
114 BUILD_BUG_ON_ZERO(count >= BIT(6)))
115 #define POSTED BIT(0)
116 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
117 #define REG16(x) \
118 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
119 (((x) >> 2) & 0x7f)
120 {
121 const u32 base = hwe->mmio_base;
122
123 while (*data) {
124 u8 count, flags;
125
126 if (*data & BIT(7)) { /* skip */
127 count = *data++ & ~BIT(7);
128 regs += count;
129 continue;
130 }
131
132 count = *data & 0x3f;
133 flags = *data >> 6;
134 data++;
135
136 *regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
137 if (flags & POSTED)
138 *regs |= MI_LRI_FORCE_POSTED;
139 *regs |= MI_LRI_LRM_CS_MMIO;
140 regs++;
141
142 xe_gt_assert(hwe->gt, count);
143 do {
144 u32 offset = 0;
145 u8 v;
146
147 do {
148 v = *data++;
149 offset <<= 7;
150 offset |= v & ~BIT(7);
151 } while (v & BIT(7));
152
153 regs[0] = base + (offset << 2);
154 regs += 2;
155 } while (--count);
156 }
157
158 *regs = MI_BATCH_BUFFER_END | BIT(0);
159 }
160
161 static const u8 gen12_xcs_offsets[] = {
162 NOP(1),
163 LRI(13, POSTED),
164 REG16(0x244),
165 REG(0x034),
166 REG(0x030),
167 REG(0x038),
168 REG(0x03c),
169 REG(0x168),
170 REG(0x140),
171 REG(0x110),
172 REG(0x1c0),
173 REG(0x1c4),
174 REG(0x1c8),
175 REG(0x180),
176 REG16(0x2b4),
177
178 NOP(5),
179 LRI(9, POSTED),
180 REG16(0x3a8),
181 REG16(0x28c),
182 REG16(0x288),
183 REG16(0x284),
184 REG16(0x280),
185 REG16(0x27c),
186 REG16(0x278),
187 REG16(0x274),
188 REG16(0x270),
189
190 0
191 };
192
193 static const u8 dg2_xcs_offsets[] = {
194 NOP(1),
195 LRI(15, POSTED),
196 REG16(0x244),
197 REG(0x034),
198 REG(0x030),
199 REG(0x038),
200 REG(0x03c),
201 REG(0x168),
202 REG(0x140),
203 REG(0x110),
204 REG(0x1c0),
205 REG(0x1c4),
206 REG(0x1c8),
207 REG(0x180),
208 REG16(0x2b4),
209 REG(0x120),
210 REG(0x124),
211
212 NOP(1),
213 LRI(9, POSTED),
214 REG16(0x3a8),
215 REG16(0x28c),
216 REG16(0x288),
217 REG16(0x284),
218 REG16(0x280),
219 REG16(0x27c),
220 REG16(0x278),
221 REG16(0x274),
222 REG16(0x270),
223
224 0
225 };
226
227 static const u8 gen12_rcs_offsets[] = {
228 NOP(1),
229 LRI(13, POSTED),
230 REG16(0x244),
231 REG(0x034),
232 REG(0x030),
233 REG(0x038),
234 REG(0x03c),
235 REG(0x168),
236 REG(0x140),
237 REG(0x110),
238 REG(0x1c0),
239 REG(0x1c4),
240 REG(0x1c8),
241 REG(0x180),
242 REG16(0x2b4),
243
244 NOP(5),
245 LRI(9, POSTED),
246 REG16(0x3a8),
247 REG16(0x28c),
248 REG16(0x288),
249 REG16(0x284),
250 REG16(0x280),
251 REG16(0x27c),
252 REG16(0x278),
253 REG16(0x274),
254 REG16(0x270),
255
256 LRI(3, POSTED),
257 REG(0x1b0),
258 REG16(0x5a8),
259 REG16(0x5ac),
260
261 NOP(6),
262 LRI(1, 0),
263 REG(0x0c8),
264 NOP(3 + 9 + 1),
265
266 LRI(51, POSTED),
267 REG16(0x588),
268 REG16(0x588),
269 REG16(0x588),
270 REG16(0x588),
271 REG16(0x588),
272 REG16(0x588),
273 REG(0x028),
274 REG(0x09c),
275 REG(0x0c0),
276 REG(0x178),
277 REG(0x17c),
278 REG16(0x358),
279 REG(0x170),
280 REG(0x150),
281 REG(0x154),
282 REG(0x158),
283 REG16(0x41c),
284 REG16(0x600),
285 REG16(0x604),
286 REG16(0x608),
287 REG16(0x60c),
288 REG16(0x610),
289 REG16(0x614),
290 REG16(0x618),
291 REG16(0x61c),
292 REG16(0x620),
293 REG16(0x624),
294 REG16(0x628),
295 REG16(0x62c),
296 REG16(0x630),
297 REG16(0x634),
298 REG16(0x638),
299 REG16(0x63c),
300 REG16(0x640),
301 REG16(0x644),
302 REG16(0x648),
303 REG16(0x64c),
304 REG16(0x650),
305 REG16(0x654),
306 REG16(0x658),
307 REG16(0x65c),
308 REG16(0x660),
309 REG16(0x664),
310 REG16(0x668),
311 REG16(0x66c),
312 REG16(0x670),
313 REG16(0x674),
314 REG16(0x678),
315 REG16(0x67c),
316 REG(0x068),
317 REG(0x084),
318 NOP(1),
319
320 0
321 };
322
323 static const u8 xehp_rcs_offsets[] = {
324 NOP(1),
325 LRI(13, POSTED),
326 REG16(0x244),
327 REG(0x034),
328 REG(0x030),
329 REG(0x038),
330 REG(0x03c),
331 REG(0x168),
332 REG(0x140),
333 REG(0x110),
334 REG(0x1c0),
335 REG(0x1c4),
336 REG(0x1c8),
337 REG(0x180),
338 REG16(0x2b4),
339
340 NOP(5),
341 LRI(9, POSTED),
342 REG16(0x3a8),
343 REG16(0x28c),
344 REG16(0x288),
345 REG16(0x284),
346 REG16(0x280),
347 REG16(0x27c),
348 REG16(0x278),
349 REG16(0x274),
350 REG16(0x270),
351
352 LRI(3, POSTED),
353 REG(0x1b0),
354 REG16(0x5a8),
355 REG16(0x5ac),
356
357 NOP(6),
358 LRI(1, 0),
359 REG(0x0c8),
360
361 0
362 };
363
364 static const u8 dg2_rcs_offsets[] = {
365 NOP(1),
366 LRI(15, POSTED),
367 REG16(0x244),
368 REG(0x034),
369 REG(0x030),
370 REG(0x038),
371 REG(0x03c),
372 REG(0x168),
373 REG(0x140),
374 REG(0x110),
375 REG(0x1c0),
376 REG(0x1c4),
377 REG(0x1c8),
378 REG(0x180),
379 REG16(0x2b4),
380 REG(0x120),
381 REG(0x124),
382
383 NOP(1),
384 LRI(9, POSTED),
385 REG16(0x3a8),
386 REG16(0x28c),
387 REG16(0x288),
388 REG16(0x284),
389 REG16(0x280),
390 REG16(0x27c),
391 REG16(0x278),
392 REG16(0x274),
393 REG16(0x270),
394
395 LRI(3, POSTED),
396 REG(0x1b0),
397 REG16(0x5a8),
398 REG16(0x5ac),
399
400 NOP(6),
401 LRI(1, 0),
402 REG(0x0c8),
403
404 0
405 };
406
407 static const u8 mtl_rcs_offsets[] = {
408 NOP(1),
409 LRI(15, POSTED),
410 REG16(0x244),
411 REG(0x034),
412 REG(0x030),
413 REG(0x038),
414 REG(0x03c),
415 REG(0x168),
416 REG(0x140),
417 REG(0x110),
418 REG(0x1c0),
419 REG(0x1c4),
420 REG(0x1c8),
421 REG(0x180),
422 REG16(0x2b4),
423 REG(0x120),
424 REG(0x124),
425
426 NOP(1),
427 LRI(9, POSTED),
428 REG16(0x3a8),
429 REG16(0x28c),
430 REG16(0x288),
431 REG16(0x284),
432 REG16(0x280),
433 REG16(0x27c),
434 REG16(0x278),
435 REG16(0x274),
436 REG16(0x270),
437
438 NOP(2),
439 LRI(2, POSTED),
440 REG16(0x5a8),
441 REG16(0x5ac),
442
443 NOP(6),
444 LRI(1, 0),
445 REG(0x0c8),
446
447 0
448 };
449
450 #define XE2_CTX_COMMON \
451 NOP(1), /* [0x00] */ \
452 LRI(15, POSTED), /* [0x01] */ \
453 REG16(0x244), /* [0x02] CTXT_SR_CTL */ \
454 REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \
455 REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \
456 REG(0x038), /* [0x08] RING_BUFFER_START */ \
457 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \
458 REG(0x168), /* [0x0c] BB_ADDR_UDW */ \
459 REG(0x140), /* [0x0e] BB_ADDR */ \
460 REG(0x110), /* [0x10] BB_STATE */ \
461 REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \
462 REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \
463 REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
464 REG(0x180), /* [0x18] CCID */ \
465 REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \
466 REG(0x120), /* [0x1c] PRT_BB_STATE */ \
467 REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \
468 \
469 NOP(1), /* [0x20] */ \
470 LRI(9, POSTED), /* [0x21] */ \
471 REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \
472 REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \
473 REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \
474 REG16(0x284), /* [0x28] dummy reg */ \
475 REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \
476 REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \
477 REG16(0x278), /* [0x2e] CS_CTX_ASID */ \
478 REG16(0x274), /* [0x30] PTBP_UDW */ \
479 REG16(0x270) /* [0x32] PTBP_LDW */
480
481 static const u8 xe2_rcs_offsets[] = {
482 XE2_CTX_COMMON,
483
484 NOP(2), /* [0x34] */
485 LRI(2, POSTED), /* [0x36] */
486 REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
487 REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */
488
489 NOP(6), /* [0x41] */
490 LRI(1, 0), /* [0x47] */
491 REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */
492
493 0
494 };
495
496 static const u8 xe2_bcs_offsets[] = {
497 XE2_CTX_COMMON,
498
499 NOP(4 + 8 + 1), /* [0x34] */
500 LRI(2, POSTED), /* [0x41] */
501 REG16(0x200), /* [0x42] BCS_SWCTRL */
502 REG16(0x204), /* [0x44] BLIT_CCTL */
503
504 0
505 };
506
507 static const u8 xe2_xcs_offsets[] = {
508 XE2_CTX_COMMON,
509
510 0
511 };
512
513 static const u8 xe2_indirect_ring_state_offsets[] = {
514 NOP(1), /* [0x00] */
515 LRI(5, POSTED), /* [0x01] */
516 REG(0x034), /* [0x02] RING_BUFFER_HEAD */
517 REG(0x030), /* [0x04] RING_BUFFER_TAIL */
518 REG(0x038), /* [0x06] RING_BUFFER_START */
519 REG(0x048), /* [0x08] RING_BUFFER_START_UDW */
520 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */
521
522 NOP(5), /* [0x0c] */
523 LRI(9, POSTED), /* [0x11] */
524 REG(0x168), /* [0x12] BB_ADDR_UDW */
525 REG(0x140), /* [0x14] BB_ADDR */
526 REG(0x110), /* [0x16] BB_STATE */
527 REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */
528 REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */
529 REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */
530 REG16(0x588), /* [0x24] BB_STACK_WRITE_PORT */
531 REG16(0x588), /* [0x26] BB_STACK_WRITE_PORT */
532 REG16(0x588), /* [0x28] BB_STACK_WRITE_PORT */
533
534 NOP(12), /* [0x00] */
535
536 0
537 };
538
539 #undef REG16
540 #undef REG
541 #undef LRI
542 #undef NOP
543
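/* Pick the per-platform offset table used to build the default LRC image. */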
static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
545 {
546 if (class == XE_ENGINE_CLASS_RENDER) {
547 if (GRAPHICS_VER(xe) >= 20)
548 return xe2_rcs_offsets;
549 else if (GRAPHICS_VERx100(xe) >= 1270)
550 return mtl_rcs_offsets;
551 else if (GRAPHICS_VERx100(xe) >= 1255)
552 return dg2_rcs_offsets;
553 else if (GRAPHICS_VERx100(xe) >= 1250)
554 return xehp_rcs_offsets;
555 else
556 return gen12_rcs_offsets;
557 } else if (class == XE_ENGINE_CLASS_COPY) {
558 if (GRAPHICS_VER(xe) >= 20)
559 return xe2_bcs_offsets;
560 else
561 return gen12_xcs_offsets;
562 } else {
563 if (GRAPHICS_VER(xe) >= 20)
564 return xe2_xcs_offsets;
565 else if (GRAPHICS_VERx100(xe) >= 1255)
566 return dg2_xcs_offsets;
567 else
568 return gen12_xcs_offsets;
569 }
570 }
571
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
573 {
574 regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
575 CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
576
577 if (xe_gt_has_indirect_ring_state(hwe->gt))
578 regs[CTX_CONTEXT_CONTROL] |=
579 _MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
580
581 /* TODO: Timestamp */
582 }
583
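/*
 * When the device delivers interrupts through memory (memirq), point the
 * context's interrupt mask, status-report and source-report registers at the
 * memirq pages so this context reports through them instead of the legacy
 * ring interrupt registers.
 */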
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
585 {
struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
587 struct xe_device *xe = gt_to_xe(hwe->gt);
588 u8 num_regs;
589
590 if (!xe_device_uses_memirq(xe))
591 return;
592
593 regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
594 MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
595 regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
596 regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
597
598 num_regs = xe_device_has_msix(xe) ? 3 : 2;
599 regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
600 MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
601 regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
602 regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
603 regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
604 regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
605
606 if (xe_device_has_msix(xe)) {
607 regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
608 /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
609 }
610 }
611
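/*
 * Dword position of the RING_MI_MODE entry in the context register state;
 * reset_stop_ring() writes the masked STOP_RING value at this offset + 1.
 */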
static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
613 {
614 struct xe_device *xe = gt_to_xe(hwe->gt);
615
616 if (GRAPHICS_VERx100(xe) >= 1250)
617 return 0x70;
618 else
619 return 0x60;
620 }
621
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
623 {
624 int x;
625
626 x = lrc_ring_mi_mode(hwe);
627 regs[x + 1] &= ~STOP_RING;
628 regs[x + 1] |= STOP_RING << 16;
629 }
630
static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
632 {
633 return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
634 }
635
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
637 {
638 return 0;
639 }
640
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
642 {
643 return lrc->ring.size;
644 }
645
646 /* Make the magic macros work */
647 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
648 #define __xe_lrc_regs_offset xe_lrc_regs_offset
649
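/*
 * Layout of the driver-defined portion of the per-process HW status page
 * (PPHWSP): byte offsets of the seqno, start seqno, job timestamp, parallel
 * scratch area and engine id, plus the size of the page itself.
 */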
650 #define LRC_SEQNO_PPHWSP_OFFSET 512
651 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
652 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
653 #define LRC_PARALLEL_PPHWSP_OFFSET 2048
654 #define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
655 #define LRC_PPHWSP_SIZE SZ_4K
656
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
658 {
659 return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
660 }
661
static size_t lrc_reg_size(struct xe_device *xe)
663 {
664 if (GRAPHICS_VERx100(xe) >= 1250)
665 return 96 * sizeof(u32);
666 else
667 return 80 * sizeof(u32);
668 }
669
size_t xe_lrc_skip_size(struct xe_device *xe)
671 {
672 return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
673 }
674
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
676 {
677 /* The seqno is stored in the driver-defined portion of PPHWSP */
678 return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
679 }
680
static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
682 {
683 /* The start seqno is stored in the driver-defined portion of PPHWSP */
684 return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
685 }
686
static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
688 {
689 /* This is stored in the driver-defined portion of PPHWSP */
690 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
691 }
692
static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
694 {
695 /* The parallel is stored in the driver-defined portion of PPHWSP */
696 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
697 }
698
static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
700 {
701 return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
702 }
703
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
705 {
706 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
707 }
708
static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
710 {
711 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
712 }
713
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
715 {
716 /* Indirect ring state page is at the very end of LRC */
717 return lrc->size - LRC_INDIRECT_RING_STATE_SIZE;
718 }
719
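/*
 * DECL_MAP_ADDR_HELPERS(elem) generates two accessors from the matching
 * __xe_lrc_<elem>_offset() helper above: __xe_lrc_<elem>_map() returns an
 * iosys_map pointing at the element inside the LRC BO, and
 * __xe_lrc_<elem>_ggtt_addr() returns the element's GGTT address.
 */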
720 #define DECL_MAP_ADDR_HELPERS(elem) \
721 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
722 { \
723 struct iosys_map map = lrc->bo->vmap; \
724 \
725 xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
726 iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
727 return map; \
728 } \
729 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
730 { \
731 return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
732 } \
733
734 DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
736 DECL_MAP_ADDR_HELPERS(seqno)
737 DECL_MAP_ADDR_HELPERS(regs)
738 DECL_MAP_ADDR_HELPERS(start_seqno)
739 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
740 DECL_MAP_ADDR_HELPERS(ctx_timestamp)
741 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
742 DECL_MAP_ADDR_HELPERS(parallel)
743 DECL_MAP_ADDR_HELPERS(indirect_ring)
744 DECL_MAP_ADDR_HELPERS(engine_id)
745
746 #undef DECL_MAP_ADDR_HELPERS
747
748 /**
749 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
750 * @lrc: Pointer to the lrc.
751 *
752 * Returns: ctx timestamp GGTT address
753 */
754 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
755 {
756 return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
757 }
758
759 /**
760 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
761 * @lrc: Pointer to the lrc.
762 *
763 * Returns: ctx timestamp udw GGTT address
764 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
766 {
767 return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
768 }
769
770 /**
771 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
772 * @lrc: Pointer to the lrc.
773 *
774 * Returns: ctx timestamp value
775 */
u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
777 {
778 struct xe_device *xe = lrc_to_xe(lrc);
779 struct iosys_map map;
780 u32 ldw, udw = 0;
781
782 map = __xe_lrc_ctx_timestamp_map(lrc);
783 ldw = xe_map_read32(xe, &map);
784
785 if (xe->info.has_64bit_timestamp) {
786 map = __xe_lrc_ctx_timestamp_udw_map(lrc);
787 udw = xe_map_read32(xe, &map);
788 }
789
790 return (u64)udw << 32 | ldw;
791 }
792
793 /**
794 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
795 * @lrc: Pointer to the lrc.
796 *
* Returns: ctx job timestamp GGTT address
798 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
800 {
801 return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
802 }
803
804 /**
805 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
806 * @lrc: Pointer to the lrc.
807 *
* Returns: ctx job timestamp value
809 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
811 {
812 struct xe_device *xe = lrc_to_xe(lrc);
813 struct iosys_map map;
814
815 map = __xe_lrc_ctx_job_timestamp_map(lrc);
816 return xe_map_read32(xe, &map);
817 }
818
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
820 {
821 return __xe_lrc_pphwsp_ggtt_addr(lrc);
822 }
823
u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
825 {
826 if (!xe_lrc_has_indirect_ring_state(lrc))
827 return 0;
828
829 return __xe_lrc_indirect_ring_ggtt_addr(lrc);
830 }
831
static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
833 {
834 struct xe_device *xe = lrc_to_xe(lrc);
835 struct iosys_map map;
836
837 map = __xe_lrc_indirect_ring_map(lrc);
838 iosys_map_incr(&map, reg_nr * sizeof(u32));
839 return xe_map_read32(xe, &map);
840 }
841
static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
843 int reg_nr, u32 val)
844 {
845 struct xe_device *xe = lrc_to_xe(lrc);
846 struct iosys_map map;
847
848 map = __xe_lrc_indirect_ring_map(lrc);
849 iosys_map_incr(&map, reg_nr * sizeof(u32));
850 xe_map_write32(xe, &map, val);
851 }
852
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
854 {
855 struct xe_device *xe = lrc_to_xe(lrc);
856 struct iosys_map map;
857
858 map = __xe_lrc_regs_map(lrc);
859 iosys_map_incr(&map, reg_nr * sizeof(u32));
860 return xe_map_read32(xe, &map);
861 }
862
void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
864 {
865 struct xe_device *xe = lrc_to_xe(lrc);
866 struct iosys_map map;
867
868 map = __xe_lrc_regs_map(lrc);
869 iosys_map_incr(&map, reg_nr * sizeof(u32));
870 xe_map_write32(xe, &map, val);
871 }
872
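/*
 * Build a default LRC image for this engine class in a temporary kernel
 * buffer: a zeroed PPHWSP followed by the register state populated by
 * set_offsets() and friends, plus the indirect ring state page on platforms
 * that have one.
 */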
static void *empty_lrc_data(struct xe_hw_engine *hwe)
874 {
875 struct xe_gt *gt = hwe->gt;
876 void *data;
877 u32 *regs;
878
879 data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
880 if (!data)
881 return NULL;
882
883 /* 1st page: Per-Process of HW status Page */
884 regs = data + LRC_PPHWSP_SIZE;
885 set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
886 set_context_control(regs, hwe);
887 set_memory_based_intr(regs, hwe);
888 reset_stop_ring(regs, hwe);
889 if (xe_gt_has_indirect_ring_state(gt)) {
890 regs = data + xe_gt_lrc_size(gt, hwe->class) -
891 LRC_INDIRECT_RING_STATE_SIZE;
892 set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
893 }
894
895 return data;
896 }
897
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
899 {
900 u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
901
902 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
903 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
904 }
905
static void xe_lrc_finish(struct xe_lrc *lrc)
907 {
908 xe_hw_fence_ctx_finish(&lrc->fence_ctx);
909 xe_bo_lock(lrc->bo, false);
910 xe_bo_unpin(lrc->bo);
911 xe_bo_unlock(lrc->bo);
912 xe_bo_put(lrc->bo);
913 xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
914 }
915
916 /*
* xe_lrc_setup_utilization() - Set up the WA BB to assist in calculating active
918 * context run ticks.
919 * @lrc: Pointer to the lrc.
920 *
921 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
922 * context, but only gets updated when the context switches out. In order to
923 * check how long a context has been active before it switches out, two things
924 * are required:
925 *
926 * (1) Determine if the context is running:
927 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
928 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
929 * initialized. During a query, we just check for this value to determine if the
930 * context is active. If the context switched out, it would overwrite this
931 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
932 * the last part of context restore, so reusing this LRC location will not
933 * clobber anything.
934 *
935 * (2) Calculate the time that the context has been active for:
936 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
937 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
938 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
939 * engine instance. Since we do not know which instance the context is running
940 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
* store it in the PPHWSP.
942 */
943 #define CONTEXT_ACTIVE 1ULL
static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
945 {
946 u32 *cmd;
947
948 cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
949
950 *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
951 *cmd++ = ENGINE_ID(0).addr;
952 *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
953 *cmd++ = 0;
954
955 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
956 *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
957 *cmd++ = 0;
958 *cmd++ = lower_32_bits(CONTEXT_ACTIVE);
959
960 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
961 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
962 *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
963 *cmd++ = 0;
964 *cmd++ = upper_32_bits(CONTEXT_ACTIVE);
965 }
966
967 *cmd++ = MI_BATCH_BUFFER_END;
968
969 xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
970 xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
971
972 }
973
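/*
 * Dword positions of the ASID and access counter threshold entries in the
 * context register state (they match the [0x2e] CS_CTX_ASID and [0x2a]
 * CS_ACC_CTR_THOLD annotations in XE2_CTX_COMMON); the +1 selects the value
 * dword that follows each register offset in the LRI list.
 */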
974 #define PVC_CTX_ASID (0x2e + 1)
975 #define PVC_CTX_ACC_CTR_THOLD (0x2a + 1)
976
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
978 struct xe_vm *vm, u32 ring_size, u16 msix_vec,
979 u32 init_flags)
980 {
981 struct xe_gt *gt = hwe->gt;
982 struct xe_tile *tile = gt_to_tile(gt);
983 struct xe_device *xe = gt_to_xe(gt);
984 struct iosys_map map;
985 void *init_data = NULL;
986 u32 arb_enable;
987 u32 lrc_size;
988 u32 bo_flags;
989 int err;
990
991 kref_init(&lrc->refcount);
992 lrc->gt = gt;
993 lrc->flags = 0;
994 lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
995 if (xe_gt_has_indirect_ring_state(gt))
996 lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
997
998 bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
999 XE_BO_FLAG_GGTT_INVALIDATE;
1000
1001 /*
1002 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
1003 * via VM bind calls.
1004 */
1005 lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size,
1006 ttm_bo_type_kernel,
1007 bo_flags);
1008 if (IS_ERR(lrc->bo))
1009 return PTR_ERR(lrc->bo);
1010
1011 lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
1012 ttm_bo_type_kernel,
1013 bo_flags);
1014 if (IS_ERR(lrc->bb_per_ctx_bo)) {
1015 err = PTR_ERR(lrc->bb_per_ctx_bo);
1016 goto err_lrc_finish;
1017 }
1018
1019 lrc->size = lrc_size;
1020 lrc->ring.size = ring_size;
1021 lrc->ring.tail = 0;
1022
1023 xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
1024 hwe->fence_irq, hwe->name);
1025
1026 if (!gt->default_lrc[hwe->class]) {
1027 init_data = empty_lrc_data(hwe);
1028 if (!init_data) {
1029 err = -ENOMEM;
1030 goto err_lrc_finish;
1031 }
1032 }
1033
1034 /*
1035 * Init Per-Process of HW status Page, LRC / context state to known
1036 * values
1037 */
1038 map = __xe_lrc_pphwsp_map(lrc);
1039 if (!init_data) {
1040 xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
1041 xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
1042 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
1043 xe_gt_lrc_size(gt, hwe->class) - LRC_PPHWSP_SIZE);
1044 } else {
1045 xe_map_memcpy_to(xe, &map, 0, init_data,
1046 xe_gt_lrc_size(gt, hwe->class));
1047 kfree(init_data);
1048 }
1049
1050 if (vm) {
1051 xe_lrc_set_ppgtt(lrc, vm);
1052
1053 if (vm->xef)
1054 xe_drm_client_add_bo(vm->xef->client, lrc->bo);
1055 }
1056
1057 if (xe_device_has_msix(xe)) {
1058 xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
1059 xe_memirq_status_ptr(&tile->memirq, hwe));
1060 xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
1061 xe_memirq_source_ptr(&tile->memirq, hwe));
1062 xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
1063 }
1064
1065 if (xe_gt_has_indirect_ring_state(gt)) {
1066 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1067 __xe_lrc_indirect_ring_ggtt_addr(lrc));
1068
1069 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1070 __xe_lrc_ring_ggtt_addr(lrc));
1071 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
1072 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
1073 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
1074 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
1075 RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1076 } else {
1077 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1078 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
1079 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
1080 xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
1081 RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
1082 }
1083
1084 if (init_flags & XE_LRC_CREATE_RUNALONE)
1085 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1086 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1087 _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
1088
1089 if (init_flags & XE_LRC_CREATE_PXP)
1090 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
1091 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
1092 _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
1093
1094 lrc->ctx_timestamp = 0;
1095 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1096 if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1097 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
1098
1099 if (xe->info.has_asid && vm)
1100 xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
1101
1102 lrc->desc = LRC_VALID;
1103 lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
1104 /* TODO: Priority */
1105
1106 /* While this appears to have something about privileged batches or
1107 * some such, it really just means PPGTT mode.
1108 */
1109 if (vm)
1110 lrc->desc |= LRC_PRIVILEGE;
1111
1112 if (GRAPHICS_VERx100(xe) < 1250) {
1113 lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
1114 lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
1115 }
1116
1117 arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1118 xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
1119
1120 map = __xe_lrc_seqno_map(lrc);
1121 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1122
1123 map = __xe_lrc_start_seqno_map(lrc);
1124 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
1125
1126 xe_lrc_setup_utilization(lrc);
1127
1128 return 0;
1129
1130 err_lrc_finish:
1131 xe_lrc_finish(lrc);
1132 return err;
1133 }
1134
1135 /**
1136 * xe_lrc_create - Create a LRC
1137 * @hwe: Hardware Engine
1138 * @vm: The VM (address space)
1139 * @ring_size: LRC ring size
1140 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1141 * @flags: LRC initialization flags
1142 *
1143 * Allocate and initialize the Logical Ring Context (LRC).
1144 *
1145 * Return pointer to created LRC upon success and an error pointer
1146 * upon failure.
1147 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1149 u32 ring_size, u16 msix_vec, u32 flags)
1150 {
1151 struct xe_lrc *lrc;
1152 int err;
1153
1154 lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
1155 if (!lrc)
1156 return ERR_PTR(-ENOMEM);
1157
1158 err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
1159 if (err) {
1160 kfree(lrc);
1161 return ERR_PTR(err);
1162 }
1163
1164 return lrc;
1165 }
1166
1167 /**
1168 * xe_lrc_destroy - Destroy the LRC
1169 * @ref: reference to LRC
1170 *
1171 * Called when ref == 0, release resources held by the Logical Ring Context
1172 * (LRC) and free the LRC memory.
1173 */
void xe_lrc_destroy(struct kref *ref)
1175 {
1176 struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1177
1178 xe_lrc_finish(lrc);
1179 kfree(lrc);
1180 }
1181
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1183 {
1184 if (xe_lrc_has_indirect_ring_state(lrc))
1185 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1186 else
1187 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1188 }
1189
u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1191 {
1192 if (xe_lrc_has_indirect_ring_state(lrc))
1193 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1194 else
1195 return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1196 }
1197
static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1199 {
1200 if (xe_lrc_has_indirect_ring_state(lrc))
1201 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1202 else
1203 return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1204 }
1205
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1207 {
1208 if (xe_lrc_has_indirect_ring_state(lrc))
1209 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1210 else
1211 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1212 }
1213
u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1215 {
1216 if (xe_lrc_has_indirect_ring_state(lrc))
1217 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1218 else
1219 return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1220 }
1221
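/*
 * Free space in the ring, in bytes. The ring size is assumed to be a power
 * of two so the mask arithmetic wraps correctly; an empty ring (head == tail)
 * reports the full ring size.
 */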
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1223 {
1224 const u32 head = xe_lrc_ring_head(lrc);
1225 const u32 tail = lrc->ring.tail;
1226 const u32 size = lrc->ring.size;
1227
1228 return ((head - tail - 1) & (size - 1)) + 1;
1229 }
1230
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
1232 const void *data, size_t size)
1233 {
1234 struct xe_device *xe = lrc_to_xe(lrc);
1235
1236 iosys_map_incr(&ring, lrc->ring.tail);
1237 xe_map_memcpy_to(xe, &ring, 0, data, size);
1238 lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
1239 }
1240
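/*
 * Copy command dwords into the ring, splitting the copy when it wraps past
 * the end of the ring buffer. Writes must be a multiple of 4 bytes; a
 * trailing MI_NOOP is appended when needed to keep the tail 8-byte aligned.
 */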
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
1242 {
1243 struct xe_device *xe = lrc_to_xe(lrc);
1244 struct iosys_map ring;
1245 u32 rhs;
1246 size_t aligned_size;
1247
1248 xe_assert(xe, IS_ALIGNED(size, 4));
1249 aligned_size = ALIGN(size, 8);
1250
1251 ring = __xe_lrc_ring_map(lrc);
1252
1253 xe_assert(xe, lrc->ring.tail < lrc->ring.size);
1254 rhs = lrc->ring.size - lrc->ring.tail;
1255 if (size > rhs) {
1256 __xe_lrc_write_ring(lrc, ring, data, rhs);
1257 __xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
1258 } else {
1259 __xe_lrc_write_ring(lrc, ring, data, size);
1260 }
1261
1262 if (aligned_size > size) {
1263 u32 noop = MI_NOOP;
1264
1265 __xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
1266 }
1267 }
1268
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1270 {
1271 return lrc->desc | xe_lrc_ggtt_addr(lrc);
1272 }
1273
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
1275 {
1276 return __xe_lrc_seqno_ggtt_addr(lrc);
1277 }
1278
1279 /**
1280 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1281 *
1282 * Allocate but don't initialize an lrc seqno fence.
1283 *
1284 * Return: Pointer to the allocated fence or
1285 * negative error pointer on error.
1286 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1288 {
1289 return xe_hw_fence_alloc();
1290 }
1291
1292 /**
1293 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1294 * @fence: Pointer to the fence to free.
1295 *
1296 * Frees an lrc seqno fence that hasn't yet been
1297 * initialized.
1298 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1300 {
1301 xe_hw_fence_free(fence);
1302 }
1303
1304 /**
1305 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1306 * @lrc: Pointer to the lrc.
1307 * @fence: Pointer to the fence to initialize.
1308 *
1309 * Initializes a pre-allocated lrc seqno fence.
1310 * After initialization, the fence is subject to normal
1311 * dma-fence refcounting.
1312 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1314 {
1315 xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1316 }
1317
s32 xe_lrc_seqno(struct xe_lrc *lrc)
1319 {
1320 struct iosys_map map = __xe_lrc_seqno_map(lrc);
1321
1322 return xe_map_read32(lrc_to_xe(lrc), &map);
1323 }
1324
s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1326 {
1327 struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1328
1329 return xe_map_read32(lrc_to_xe(lrc), &map);
1330 }
1331
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
1333 {
1334 return __xe_lrc_start_seqno_ggtt_addr(lrc);
1335 }
1336
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
1338 {
1339 return __xe_lrc_parallel_ggtt_addr(lrc);
1340 }
1341
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
1343 {
1344 return __xe_lrc_parallel_map(lrc);
1345 }
1346
1347 /**
1348 * xe_lrc_engine_id() - Read engine id value
1349 * @lrc: Pointer to the lrc.
1350 *
* Returns: engine id value
1352 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1354 {
1355 struct xe_device *xe = lrc_to_xe(lrc);
1356 struct iosys_map map;
1357
1358 map = __xe_lrc_engine_id_map(lrc);
1359 return xe_map_read32(xe, &map);
1360 }
1361
static int instr_dw(u32 cmd_header)
1363 {
1364 /* GFXPIPE "SINGLE_DW" opcodes are a single dword */
1365 if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
1366 GFXPIPE_SINGLE_DW_CMD(0, 0))
1367 return 1;
1368
1369 /* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
1370 if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
1371 return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
1372
1373 /* Most instructions have the # of dwords (minus 2) in 7:0 */
1374 return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
1375 }
1376
static int dump_mi_command(struct drm_printer *p,
1378 struct xe_gt *gt,
1379 u32 *dw,
1380 int remaining_dw)
1381 {
1382 u32 inst_header = *dw;
1383 u32 numdw = instr_dw(inst_header);
1384 u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1385 int num_noop;
1386
1387 /* First check for commands that don't have/use a '# DW' field */
1388 switch (inst_header & MI_OPCODE) {
1389 case MI_NOOP:
1390 num_noop = 1;
1391 while (num_noop < remaining_dw &&
1392 (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1393 num_noop++;
1394 drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
1395 return num_noop;
1396
1397 case MI_TOPOLOGY_FILTER:
1398 drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
1399 return 1;
1400
1401 case MI_BATCH_BUFFER_END:
1402 drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
1403 /* Return 'remaining_dw' to consume the rest of the LRC */
1404 return remaining_dw;
1405 }
1406
1407 /*
1408 * Any remaining commands include a # of dwords. We should make sure
1409 * it doesn't exceed the remaining size of the LRC.
1410 */
1411 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1412 numdw = remaining_dw;
1413
1414 switch (inst_header & MI_OPCODE) {
1415 case MI_LOAD_REGISTER_IMM:
1416 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1417 inst_header, (numdw - 1) / 2);
1418 for (int i = 1; i < numdw; i += 2)
1419 drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
1420 return numdw;
1421
1422 case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1423 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1424 inst_header,
1425 dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1426 dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1427 if (numdw == 4)
1428 drm_printf(p, " - %#6x = %#010llx\n",
1429 dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1430 else
1431 drm_printf(p, " - %*ph (%s)\n",
1432 (int)sizeof(u32) * (numdw - 1), dw + 1,
1433 numdw < 4 ? "truncated" : "malformed");
1434 return numdw;
1435
1436 case MI_FORCE_WAKEUP:
1437 drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
1438 return numdw;
1439
1440 default:
1441 drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
1442 inst_header, opcode, numdw);
1443 return numdw;
1444 }
1445 }
1446
static int dump_gfxpipe_command(struct drm_printer *p,
1448 struct xe_gt *gt,
1449 u32 *dw,
1450 int remaining_dw)
1451 {
1452 u32 numdw = instr_dw(*dw);
1453 u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
1454 u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
1455 u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
1456
1457 /*
1458 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1459 * remaining size of the LRC.
1460 */
1461 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1462 numdw = remaining_dw;
1463
1464 switch (*dw & GFXPIPE_MATCH_MASK) {
1465 #define MATCH(cmd) \
1466 case cmd: \
1467 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1468 return numdw
1469 #define MATCH3D(cmd) \
1470 case CMD_##cmd: \
1471 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
1472 return numdw
1473
1474 MATCH(STATE_BASE_ADDRESS);
1475 MATCH(STATE_SIP);
1476 MATCH(GPGPU_CSR_BASE_ADDRESS);
1477 MATCH(STATE_COMPUTE_MODE);
1478 MATCH3D(3DSTATE_BTD);
1479 MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
1480 MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
1481
1482 MATCH3D(3DSTATE_VF_STATISTICS);
1483
1484 MATCH(PIPELINE_SELECT);
1485
1486 MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1487 MATCH3D(3DSTATE_CLEAR_PARAMS);
1488 MATCH3D(3DSTATE_DEPTH_BUFFER);
1489 MATCH3D(3DSTATE_STENCIL_BUFFER);
1490 MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1491 MATCH3D(3DSTATE_VERTEX_BUFFERS);
1492 MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1493 MATCH3D(3DSTATE_INDEX_BUFFER);
1494 MATCH3D(3DSTATE_VF);
1495 MATCH3D(3DSTATE_MULTISAMPLE);
1496 MATCH3D(3DSTATE_CC_STATE_POINTERS);
1497 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1498 MATCH3D(3DSTATE_VS);
1499 MATCH3D(3DSTATE_GS);
1500 MATCH3D(3DSTATE_CLIP);
1501 MATCH3D(3DSTATE_SF);
1502 MATCH3D(3DSTATE_WM);
1503 MATCH3D(3DSTATE_CONSTANT_VS);
1504 MATCH3D(3DSTATE_CONSTANT_GS);
1505 MATCH3D(3DSTATE_CONSTANT_PS);
1506 MATCH3D(3DSTATE_SAMPLE_MASK);
1507 MATCH3D(3DSTATE_CONSTANT_HS);
1508 MATCH3D(3DSTATE_CONSTANT_DS);
1509 MATCH3D(3DSTATE_HS);
1510 MATCH3D(3DSTATE_TE);
1511 MATCH3D(3DSTATE_DS);
1512 MATCH3D(3DSTATE_STREAMOUT);
1513 MATCH3D(3DSTATE_SBE);
1514 MATCH3D(3DSTATE_PS);
1515 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1516 MATCH3D(3DSTATE_CPS_POINTERS);
1517 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1518 MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1519 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1520 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1521 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1522 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1523 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1524 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1525 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1526 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1527 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1528 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1529 MATCH3D(3DSTATE_VF_INSTANCING);
1530 MATCH3D(3DSTATE_VF_SGVS);
1531 MATCH3D(3DSTATE_VF_TOPOLOGY);
1532 MATCH3D(3DSTATE_WM_CHROMAKEY);
1533 MATCH3D(3DSTATE_PS_BLEND);
1534 MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1535 MATCH3D(3DSTATE_PS_EXTRA);
1536 MATCH3D(3DSTATE_RASTER);
1537 MATCH3D(3DSTATE_SBE_SWIZ);
1538 MATCH3D(3DSTATE_WM_HZ_OP);
1539 MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1540 MATCH3D(3DSTATE_VF_SGVS_2);
1541 MATCH3D(3DSTATE_VFG);
1542 MATCH3D(3DSTATE_URB_ALLOC_VS);
1543 MATCH3D(3DSTATE_URB_ALLOC_HS);
1544 MATCH3D(3DSTATE_URB_ALLOC_DS);
1545 MATCH3D(3DSTATE_URB_ALLOC_GS);
1546 MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1547 MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1548 MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1549 MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1550 MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1551 MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1552 MATCH3D(3DSTATE_AMFS);
1553 MATCH3D(3DSTATE_DEPTH_BOUNDS);
1554 MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1555 MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1556 MATCH3D(3DSTATE_MESH_CONTROL);
1557 MATCH3D(3DSTATE_MESH_DISTRIB);
1558 MATCH3D(3DSTATE_TASK_REDISTRIB);
1559 MATCH3D(3DSTATE_MESH_SHADER);
1560 MATCH3D(3DSTATE_MESH_SHADER_DATA);
1561 MATCH3D(3DSTATE_TASK_CONTROL);
1562 MATCH3D(3DSTATE_TASK_SHADER);
1563 MATCH3D(3DSTATE_TASK_SHADER_DATA);
1564 MATCH3D(3DSTATE_URB_ALLOC_MESH);
1565 MATCH3D(3DSTATE_URB_ALLOC_TASK);
1566 MATCH3D(3DSTATE_CLIP_MESH);
1567 MATCH3D(3DSTATE_SBE_MESH);
1568 MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1569
1570 MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1571 MATCH3D(3DSTATE_CHROMA_KEY);
1572 MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1573 MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1574 MATCH3D(3DSTATE_LINE_STIPPLE);
1575 MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1576 MATCH3D(3DSTATE_MONOFILTER_SIZE);
1577 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1578 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1579 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1580 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1581 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1582 MATCH3D(3DSTATE_SO_DECL_LIST);
1583 MATCH3D(3DSTATE_SO_BUFFER);
1584 MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1585 MATCH3D(3DSTATE_SAMPLE_PATTERN);
1586 MATCH3D(3DSTATE_3D_MODE);
1587 MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1588 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1589 MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1590
1591 default:
1592 drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1593 *dw, pipeline, opcode, subopcode, numdw);
1594 return numdw;
1595 }
1596 }
1597
static int dump_gfx_state_command(struct drm_printer *p,
1599 struct xe_gt *gt,
1600 u32 *dw,
1601 int remaining_dw)
1602 {
1603 u32 numdw = instr_dw(*dw);
1604 u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);
1605
1606 /*
1607 * Make sure we haven't mis-parsed a number of dwords that exceeds the
1608 * remaining size of the LRC.
1609 */
1610 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1611 numdw = remaining_dw;
1612
1613 switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
1614 MATCH(STATE_WRITE_INLINE);
1615
1616 default:
1617 drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
1618 *dw, opcode, numdw);
1619 return numdw;
1620 }
1621 }
1622
void xe_lrc_dump_default(struct drm_printer *p,
1624 struct xe_gt *gt,
1625 enum xe_engine_class hwe_class)
1626 {
1627 u32 *dw;
1628 int remaining_dw, num_dw;
1629
1630 if (!gt->default_lrc[hwe_class]) {
1631 drm_printf(p, "No default LRC for class %d\n", hwe_class);
1632 return;
1633 }
1634
1635 /*
1636 * Skip the beginning of the LRC since it contains the per-process
1637 * hardware status page.
1638 */
1639 dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1640 remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
1641
1642 while (remaining_dw > 0) {
1643 if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1644 num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1645 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1646 num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1647 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1648 num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1649 } else {
1650 num_dw = min(instr_dw(*dw), remaining_dw);
1651 drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1652 *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1653 num_dw);
1654 }
1655
1656 dw += num_dw;
1657 remaining_dw -= num_dw;
1658 }
1659 }
1660
1661 struct instr_state {
1662 u32 instr;
1663 u16 num_dw;
1664 };
1665
1666 static const struct instr_state xe_hpg_svg_state[] = {
1667 { .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1668 { .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1669 { .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1670 { .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1671 { .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1672 { .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1673 { .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1674 { .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1675 { .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1676 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1677 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1678 { .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1679 { .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1680 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1681 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1682 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1683 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1684 { .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1685 { .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1686 { .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1687 { .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1688 { .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1689 { .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1690 { .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1691 { .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1692 { .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1693 { .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1694 { .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1695 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1696 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1697 { .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1698 { .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1699 { .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1700 { .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1701 { .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1702 { .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1703 { .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1704 { .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1705 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1706 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1707 { .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1708 { .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1709 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1710 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1711 { .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1712 { .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1713 { .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1714 { .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1715 { .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1716 { .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1717 };
1718
void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1720 {
1721 struct xe_gt *gt = q->hwe->gt;
1722 struct xe_device *xe = gt_to_xe(gt);
1723 const struct instr_state *state_table = NULL;
1724 int state_table_size = 0;
1725
1726 /*
1727 * Wa_14019789679
1728 *
1729 * If the driver doesn't explicitly emit the SVG instructions while
1730 * setting up the default LRC, the context switch will write 0's
1731 * (noops) into the LRC memory rather than the expected instruction
1732 * headers. Application contexts start out as a copy of the default
1733 * LRC, and if they also do not emit specific settings for some SVG
1734 * state, then on context restore they'll unintentionally inherit
1735 * whatever state setting the previous context had programmed into the
1736 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
1737 * prevent the hardware from resetting that state back to any specific
1738 * value).
1739 *
1740 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
1741 * since that's a specific state setting that can easily cause GPU
1742 * hangs if unintentionally inherited. However to be safe we'll
1743 * continue to emit all of the SVG state since it's best not to leak
1744 * any of the state between contexts, even if that leakage is harmless.
1745 */
1746 if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
1747 state_table = xe_hpg_svg_state;
1748 state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1749 }
1750
1751 if (!state_table) {
1752 xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1753 GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1754 return;
1755 }
1756
1757 for (int i = 0; i < state_table_size; i++) {
1758 u32 instr = state_table[i].instr;
1759 u16 num_dw = state_table[i].num_dw;
1760 bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1761
1762 xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1763 xe_gt_assert(gt, num_dw != 0);
1764 xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1765
1766 /*
1767 * Xe2's SVG context is the same as the one on DG2 / MTL
1768 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1769 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1770 * Just make the replacement here rather than defining a
1771 * whole separate table for the single trivial change.
1772 */
1773 if (GRAPHICS_VER(xe) >= 20 &&
1774 instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1775 instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1776
1777 bb->cs[bb->len] = instr;
1778 if (!is_single_dw)
1779 bb->cs[bb->len] |= (num_dw - 2);
1780
1781 bb->len += num_dw;
1782 }
1783 }
1784
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
1786 {
1787 struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
1788
1789 if (!snapshot)
1790 return NULL;
1791
1792 if (lrc->bo->vm)
1793 xe_vm_get(lrc->bo->vm);
1794
1795 snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
1796 snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
1797 snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
1798 snapshot->head = xe_lrc_ring_head(lrc);
1799 snapshot->tail.internal = lrc->ring.tail;
1800 snapshot->tail.memory = xe_lrc_ring_tail(lrc);
1801 snapshot->start = xe_lrc_ring_start(lrc);
1802 snapshot->start_seqno = xe_lrc_start_seqno(lrc);
1803 snapshot->seqno = xe_lrc_seqno(lrc);
1804 snapshot->lrc_bo = xe_bo_get(lrc->bo);
1805 snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
1806 snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
1807 snapshot->lrc_snapshot = NULL;
1808 snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
1809 snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
1810 return snapshot;
1811 }
1812
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
1814 {
1815 struct xe_bo *bo;
1816 struct xe_vm *vm;
1817 struct iosys_map src;
1818
1819 if (!snapshot)
1820 return;
1821
1822 bo = snapshot->lrc_bo;
1823 vm = bo->vm;
1824 snapshot->lrc_bo = NULL;
1825
1826 snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
1827 if (!snapshot->lrc_snapshot)
1828 goto put_bo;
1829
1830 xe_bo_lock(bo, false);
1831 if (!ttm_bo_vmap(&bo->ttm, &src)) {
1832 xe_map_memcpy_from(xe_bo_device(bo),
1833 snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
1834 snapshot->lrc_size);
1835 ttm_bo_vunmap(&bo->ttm, &src);
1836 } else {
1837 kvfree(snapshot->lrc_snapshot);
1838 snapshot->lrc_snapshot = NULL;
1839 }
1840 xe_bo_unlock(bo);
1841 put_bo:
1842 xe_bo_put(bo);
1843 if (vm)
1844 xe_vm_put(vm);
1845 }
1846
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
1848 {
1849 unsigned long i;
1850
1851 if (!snapshot)
1852 return;
1853
1854 drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
1855 drm_printf(p, "\tHW Ring address: 0x%08x\n",
1856 snapshot->ring_addr);
1857 drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
1858 snapshot->indirect_context_desc);
1859 drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
1860 drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
1861 snapshot->tail.internal, snapshot->tail.memory);
1862 drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
1863 drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
1864 drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
1865 drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
1866 drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
1867
1868 if (!snapshot->lrc_snapshot)
1869 return;
1870
1871 drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
1872 drm_puts(p, "\t[HWSP].data: ");
1873 for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
1874 u32 *val = snapshot->lrc_snapshot + i;
1875 char dumped[ASCII85_BUFSZ];
1876
1877 drm_puts(p, ascii85_encode(*val, dumped));
1878 }
1879
1880 drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
1881 drm_puts(p, "\t[HWCTX].data: ");
1882 for (; i < snapshot->lrc_size; i += sizeof(u32)) {
1883 u32 *val = snapshot->lrc_snapshot + i;
1884 char dumped[ASCII85_BUFSZ];
1885
1886 drm_puts(p, ascii85_encode(*val, dumped));
1887 }
1888 drm_puts(p, "\n");
1889 }
1890
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
1892 {
1893 if (!snapshot)
1894 return;
1895
1896 kvfree(snapshot->lrc_snapshot);
1897 if (snapshot->lrc_bo) {
1898 struct xe_vm *vm;
1899
1900 vm = snapshot->lrc_bo->vm;
1901 xe_bo_put(snapshot->lrc_bo);
1902 if (vm)
1903 xe_vm_put(vm);
1904 }
1905 kfree(snapshot);
1906 }
1907
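/*
 * Read the CTX_TIMESTAMP MMIO register of the engine identified by the
 * class/instance packed into @engine_id. Returns -1 if the lookup does not
 * resolve to a usable engine.
 */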
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
1909 {
1910 u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
1911 u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
1912 struct xe_hw_engine *hwe;
1913 u64 val;
1914
1915 hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
1916 if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
1917 "Unexpected engine class:instance %d:%d for context utilization\n",
1918 class, instance))
1919 return -1;
1920
1921 if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1922 val = xe_mmio_read64_2x32(&hwe->gt->mmio,
1923 RING_CTX_TIMESTAMP(hwe->mmio_base));
1924 else
1925 val = xe_mmio_read32(&hwe->gt->mmio,
1926 RING_CTX_TIMESTAMP(hwe->mmio_base));
1927
1928 *reg_ctx_ts = val;
1929
1930 return 0;
1931 }
1932
1933 /**
1934 * xe_lrc_update_timestamp() - Update ctx timestamp
1935 * @lrc: Pointer to the lrc.
1936 * @old_ts: Old timestamp value
1937 *
* Populate @old_ts with the current saved ctx timestamp, read the new ctx timestamp and
* update the saved value. With support for active contexts, the calculation may be
1940 * slightly racy, so follow a read-again logic to ensure that the context is
1941 * still active before returning the right timestamp.
1942 *
1943 * Returns: New ctx timestamp value
1944 */
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
1946 {
1947 u64 lrc_ts, reg_ts;
1948 u32 engine_id;
1949
1950 *old_ts = lrc->ctx_timestamp;
1951
1952 lrc_ts = xe_lrc_ctx_timestamp(lrc);
1953 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
1954 if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
1955 lrc->ctx_timestamp = lrc_ts;
1956 goto done;
1957 }
1958
1959 if (lrc_ts == CONTEXT_ACTIVE) {
1960 engine_id = xe_lrc_engine_id(lrc);
if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
1962 lrc->ctx_timestamp = reg_ts;
1963
1964 /* read lrc again to ensure context is still active */
1965 lrc_ts = xe_lrc_ctx_timestamp(lrc);
1966 }
1967
1968 /*
1969 * If context switched out, just use the lrc_ts. Note that this needs to
1970 * be a separate if condition.
1971 */
1972 if (lrc_ts != CONTEXT_ACTIVE)
1973 lrc->ctx_timestamp = lrc_ts;
1974
1975 done:
1976 trace_xe_lrc_update_timestamp(lrc, *old_ts);
1977
1978 return lrc->ctx_timestamp;
1979 }
1980
1981 /**
1982 * xe_lrc_ring_is_idle() - LRC is idle
1983 * @lrc: Pointer to the lrc.
1984 *
1985 * Compare LRC ring head and tail to determine if idle.
1986 *
* Return: True if the ring is idle, False otherwise
1988 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
1990 {
1991 return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
1992 }
1993