xref: /linux/drivers/gpu/drm/xe/xe_lrc.c (revision 4a57e0913e8c7fff407e97909f4ae48caa84d612) !
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_lrc.h"
7 
8 #include <generated/xe_wa_oob.h>
9 
10 #include <linux/ascii85.h>
11 #include <linux/panic.h>
12 
13 #include "instructions/xe_mi_commands.h"
14 #include "instructions/xe_gfxpipe_commands.h"
15 #include "instructions/xe_gfx_state_commands.h"
16 #include "regs/xe_engine_regs.h"
17 #include "regs/xe_gt_regs.h"
18 #include "regs/xe_lrc_layout.h"
19 #include "xe_bb.h"
20 #include "xe_bo.h"
21 #include "xe_configfs.h"
22 #include "xe_device.h"
23 #include "xe_drm_client.h"
24 #include "xe_exec_queue_types.h"
25 #include "xe_gt.h"
26 #include "xe_gt_printk.h"
27 #include "xe_hw_fence.h"
28 #include "xe_map.h"
29 #include "xe_memirq.h"
30 #include "xe_mmio.h"
31 #include "xe_ring_ops.h"
32 #include "xe_sriov.h"
33 #include "xe_trace_lrc.h"
34 #include "xe_vm.h"
35 #include "xe_wa.h"
36 
37 #define LRC_VALID				BIT_ULL(0)
38 #define LRC_PRIVILEGE				BIT_ULL(8)
39 #define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
40 #define LRC_LEGACY_64B_CONTEXT			3
41 
42 #define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
43 #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
44 
45 #define LRC_PPHWSP_SIZE				SZ_4K
46 #define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
47 #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
48 
49 #define LRC_PRIORITY				GENMASK_ULL(10, 9)
50 #define LRC_PRIORITY_LOW			0
51 #define LRC_PRIORITY_NORMAL			1
52 #define LRC_PRIORITY_HIGH			2
53 
54 /*
55  * Layout of the LRC and associated data allocated as
56  * lrc->bo:
57  *
58  *   Region                       Size
59  *  +============================+=================================+ <- __xe_lrc_ring_offset()
60  *  | Ring                       | ring_size, see                  |
61  *  |                            | xe_lrc_init()                   |
62  *  +============================+=================================+ <- __xe_lrc_pphwsp_offset()
63  *  | PPHWSP (includes SW state) | 4K                              |
64  *  +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
65  *  | Engine Context Image       | n * 4K, see                     |
66  *  |                            | xe_gt_lrc_size()                |
67  *  +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
68  *  | Indirect Ring State Page   | 0 or 4k, see                    |
69  *  |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
70  *  +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
71  *  | Indirect Context Page      | 0 or 4k, see                    |
72  *  |                            | XE_LRC_FLAG_INDIRECT_CTX        |
73  *  +============================+=================================+ <- __xe_lrc_wa_bb_offset()
74  *  | WA BB Per Ctx              | 4k                              |
75  *  +============================+=================================+ <- xe_bo_size(lrc->bo)
76  */
77 
78 static struct xe_device *
79 lrc_to_xe(struct xe_lrc *lrc)
80 {
81 	return gt_to_xe(lrc->fence_ctx.gt);
82 }
83 
84 static bool
85 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
86 {
87 	struct xe_device *xe = gt_to_xe(gt);
88 
89 	if (XE_GT_WA(gt, 16010904313) &&
90 	    (class == XE_ENGINE_CLASS_RENDER ||
91 	     class == XE_ENGINE_CLASS_COMPUTE))
92 		return true;
93 
94 	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
95 					       class, NULL))
96 		return true;
97 
98 	if (gt->ring_ops[class]->emit_aux_table_inv)
99 		return true;
100 
101 	return false;
102 }
103 
104 /**
105  * xe_gt_lrc_hang_replay_size() - Hang replay size
106  * @gt: The GT
107  * @class: Hardware engine class
108  *
109  * Determine size of GPU hang replay state for a GT and hardware engine class.
110  *
111  * Return: Size of GPU hang replay size
112  */
113 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
114 {
115 	struct xe_device *xe = gt_to_xe(gt);
116 	size_t size = 0;
117 
118 	/* Engine context image */
119 	switch (class) {
120 	case XE_ENGINE_CLASS_RENDER:
121 		if (GRAPHICS_VERx100(xe) >= 3510)
122 			size += 7 * SZ_4K;
123 		else if (GRAPHICS_VER(xe) >= 20)
124 			size += 3 * SZ_4K;
125 		else
126 			size += 13 * SZ_4K;
127 		break;
128 	case XE_ENGINE_CLASS_COMPUTE:
129 		if (GRAPHICS_VERx100(xe) >= 3510)
130 			size += 5 * SZ_4K;
131 		else if (GRAPHICS_VER(xe) >= 20)
132 			size += 2 * SZ_4K;
133 		else
134 			size += 13 * SZ_4K;
135 		break;
136 	default:
137 		WARN(1, "Unknown engine class: %d", class);
138 		fallthrough;
139 	case XE_ENGINE_CLASS_COPY:
140 	case XE_ENGINE_CLASS_VIDEO_DECODE:
141 	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
142 	case XE_ENGINE_CLASS_OTHER:
143 		size += 1 * SZ_4K;
144 	}
145 
146 	return size;
147 }
148 
149 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
150 {
151 	size_t size = xe_gt_lrc_hang_replay_size(gt, class);
152 
153 	/* Add indirect ring state page */
154 	if (xe_gt_has_indirect_ring_state(gt))
155 		size += LRC_INDIRECT_RING_STATE_SIZE;
156 
157 	return size + LRC_PPHWSP_SIZE;
158 }
159 
/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	/* A zero byte terminates the encoded table */
	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* MI_LOAD_REGISTER_IMM header: [5:0] count, [6] POSTED flag */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/* Accumulate 7 bits per byte while the high bit is set */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* Write the register address; the value dword is left as-is */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}
235 
/* gen12 non-render (xCS) engine context-image offsets */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

/* DG2 non-render (xCS) engine context-image offsets */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

/* gen12 render (RCS) engine context-image offsets */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

/* XeHP render (RCS) engine context-image offsets */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/* DG2 render (RCS) engine context-image offsets */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/* Meteorlake render (RCS) engine context-image offsets */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};
524 
/*
 * Leading portion of the context image shared by all Xe2+ engine classes;
 * bracketed values are the dword index within the regs array.
 */
#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */
555 
/* Xe2+ render (RCS) engine context-image offsets */
static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	/*
	 * NOTE(review): indices below corrected to match the convention used
	 * elsewhere (index where the element begins): 0x39 + 2 = 0x3b, then
	 * 0x3b + 6 NOPs = 0x41 for the LRI header, register pair at 0x42.
	 */
	NOP(6),                 /* [0x3b] */
	LRI(1, 0),              /* [0x41] */
	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */

	0
};
570 
/* Xe2+ blitter (BCS) engine context-image offsets */
static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	0
};

/* Xe2+ video/compute/other (xCS) engine context-image offsets */
static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};
587 
/*
 * Xe2+ indirect ring state page layout.
 *
 * NOTE(review): the bracketed indices after [0x18] were corrected to follow
 * the element-start convention used by the other tables (each REG16 entry
 * occupies two dwords, so 0x18, 0x1a, ..., 0x22, with the trailing NOP
 * starting at 0x24).
 */
static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),                 /* [0x00] */
	LRI(5, POSTED),         /* [0x01] */
	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
	REG(0x038),             /* [0x06] RING_BUFFER_START */
	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),                 /* [0x0c] */
	LRI(9, POSTED),         /* [0x11] */
	REG(0x168),             /* [0x12] BB_ADDR_UDW */
	REG(0x140),             /* [0x14] BB_ADDR */
	REG(0x110),             /* [0x16] BB_STATE */
	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */

	NOP(12),                /* [0x24] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP
618 
619 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
620 {
621 	if (class == XE_ENGINE_CLASS_RENDER) {
622 		if (GRAPHICS_VER(xe) >= 20)
623 			return xe2_rcs_offsets;
624 		else if (GRAPHICS_VERx100(xe) >= 1270)
625 			return mtl_rcs_offsets;
626 		else if (GRAPHICS_VERx100(xe) >= 1255)
627 			return dg2_rcs_offsets;
628 		else if (GRAPHICS_VERx100(xe) >= 1250)
629 			return xehp_rcs_offsets;
630 		else
631 			return gen12_rcs_offsets;
632 	} else if (class == XE_ENGINE_CLASS_COPY) {
633 		if (GRAPHICS_VER(xe) >= 20)
634 			return xe2_bcs_offsets;
635 		else
636 			return gen12_xcs_offsets;
637 	} else {
638 		if (GRAPHICS_VER(xe) >= 20)
639 			return xe2_xcs_offsets;
640 		else if (GRAPHICS_VERx100(xe) >= 1255)
641 			return dg2_xcs_offsets;
642 		else
643 			return gen12_xcs_offsets;
644 	}
645 }
646 
647 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
648 {
649 	regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
650 							    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
651 
652 	if (xe_gt_has_indirect_ring_state(hwe->gt))
653 		regs[CTX_CONTEXT_CONTROL] |=
654 			REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
655 }
656 
/*
 * Program the context image so interrupt masking/reporting goes through the
 * memory-based interrupt (memirq) pages; no-op if the device does not use
 * memirq.
 */
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	/* MI_LOAD_REGISTER_MEM: load RING_IMR from the memirq enable page */
	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	/* MSI-X platforms additionally program the CS interrupt vector reg */
	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}
684 
685 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
686 {
687 	struct xe_device *xe = gt_to_xe(hwe->gt);
688 
689 	if (GRAPHICS_VERx100(xe) >= 1250)
690 		return 0x70;
691 	else
692 		return 0x60;
693 }
694 
695 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
696 {
697 	int x;
698 
699 	x = lrc_ring_mi_mode(hwe);
700 	regs[x + 1] &= ~STOP_RING;
701 	regs[x + 1] |= STOP_RING << 16;
702 }
703 
/* True when this LRC was allocated with an indirect ring state page. */
static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

/* The ring is always placed at the very start of lrc->bo. */
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

/* Byte offset of the PPHWSP within lrc->bo: immediately after the ring. */
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

/* Driver-defined slots inside the PPHWSP (byte offsets from PPHWSP start) */
#define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

/* Byte offsets within the separate seqno BO (lrc->seqno_bo) */
#define LRC_SEQNO_OFFSET 0
#define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)

/* Byte offset of the context register image: right after the 4K PPHWSP. */
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

/**
 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
 * @xe: the &xe_device struct instance
 *
 * Returns: Size of the LRC registers area for current platform
 */
size_t xe_lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

/* Bytes to skip (PPHWSP + register image) to reach the rest of the context. */
size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}
753 
/* Seqno lives at the start of the seqno BO. */
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_SEQNO_OFFSET;
}

/* Start-seqno follows the seqno within the seqno BO. */
static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_START_SEQNO_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

/* Engine ID scratch slot in the driver-defined portion of PPHWSP. */
static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

/* CTX_TIMESTAMP value dword within the context register image. */
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

/* CTX_TIMESTAMP upper-dword slot within the context register image. */
static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}

/*
 * Indirect ring state page, computed back from the end of the BO: the WA BB
 * is last, preceded by the optional indirect context page (see the layout
 * diagram at the top of this file).
 */
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
		     LRC_INDIRECT_RING_STATE_SIZE;

	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
		offset -= LRC_INDIRECT_CTX_BO_SIZE;

	return offset;
}

/* Indirect context page sits just before the WA BB at the end of the BO. */
static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}

/* Per-context workaround batch buffer occupies the tail of the BO. */
static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}
811 
/*
 * For each LRC element, generate a pair of helpers on top of the
 * __xe_lrc_<elem>_offset() functions above:
 *   __xe_lrc_<elem>_map()       - iosys_map into @bo_expr at the element
 *   __xe_lrc_<elem>_ggtt_addr() - GGTT address of the element
 */
#define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
	struct iosys_map map = bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
\
	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
} \

/* Note: seqno and start_seqno live in the separate seqno BO */
DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)

#undef DECL_MAP_ADDR_HELPERS
842 
/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;
	u32 ldw, udw = 0;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	ldw = xe_map_read32(xe, &map);

	/*
	 * NOTE(review): the low/high dwords are read with two separate 32-bit
	 * reads, so the combined value can tear if the timestamp is updated
	 * in between — confirm callers tolerate this.
	 */
	if (xe->info.has_64bit_timestamp) {
		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
		udw = xe_map_read32(xe, &map);
	}

	return (u64)udw << 32 | ldw;
}

/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}
913 
/* GGTT address of the LRC; this points at the PPHWSP, not the ring. */
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}
918 
919 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
920 {
921 	if (!xe_lrc_has_indirect_ring_state(lrc))
922 		return 0;
923 
924 	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
925 }
926 
/* Read dword @reg_nr from the indirect ring state page. */
static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

/* Write @val to dword @reg_nr of the indirect ring state page. */
static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

/* Read dword @reg_nr from the LRC's context register image. */
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

/* Write @val to dword @reg_nr of the LRC's context register image. */
void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}
967 
/*
 * Build a zeroed CPU-side template of the default LRC image for @hwe:
 * register offsets, context control, memirq setup and ring-stop reset.
 * Returns a kzalloc'd buffer of xe_gt_lrc_size() bytes (caller frees),
 * or NULL on allocation failure.
 */
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	/* set_offsets() must run first: the later calls patch slots it laid out */
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		/* Indirect ring state occupies the last page of the image */
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}
992 
/**
 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
 * of given engine.
 * @hwe: the &xe_hw_engine struct instance
 */
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	u32 *regs;

	/* Nothing to patch if no default LRC was captured for this class */
	if (!gt->default_lrc[hwe->class])
		return;

	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
	set_memory_based_intr(regs, hwe);
}
1009 
1010 /**
1011  * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
1012  * for given LRC.
1013  * @lrc: the &xe_lrc struct instance
1014  * @hwe: the &xe_hw_engine struct instance
1015  * @regs: scratch buffer to be used as temporary storage
1016  */
1017 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
1018 					    u32 *regs)
1019 {
1020 	struct xe_gt *gt = hwe->gt;
1021 	struct iosys_map map;
1022 	size_t regs_len;
1023 
1024 	if (!xe_device_uses_memirq(gt_to_xe(gt)))
1025 		return;
1026 
1027 	map = __xe_lrc_regs_map(lrc);
1028 	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
1029 	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
1030 	set_memory_based_intr(regs, hwe);
1031 	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
1032 }
1033 
/* Program the PPGTT PDP0 descriptor of @vm into the context image. */
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

/* Tear down the fence context and release the LRC's pinned BOs. */
static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
}
1048 
/*
 * wa_bb_setup_utilization() - Write commands to wa bb to assist
 * in calculating active context run ticks.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
 * the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHSWP.
 */
#define CONTEXT_ACTIVE 1ULL
static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
				    struct xe_hw_engine *hwe,
				    u32 *batch,
				    size_t max_len)
{
	u32 *cmd = batch;

	/* VFs do not emit this WA BB - return 0 dwords written */
	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
		return 0;

	/* Worst case emits 12 dwords (including the optional UDW store) */
	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	/* Store ENGINE_ID MMIO into the PPHWSP engine-id slot (see (2) above) */
	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	/* Mark the context as active by writing CONTEXT_ACTIVE to CTX_TIMESTAMP */
	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	/* Number of dwords written */
	return cmd - batch;
}
1108 
/*
 * WA 16010904313: reload RING_CTX_TIMESTAMP from the value saved in the LRC.
 * Emits two MI_LOAD_REGISTER_MEM commands flagged MI_LRM_ASYNC followed by a
 * final one without it, all loading from the same GGTT location.
 *
 * Returns the number of dwords written, 0 when the WA does not apply to this
 * GT/engine class, or -ENOSPC when @batch cannot hold the 12 dwords.
 */
static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
				  u32 *batch, size_t max_len)
{
	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 16010904313) ||
	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
	      hwe->class == XE_ENGINE_CLASS_COPY ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	/* Final load is synchronous (no MI_LRM_ASYNC) */
	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	return cmd - batch;
}
1145 
1146 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
1147 						  struct xe_hw_engine *hwe,
1148 						  u32 *batch, size_t max_len)
1149 {
1150 	struct xe_device *xe = gt_to_xe(lrc->gt);
1151 	const u32 *user_batch;
1152 	u32 *cmd = batch;
1153 	u32 count;
1154 
1155 	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
1156 						    hwe->class, &user_batch);
1157 	if (!count)
1158 		return 0;
1159 
1160 	if (count > max_len)
1161 		return -ENOSPC;
1162 
1163 	/*
1164 	 * This should be used only for tests and validation. Taint the kernel
1165 	 * as anything could be submitted directly in context switches
1166 	 */
1167 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1168 
1169 	memcpy(cmd, user_batch, count * sizeof(u32));
1170 	cmd += count;
1171 
1172 	return cmd - batch;
1173 }
1174 
1175 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
1176 						 struct xe_hw_engine *hwe,
1177 						 u32 *batch, size_t max_len)
1178 {
1179 	struct xe_device *xe = gt_to_xe(lrc->gt);
1180 	const u32 *user_batch;
1181 	u32 *cmd = batch;
1182 	u32 count;
1183 
1184 	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
1185 						   hwe->class, &user_batch);
1186 	if (!count)
1187 		return 0;
1188 
1189 	if (count > max_len)
1190 		return -ENOSPC;
1191 
1192 	/*
1193 	 * This should be used only for tests and validation. Taint the kernel
1194 	 * as anything could be submitted directly in context switches
1195 	 */
1196 	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
1197 
1198 	memcpy(cmd, user_batch, count * sizeof(u32));
1199 	cmd += count;
1200 
1201 	return cmd - batch;
1202 }
1203 
1204 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
1205 					       struct xe_hw_engine *hwe,
1206 					       u32 *batch, size_t max_len)
1207 {
1208 	u32 *cmd = batch;
1209 
1210 	if (!XE_GT_WA(lrc->gt, 18022495364) ||
1211 	    hwe->class != XE_ENGINE_CLASS_RENDER)
1212 		return 0;
1213 
1214 	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
1215 		return -ENOSPC;
1216 
1217 	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
1218 	*cmd++ = CS_DEBUG_MODE2(0).addr;
1219 	*cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1220 
1221 	return cmd - batch;
1222 }
1223 
1224 static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc,
1225 					  struct xe_hw_engine *hwe,
1226 					  u32 *batch, size_t max_len)
1227 {
1228 	struct xe_gt *gt = lrc->gt;
1229 	u32 *(*emit)(struct xe_gt *gt, u32 *cmd) =
1230 		gt->ring_ops[hwe->class]->emit_aux_table_inv;
1231 
1232 	if (!emit)
1233 		return 0;
1234 
1235 	if (xe_gt_WARN_ON(gt, max_len < 8))
1236 		return -ENOSPC;
1237 
1238 	return emit(gt, batch) - batch;
1239 }
1240 
/*
 * One population step for a BO-backed batch buffer.
 *
 * @setup writes up to @max_size dwords for @hwe into @batch and returns the
 * number of dwords written, or a negative error code.
 */
struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};
1245 
/* Aggregate state for emitting a batch buffer into a region of lrc->bo. */
struct bo_setup_state {
	/* Input: */
	struct xe_lrc		*lrc;
	struct xe_hw_engine	*hwe;
	size_t			max_size;	/* region size in bytes */
	size_t                  reserve_dw;	/* dwords to leave unused for the caller */
	unsigned int		offset;		/* byte offset of the region within lrc->bo */
	const struct bo_setup	*funcs;		/* setup callbacks, run in order */
	unsigned int		num_funcs;

	/* State: */
	u32			*buffer;	/* scratch staging buffer (iomem-backed BOs) */
	u32			*ptr;		/* next dword to be written */
	unsigned int		written;	/* dwords emitted so far */
};
1261 
1262 static int setup_bo(struct bo_setup_state *state)
1263 {
1264 	ssize_t remain;
1265 
1266 	if (state->lrc->bo->vmap.is_iomem) {
1267 		xe_gt_assert(state->hwe->gt, state->buffer);
1268 		state->ptr = state->buffer;
1269 	} else {
1270 		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
1271 	}
1272 
1273 	remain = state->max_size / sizeof(u32);
1274 
1275 	for (size_t i = 0; i < state->num_funcs; i++) {
1276 		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
1277 						    state->ptr, remain);
1278 
1279 		remain -= len;
1280 
1281 		/*
1282 		 * Caller has asked for at least reserve_dw to remain unused.
1283 		 */
1284 		if (len < 0 ||
1285 		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
1286 			goto fail;
1287 
1288 		state->ptr += len;
1289 		state->written += len;
1290 	}
1291 
1292 	return 0;
1293 
1294 fail:
1295 	return -ENOSPC;
1296 }
1297 
1298 static void finish_bo(struct bo_setup_state *state)
1299 {
1300 	if (!state->lrc->bo->vmap.is_iomem)
1301 		return;
1302 
1303 	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
1304 			 state->offset, state->buffer,
1305 			 state->written * sizeof(u32));
1306 }
1307 
1308 /**
1309  * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
1310  * @lrc: the &xe_lrc struct instance
1311  * @hwe: the &xe_hw_engine struct instance
1312  * @scratch: preallocated scratch buffer for temporary storage
1313  * Return: 0 on success, negative error code on failure
1314  */
1315 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
1316 {
1317 	static const struct bo_setup funcs[] = {
1318 		{ .setup = setup_timestamp_wa },
1319 		{ .setup = setup_invalidate_state_cache_wa },
1320 		{ .setup = setup_utilization_wa },
1321 		{ .setup = setup_configfs_post_ctx_restore_bb },
1322 	};
1323 	struct bo_setup_state state = {
1324 		.lrc = lrc,
1325 		.hwe = hwe,
1326 		.max_size = LRC_WA_BB_SIZE,
1327 		.buffer = scratch,
1328 		.reserve_dw = 1,
1329 		.offset = __xe_lrc_wa_bb_offset(lrc),
1330 		.funcs = funcs,
1331 		.num_funcs = ARRAY_SIZE(funcs),
1332 	};
1333 	int ret;
1334 
1335 	ret = setup_bo(&state);
1336 	if (ret)
1337 		return ret;
1338 
1339 	*state.ptr++ = MI_BATCH_BUFFER_END;
1340 	state.written++;
1341 
1342 	finish_bo(&state);
1343 
1344 	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
1345 			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
1346 
1347 	return 0;
1348 }
1349 
1350 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
1351 {
1352 	u32 *buf = NULL;
1353 	int ret;
1354 
1355 	if (lrc->bo->vmap.is_iomem) {
1356 		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
1357 		if (!buf)
1358 			return -ENOMEM;
1359 	}
1360 
1361 	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
1362 
1363 	kfree(buf);
1364 
1365 	return ret;
1366 }
1367 
/*
 * Populate the INDIRECT_CTX batch buffer and enable it in the context image.
 *
 * Render/compute engines additionally run the timestamp WA; all classes get
 * the auxccs invalidation and the optional configfs mid-restore batch.
 *
 * Returns 0 on success (including when no indirect ctx is needed), negative
 * error code on failure.
 */
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	/* iomem-backed BOs are staged through a temporary buffer. */
	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 * (0xf masks to 16 dwords == 64 bytes.)
	 */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}
1440 
/*
 * Convert a multi-queue priority to the LRC descriptor priority encoding.
 * The enum values map 1:1 onto the LRC_PRIORITY field values, so this is
 * just a range-checked pass-through.
 */
static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);

	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));

	/* xe_multi_queue_priority is directly mapped to LRC priority values */
	return priority;
}
1451 
1452 /**
1453  * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
1454  * @lrc: Logical Ring Context
1455  * @priority: Multi queue priority of the exec queue
1456  *
1457  * Convert @priority to LRC multi queue priority and update the @lrc descriptor
1458  */
1459 void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
1460 {
1461 	lrc->desc &= ~LRC_PRIORITY;
1462 	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
1463 }
1464 
/*
 * Populate the LRC state: PPHWSP, context registers, ring registers,
 * descriptor flags and the WA / indirect-context batch buffers.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
			   void *replay_state, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 state_cache_perf_fix[3];
	int err;

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values. If there's already a primed default_lrc, just copy it, otherwise
	 * it's the early submission to record the lrc: build a new empty one from
	 * scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (gt->default_lrc[hwe->class] || replay_state) {
		/*
		 * NOTE(review): when replay_state is set, this branch also
		 * reads gt->default_lrc[class] — presumably a default LRC
		 * always exists by the time a replay happens; confirm.
		 */
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc->size - LRC_PPHWSP_SIZE);
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data) {
			return -ENOMEM;
		}

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
		kfree(init_data);
	}

	if (vm)
		xe_lrc_set_ppgtt(lrc, vm);

	/* Route context interrupts through memory-based IRQs (MSI-X). */
	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	/* Program the ring registers, either via indirect ring state or inline. */
	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);

		/* Match head and tail pointers */
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));

		/* Match head and tail pointers */
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));

	/* Start utilization tracking from a zeroed timestamp. */
	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	/* Build the context descriptor flags. */
	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	/* Pre-Xe_HP descriptors also carry the engine class/instance. */
	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
	}

	/* Seed seqno and start-seqno to one before the first fence value. */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		return err;

	err = setup_indirect_ctx(lrc, hwe);

	return err;
}
1595 
/*
 * Allocate the LRC/ring/WA-BB backing BO plus the separate seqno BO, then
 * initialize the context image via xe_lrc_ctx_init().
 *
 * On any failure after lrc->bo is set, xe_lrc_finish() is used to release
 * whatever was already allocated.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_bo *bo;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	/* Reserve extra space for the indirect context BB when needed. */
	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	/* Main BO: ring + context state + WA BB (+ optional indirect ctx). */
	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
				       ttm_bo_type_kernel,
				       bo_flags, false);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	lrc->bo = bo;

	/* Separate system-memory page for seqno storage. */
	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE |
				       XE_BO_FLAG_SYSTEM, false);
	if (IS_ERR(bo)) {
		err = PTR_ERR(bo);
		goto err_lrc_finish;
	}
	lrc->seqno_bo = bo;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
	if (err)
		goto err_lrc_finish;

	/* Account the BO against the owning client, if any. */
	if (vm && vm->xef)
		xe_drm_client_add_bo(vm->xef->client, lrc->bo);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}
1665 
1666 /**
1667  * xe_lrc_create - Create a LRC
1668  * @hwe: Hardware Engine
1669  * @vm: The VM (address space)
1670  * @replay_state: GPU hang replay state
1671  * @ring_size: LRC ring size
1672  * @msix_vec: MSI-X interrupt vector (for platforms that support it)
1673  * @flags: LRC initialization flags
1674  *
1675  * Allocate and initialize the Logical Ring Context (LRC).
1676  *
1677  * Return pointer to created LRC upon success and an error pointer
1678  * upon failure.
1679  */
1680 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
1681 			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
1682 {
1683 	struct xe_lrc *lrc;
1684 	int err;
1685 
1686 	lrc = kzalloc_obj(*lrc);
1687 	if (!lrc)
1688 		return ERR_PTR(-ENOMEM);
1689 
1690 	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
1691 	if (err) {
1692 		kfree(lrc);
1693 		return ERR_PTR(err);
1694 	}
1695 
1696 	return lrc;
1697 }
1698 
1699 /**
1700  * xe_lrc_destroy - Destroy the LRC
1701  * @ref: reference to LRC
1702  *
1703  * Called when ref == 0, release resources held by the Logical Ring Context
1704  * (LRC) and free the LRC memory.
1705  */
1706 void xe_lrc_destroy(struct kref *ref)
1707 {
1708 	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
1709 
1710 	xe_lrc_finish(lrc);
1711 	kfree(lrc);
1712 }
1713 
1714 /**
1715  * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
1716  * @lrc: the &xe_lrc struct instance
1717  */
1718 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
1719 {
1720 	if (xe_lrc_has_indirect_ring_state(lrc)) {
1721 		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
1722 				     __xe_lrc_indirect_ring_ggtt_addr(lrc));
1723 
1724 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
1725 					      __xe_lrc_ring_ggtt_addr(lrc));
1726 	} else {
1727 		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
1728 	}
1729 }
1730 
1731 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
1732 {
1733 	if (xe_lrc_has_indirect_ring_state(lrc))
1734 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
1735 	else
1736 		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
1737 }
1738 
1739 u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
1740 {
1741 	if (xe_lrc_has_indirect_ring_state(lrc))
1742 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
1743 	else
1744 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
1745 }
1746 
1747 static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
1748 {
1749 	if (xe_lrc_has_indirect_ring_state(lrc))
1750 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
1751 	else
1752 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
1753 }
1754 
1755 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
1756 {
1757 	if (xe_lrc_has_indirect_ring_state(lrc))
1758 		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
1759 	else
1760 		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
1761 }
1762 
1763 u32 xe_lrc_ring_head(struct xe_lrc *lrc)
1764 {
1765 	if (xe_lrc_has_indirect_ring_state(lrc))
1766 		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
1767 	else
1768 		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
1769 }
1770 
1771 u32 xe_lrc_ring_space(struct xe_lrc *lrc)
1772 {
1773 	const u32 head = xe_lrc_ring_head(lrc);
1774 	const u32 tail = lrc->ring.tail;
1775 	const u32 size = lrc->ring.size;
1776 
1777 	return ((head - tail - 1) & (size - 1)) + 1;
1778 }
1779 
/*
 * Copy @size bytes at the current ring tail and advance the tail, wrapping
 * with a power-of-two mask. Caller guarantees the copy does not cross the
 * ring end (see xe_lrc_write_ring()).
 */
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}
1789 
/*
 * Write @size bytes of commands into the ring, splitting the copy at the
 * ring end when it would wrap, and padding odd-dword writes with an MI_NOOP
 * to keep the tail qword-aligned.
 */
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	/* Commands are dword-granular; pad to a qword below if needed. */
	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		/* Would cross the ring end: copy in two pieces. */
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}
1817 
1818 u64 xe_lrc_descriptor(struct xe_lrc *lrc)
1819 {
1820 	return lrc->desc | xe_lrc_ggtt_addr(lrc);
1821 }
1822 
/**
 * xe_lrc_seqno_ggtt_addr() - GGTT address of the LRC's seqno location
 * @lrc: Pointer to the lrc.
 *
 * Return: GGTT address of this LRC's seqno.
 */
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}
1827 
1828 /**
1829  * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
1830  *
1831  * Allocate but don't initialize an lrc seqno fence.
1832  *
1833  * Return: Pointer to the allocated fence or
1834  * negative error pointer on error.
1835  */
1836 struct dma_fence *xe_lrc_alloc_seqno_fence(void)
1837 {
1838 	return xe_hw_fence_alloc();
1839 }
1840 
1841 /**
1842  * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
1843  * @fence: Pointer to the fence to free.
1844  *
1845  * Frees an lrc seqno fence that hasn't yet been
1846  * initialized.
1847  */
1848 void xe_lrc_free_seqno_fence(struct dma_fence *fence)
1849 {
1850 	xe_hw_fence_free(fence);
1851 }
1852 
1853 /**
1854  * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
1855  * @lrc: Pointer to the lrc.
1856  * @fence: Pointer to the fence to initialize.
1857  *
1858  * Initializes a pre-allocated lrc seqno fence.
1859  * After initialization, the fence is subject to normal
1860  * dma-fence refcounting.
1861  */
1862 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
1863 {
1864 	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
1865 }
1866 
1867 s32 xe_lrc_seqno(struct xe_lrc *lrc)
1868 {
1869 	struct iosys_map map = __xe_lrc_seqno_map(lrc);
1870 
1871 	return xe_map_read32(lrc_to_xe(lrc), &map);
1872 }
1873 
1874 s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
1875 {
1876 	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
1877 
1878 	return xe_map_read32(lrc_to_xe(lrc), &map);
1879 }
1880 
/**
 * xe_lrc_start_seqno_ggtt_addr() - GGTT address of the LRC's start-seqno slot
 * @lrc: Pointer to the lrc.
 *
 * Return: GGTT address of this LRC's start seqno.
 */
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}
1885 
/**
 * xe_lrc_parallel_ggtt_addr() - GGTT address of the LRC's parallel area
 * @lrc: Pointer to the lrc.
 *
 * Return: GGTT address of this LRC's parallel-submission area.
 */
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}
1890 
/**
 * xe_lrc_parallel_map() - CPU mapping of the LRC's parallel area
 * @lrc: Pointer to the lrc.
 *
 * Return: &iosys_map covering this LRC's parallel-submission area.
 */
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}
1895 
1896 /**
1897  * xe_lrc_engine_id() - Read engine id value
1898  * @lrc: Pointer to the lrc.
1899  *
1900  * Returns: context id value
1901  */
1902 static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1903 {
1904 	struct xe_device *xe = lrc_to_xe(lrc);
1905 	struct iosys_map map;
1906 
1907 	map = __xe_lrc_engine_id_map(lrc);
1908 	return xe_map_read32(xe, &map);
1909 }
1910 
/*
 * Decode the length (in dwords, including the header) of a command from its
 * header dword. Handles the GFXPIPE single-dword and SO_DECL_LIST special
 * cases before falling back to the common 8-bit length field.
 */
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
1925 
/*
 * Pretty-print a single MI_* command at @dw (offsets reported relative to
 * @start) and return the number of dwords consumed, never exceeding
 * @remaining_dw.
 */
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *start,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		/*
		 * Collapse runs of NOOPs into a single line. NOTE(review):
		 * the offset is reconstructed as dw - num_noop - start;
		 * verify the off-by-one against a known dump.
		 */
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_NOOP (%d dwords)\n",
			   dw - num_noop - start, inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_TOPOLOGY_FILTER\n",
			   dw - start, inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_BATCH_BUFFER_END\n",
			   dw - start, inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords.  We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		/* Payload is (reg, value) pairs following the header. */
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   dw - start, inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010x\n",
				   &dw[i] - start, dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   dw - start, inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		/* Well-formed LRM is 4 dwords: header, reg, addr lo, addr hi. */
		if (numdw == 4)
			drm_printf(p, "LRC[%#5tx]  =  - %#6x = %#010llx\n",
				   dw - start,
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, "LRC[%#5tx]  =  - %*ph (%s)\n",
				   dw - start, (int)sizeof(u32) * (numdw - 1),
				   dw + 1, numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] MI_FORCE_WAKEUP\n",
			   dw - start, inst_header);
		return numdw;

	default:
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   dw - start, inst_header, opcode, numdw);
		return numdw;
	}
}
2002 
2003 static int dump_gfxpipe_command(struct drm_printer *p,
2004 				struct xe_gt *gt,
2005 				u32 *start,
2006 				u32 *dw,
2007 				int remaining_dw)
2008 {
2009 	u32 numdw = instr_dw(*dw);
2010 	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
2011 	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
2012 	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
2013 
2014 	/*
2015 	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
2016 	 * remaining size of the LRC.
2017 	 */
2018 	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
2019 		numdw = remaining_dw;
2020 
2021 	switch (*dw & GFXPIPE_MATCH_MASK) {
2022 #define MATCH(cmd) \
2023 	case cmd: \
2024 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2025 			   dw - start, *dw, numdw); \
2026 		return numdw
2027 #define MATCH3D(cmd) \
2028 	case CMD_##cmd: \
2029 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] " #cmd " (%d dwords)\n", \
2030 			   dw - start, *dw, numdw); \
2031 		return numdw
2032 
2033 	MATCH(STATE_BASE_ADDRESS);
2034 	MATCH(STATE_SIP);
2035 	MATCH(GPGPU_CSR_BASE_ADDRESS);
2036 	MATCH(STATE_COMPUTE_MODE);
2037 	MATCH3D(3DSTATE_BTD);
2038 	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
2039 	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);
2040 
2041 	MATCH3D(3DSTATE_VF_STATISTICS);
2042 
2043 	MATCH(PIPELINE_SELECT);
2044 
2045 	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
2046 	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
2047 	MATCH3D(3DSTATE_CLEAR_PARAMS);
2048 	MATCH3D(3DSTATE_DEPTH_BUFFER);
2049 	MATCH3D(3DSTATE_STENCIL_BUFFER);
2050 	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
2051 	MATCH3D(3DSTATE_VERTEX_BUFFERS);
2052 	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
2053 	MATCH3D(3DSTATE_INDEX_BUFFER);
2054 	MATCH3D(3DSTATE_VF);
2055 	MATCH3D(3DSTATE_MULTISAMPLE);
2056 	MATCH3D(3DSTATE_CC_STATE_POINTERS);
2057 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
2058 	MATCH3D(3DSTATE_VS);
2059 	MATCH3D(3DSTATE_GS);
2060 	MATCH3D(3DSTATE_CLIP);
2061 	MATCH3D(3DSTATE_SF);
2062 	MATCH3D(3DSTATE_WM);
2063 	MATCH3D(3DSTATE_CONSTANT_VS);
2064 	MATCH3D(3DSTATE_CONSTANT_GS);
2065 	MATCH3D(3DSTATE_CONSTANT_PS);
2066 	MATCH3D(3DSTATE_SAMPLE_MASK);
2067 	MATCH3D(3DSTATE_CONSTANT_HS);
2068 	MATCH3D(3DSTATE_CONSTANT_DS);
2069 	MATCH3D(3DSTATE_HS);
2070 	MATCH3D(3DSTATE_TE);
2071 	MATCH3D(3DSTATE_DS);
2072 	MATCH3D(3DSTATE_STREAMOUT);
2073 	MATCH3D(3DSTATE_SBE);
2074 	MATCH3D(3DSTATE_PS);
2075 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
2076 	MATCH3D(3DSTATE_CPS_POINTERS);
2077 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
2078 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
2079 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
2080 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
2081 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
2082 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
2083 	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
2084 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
2085 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
2086 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
2087 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
2088 	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
2089 	MATCH3D(3DSTATE_VF_INSTANCING);
2090 	MATCH3D(3DSTATE_VF_SGVS);
2091 	MATCH3D(3DSTATE_VF_TOPOLOGY);
2092 	MATCH3D(3DSTATE_WM_CHROMAKEY);
2093 	MATCH3D(3DSTATE_PS_BLEND);
2094 	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
2095 	MATCH3D(3DSTATE_PS_EXTRA);
2096 	MATCH3D(3DSTATE_RASTER);
2097 	MATCH3D(3DSTATE_SBE_SWIZ);
2098 	MATCH3D(3DSTATE_WM_HZ_OP);
2099 	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
2100 	MATCH3D(3DSTATE_VF_SGVS_2);
2101 	MATCH3D(3DSTATE_VFG);
2102 	MATCH3D(3DSTATE_URB_ALLOC_VS);
2103 	MATCH3D(3DSTATE_URB_ALLOC_HS);
2104 	MATCH3D(3DSTATE_URB_ALLOC_DS);
2105 	MATCH3D(3DSTATE_URB_ALLOC_GS);
2106 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
2107 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
2108 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
2109 	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
2110 	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
2111 	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
2112 	MATCH3D(3DSTATE_AMFS);
2113 	MATCH3D(3DSTATE_DEPTH_BOUNDS);
2114 	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
2115 	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
2116 	MATCH3D(3DSTATE_MESH_CONTROL);
2117 	MATCH3D(3DSTATE_MESH_DISTRIB);
2118 	MATCH3D(3DSTATE_TASK_REDISTRIB);
2119 	MATCH3D(3DSTATE_MESH_SHADER);
2120 	MATCH3D(3DSTATE_MESH_SHADER_DATA);
2121 	MATCH3D(3DSTATE_TASK_CONTROL);
2122 	MATCH3D(3DSTATE_TASK_SHADER);
2123 	MATCH3D(3DSTATE_TASK_SHADER_DATA);
2124 	MATCH3D(3DSTATE_URB_ALLOC_MESH);
2125 	MATCH3D(3DSTATE_URB_ALLOC_TASK);
2126 	MATCH3D(3DSTATE_CLIP_MESH);
2127 	MATCH3D(3DSTATE_SBE_MESH);
2128 	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
2129 	MATCH3D(3DSTATE_COARSE_PIXEL);
2130 	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
2131 	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
2132 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
2133 	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
2134 	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
2135 	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
2136 	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);
2137 
2138 	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
2139 	MATCH3D(3DSTATE_URB_MEMORY);
2140 	MATCH3D(3DSTATE_CHROMA_KEY);
2141 	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
2142 	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
2143 	MATCH3D(3DSTATE_LINE_STIPPLE);
2144 	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
2145 	MATCH3D(3DSTATE_MONOFILTER_SIZE);
2146 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
2147 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
2148 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
2149 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
2150 	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
2151 	MATCH3D(3DSTATE_SO_DECL_LIST);
2152 	MATCH3D(3DSTATE_SO_BUFFER);
2153 	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
2154 	MATCH3D(3DSTATE_SAMPLE_PATTERN);
2155 	MATCH3D(3DSTATE_3D_MODE);
2156 	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
2157 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
2158 	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
2159 	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);
2160 
2161 	default:
2162 		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
2163 			   dw - start, *dw, pipeline, opcode, subopcode, numdw);
2164 		return numdw;
2165 	}
2166 }
2167 
/*
 * Decode and print a single GFX_STATE instruction from an LRC dump.
 *
 * @p: printer used for the decoded output
 * @gt: GT the LRC belongs to (only used for warnings)
 * @start: first dword of the dumped region, used to print the offset
 * @dw: first dword of the instruction to decode
 * @remaining_dw: number of dwords left in the LRC starting at @dw
 *
 * Return: number of dwords consumed by this instruction, so the caller can
 * advance to the next one.
 */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *start,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	/*
	 * MATCH() expands to a case label that prints the instruction and
	 * returns its length (macro defined earlier in this file).
	 */
	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "LRC[%#5tx]  =  [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   dw - start, *dw, opcode, numdw);
		return numdw;
	}
}
2193 
2194 void xe_lrc_dump_default(struct drm_printer *p,
2195 			 struct xe_gt *gt,
2196 			 enum xe_engine_class hwe_class)
2197 {
2198 	u32 *dw, *start;
2199 	int remaining_dw, num_dw;
2200 
2201 	if (!gt->default_lrc[hwe_class]) {
2202 		drm_printf(p, "No default LRC for class %d\n", hwe_class);
2203 		return;
2204 	}
2205 
2206 	/*
2207 	 * Skip the beginning of the LRC since it contains the per-process
2208 	 * hardware status page.
2209 	 */
2210 	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
2211 	start = dw;
2212 	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
2213 
2214 	while (remaining_dw > 0) {
2215 		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
2216 			num_dw = dump_mi_command(p, gt, start, dw, remaining_dw);
2217 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
2218 			num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw);
2219 		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
2220 			num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw);
2221 		} else {
2222 			num_dw = min(instr_dw(*dw), remaining_dw);
2223 			drm_printf(p, "LRC[%#5tx]  =  [%#10x] Unknown instruction of type %#x, likely %d dwords\n",
2224 				   dw - start,
2225 				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
2226 				   num_dw);
2227 		}
2228 
2229 		dw += num_dw;
2230 		remaining_dw -= num_dw;
2231 	}
2232 }
2233 
2234 /*
2235  * Lookup the value of a register within the offset/value pairs of an
2236  * MI_LOAD_REGISTER_IMM instruction.
2237  *
2238  * Return -ENOENT if the register is not present in the MI_LRI instruction.
2239  */
static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
				const u32 *dword_pair, int num_regs)
{
	/* Payload is (offset, value) pairs; step through them two dwords at a time. */
	const u32 *pair = dword_pair;
	const u32 *end = dword_pair + 2 * num_regs;

	for (; pair < end; pair += 2) {
		if (pair[0] == offset) {
			*value = pair[1];
			return 0;
		}
	}

	return -ENOENT;
}
2252 
2253 /*
2254  * Lookup the value of a register in a specific engine type's default LRC.
2255  *
 * Return -EINVAL if the default LRC doesn't exist, or -ENOENT if the register
 * cannot be found in the default LRC.
2258  */
int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
				    enum xe_engine_class hwe_class,
				    u32 offset,
				    u32 *value)
{
	u32 *dw;
	int remaining_dw, ret;

	if (!gt->default_lrc[hwe_class])
		return -EINVAL;

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		u32 num_dw = instr_dw(*dw);

		/* Clamp a mis-parsed length field to the end of the LRC. */
		if (num_dw > remaining_dw)
			num_dw = remaining_dw;

		switch (*dw & XE_INSTR_CMD_TYPE) {
		case XE_INSTR_MI:
			switch (*dw & MI_OPCODE) {
			case MI_BATCH_BUFFER_END:
				/* End of LRC; register not found */
				return -ENOENT;

			case MI_NOOP:
			case MI_TOPOLOGY_FILTER:
				/*
				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
				 * a length field and are always 1-dword
				 * instructions.
				 */
				remaining_dw--;
				dw++;
				break;

			case MI_LOAD_REGISTER_IMM:
				/*
				 * The LRI payload after the header is
				 * (offset, value) dword pairs; search it for
				 * the requested register.
				 */
				ret = lookup_reg_in_mi_lri(offset, value,
							   dw + 1, (num_dw - 1) / 2);
				if (ret == 0)
					return 0;

				/*
				 * Register not in this LRI; skip over it like
				 * any other instruction and keep searching.
				 */
				fallthrough;

			default:
				/*
				 * Jump to next instruction based on length
				 * field.
				 */
				remaining_dw -= num_dw;
				dw += num_dw;
				break;
			}
			break;

		default:
			/* Jump to next instruction based on length field. */
			remaining_dw -= num_dw;
			dw += num_dw;
		}
	}

	return -ENOENT;
}
2329 
/*
 * Descriptor for a state instruction emitted into the LRC by
 * xe_lrc_emit_hwe_state_instructions().
 */
struct instr_state {
	u32 instr;	/* instruction header, without the length field */
	u16 num_dw;	/* total instruction length, in dwords */
};
2334 
/*
 * SVG (3DSTATE_*) instruction headers to emit for Wa_14019789679; see the
 * workaround comment in xe_lrc_emit_hwe_state_instructions().  num_dw is the
 * full instruction length in dwords, including the header.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
2387 
2388 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
2389 {
2390 	struct xe_gt *gt = q->hwe->gt;
2391 	struct xe_device *xe = gt_to_xe(gt);
2392 	const struct instr_state *state_table = NULL;
2393 	int state_table_size = 0;
2394 
2395 	/*
2396 	 * Wa_14019789679
2397 	 *
2398 	 * If the driver doesn't explicitly emit the SVG instructions while
2399 	 * setting up the default LRC, the context switch will write 0's
2400 	 * (noops) into the LRC memory rather than the expected instruction
2401 	 * headers.  Application contexts start out as a copy of the default
2402 	 * LRC, and if they also do not emit specific settings for some SVG
2403 	 * state, then on context restore they'll unintentionally inherit
2404 	 * whatever state setting the previous context had programmed into the
2405 	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
2406 	 * prevent the hardware from resetting that state back to any specific
2407 	 * value).
2408 	 *
2409 	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
2410 	 * since that's a specific state setting that can easily cause GPU
2411 	 * hangs if unintentionally inherited.  However to be safe we'll
2412 	 * continue to emit all of the SVG state since it's best not to leak
2413 	 * any of the state between contexts, even if that leakage is harmless.
2414 	 */
2415 	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
2416 		state_table = xe_hpg_svg_state;
2417 		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
2418 	}
2419 
2420 	if (!state_table) {
2421 		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
2422 			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
2423 		return cs;
2424 	}
2425 
2426 	for (int i = 0; i < state_table_size; i++) {
2427 		u32 instr = state_table[i].instr;
2428 		u16 num_dw = state_table[i].num_dw;
2429 		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
2430 
2431 		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
2432 		xe_gt_assert(gt, num_dw != 0);
2433 		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
2434 
2435 		/*
2436 		 * Xe2's SVG context is the same as the one on DG2 / MTL
2437 		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
2438 		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
2439 		 * Just make the replacement here rather than defining a
2440 		 * whole separate table for the single trivial change.
2441 		 */
2442 		if (GRAPHICS_VER(xe) >= 20 &&
2443 		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
2444 			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
2445 
2446 		*cs = instr;
2447 		if (!is_single_dw)
2448 			*cs |= (num_dw - 2);
2449 
2450 		cs += num_dw;
2451 	}
2452 
2453 	return cs;
2454 }
2455 
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	/*
	 * GFP_NOWAIT: capture must not block — NOTE(review): presumably this
	 * can run from atomic/reclaim-sensitive error-capture paths; confirm
	 * against callers.
	 */
	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	/* Record LRC/ring metadata; the actual LRC contents are copied later
	 * by xe_lrc_snapshot_capture_delayed() (lrc_snapshot stays NULL here).
	 */
	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	/* Hold a bo reference so the LRC memory survives until the delayed copy. */
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}
2482 
2483 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
2484 {
2485 	struct xe_bo *bo;
2486 	struct iosys_map src;
2487 
2488 	if (!snapshot)
2489 		return;
2490 
2491 	bo = snapshot->lrc_bo;
2492 	snapshot->lrc_bo = NULL;
2493 
2494 	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
2495 	if (!snapshot->lrc_snapshot)
2496 		goto put_bo;
2497 
2498 	xe_bo_lock(bo, false);
2499 	if (!ttm_bo_vmap(&bo->ttm, &src)) {
2500 		xe_map_memcpy_from(xe_bo_device(bo),
2501 				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
2502 				   snapshot->lrc_size);
2503 		ttm_bo_vunmap(&bo->ttm, &src);
2504 	} else {
2505 		kvfree(snapshot->lrc_snapshot);
2506 		snapshot->lrc_snapshot = NULL;
2507 	}
2508 	xe_bo_unlock(bo);
2509 put_bo:
2510 	xe_bo_put(bo);
2511 }
2512 
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	/* Data dump is only available once the delayed capture has run. */
	if (!snapshot->lrc_snapshot)
		return;

	/* Dump the PPHWSP region as ascii85-encoded dwords. */
	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);

	/*
	 * 'i' intentionally carries over from the loop above so the HWCTX
	 * dump picks up right where the PPHWSP dump ended.
	 */
	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}
2559 
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	/* kvfree(NULL) is a no-op, so lrc_snapshot needs no check. */
	kvfree(snapshot->lrc_snapshot);
	/* lrc_bo is NULL if xe_lrc_snapshot_capture_delayed() already consumed it. */
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}
2571 
2572 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
2573 {
2574 	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
2575 	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
2576 	struct xe_hw_engine *hwe;
2577 	u64 val;
2578 
2579 	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
2580 	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
2581 			    "Unexpected engine class:instance %d:%d for context utilization\n",
2582 			    class, instance))
2583 		return -1;
2584 
2585 	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
2586 		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
2587 					  RING_CTX_TIMESTAMP(hwe->mmio_base));
2588 	else
2589 		val = xe_mmio_read32(&hwe->gt->mmio,
2590 				     RING_CTX_TIMESTAMP(hwe->mmio_base));
2591 
2592 	*reg_ctx_ts = val;
2593 
2594 	return 0;
2595 }
2596 
2597 /**
2598  * xe_lrc_timestamp() - Current ctx timestamp
2599  * @lrc: Pointer to the lrc.
2600  *
2601  * Return latest ctx timestamp. With support for active contexts, the
2602  * calculation may be slightly racy, so follow a read-again logic to ensure that
2603  * the context is still active before returning the right timestamp.
2604  *
2605  * Returns: New ctx timestamp value
2606  */
u64 xe_lrc_timestamp(struct xe_lrc *lrc)
{
	/* Fall back to the last saved value if nothing newer can be read. */
	u64 lrc_ts, reg_ts, new_ts = lrc->ctx_timestamp;
	u32 engine_id;

	lrc_ts = xe_lrc_ctx_timestamp(lrc);
	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
		new_ts = lrc_ts;
		goto done;
	}

	/*
	 * CONTEXT_ACTIVE in the LRC means the context is running, so read the
	 * live timestamp from the engine register instead.  On failure of
	 * get_ctx_timestamp(), new_ts keeps the previously saved value.
	 */
	if (lrc_ts == CONTEXT_ACTIVE) {
		engine_id = xe_lrc_engine_id(lrc);
		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
			new_ts = reg_ts;

		/* read lrc again to ensure context is still active */
		lrc_ts = xe_lrc_ctx_timestamp(lrc);
	}

	/*
	 * If context switched out, just use the lrc_ts. Note that this needs to
	 * be a separate if condition.
	 */
	if (lrc_ts != CONTEXT_ACTIVE)
		new_ts = lrc_ts;

done:
	return new_ts;
}
2638 
2639 /**
2640  * xe_lrc_update_timestamp() - Update ctx timestamp
2641  * @lrc: Pointer to the lrc.
2642  * @old_ts: Old timestamp value
2643  *
2644  * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
2645  * update saved value.
2646  *
2647  * Returns: New ctx timestamp value
2648  */
2649 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
2650 {
2651 	*old_ts = lrc->ctx_timestamp;
2652 	lrc->ctx_timestamp = xe_lrc_timestamp(lrc);
2653 
2654 	trace_xe_lrc_update_timestamp(lrc, *old_ts);
2655 
2656 	return lrc->ctx_timestamp;
2657 }
2658 
2659 /**
2660  * xe_lrc_ring_is_idle() - LRC is idle
2661  * @lrc: Pointer to the lrc.
2662  *
2663  * Compare LRC ring head and tail to determine if idle.
2664  *
 * Return: True if the ring is idle, False otherwise
2666  */
2667 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
2668 {
2669 	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
2670 }
2671