1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22 
23 /*
24  * The per-platform tables are u8-encoded in @data. Decode @data and set the
25  * addresses' offset and commands in @regs. The following encoding is used
26  * for each byte. There are 2 steps: decoding commands and decoding addresses.
27  *
28  * Commands:
29  * [7]: create NOPs - the number of NOPs is set in the lower bits
30  * [6]: When creating a MI_LOAD_REGISTER_IMM command, allow setting
31  *      MI_LRI_FORCE_POSTED
32  * [5:0]: Number of NOPs or registers to set values to in case of
33  *        MI_LOAD_REGISTER_IMM
34  *
35  * Addresses: after a MI_LOAD_REGISTER_IMM command, "count" register addresses
36  * are decoded. They are set using the REG/REG16 macros: the former
37  * is used for offsets smaller than 0x200 while the latter is for values bigger
38  * than that. Those macros already set all the bits documented below correctly:
39  *
40  * [7]: When a register offset needs more bits than a single byte can hold,
41  *      additional bytes follow, carrying the lower bits
42  * [6:0]: Register offset, without considering the engine base.
43  *
44  * This function only tweaks the commands and register offsets. Values are not
45  * filled out.
46  */
47 static void set_offsets(u32 *regs,
48 			const u8 *data,
49 			const struct intel_engine_cs *engine,
50 			bool close)
51 #define NOP(x) (BIT(7) | (x))
52 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53 #define POSTED BIT(0)
54 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55 #define REG16(x) \
56 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57 	(((x) >> 2) & 0x7f)
58 #define END 0
59 {
60 	const u32 base = engine->mmio_base;
61 
62 	while (*data) {
63 		u8 count, flags;
64 
65 		if (*data & BIT(7)) { /* skip */
66 			count = *data++ & ~BIT(7);
67 			regs += count;
68 			continue;
69 		}
70 
71 		count = *data & 0x3f;
72 		flags = *data >> 6;
73 		data++;
74 
75 		*regs = MI_LOAD_REGISTER_IMM(count);
76 		if (flags & POSTED)
77 			*regs |= MI_LRI_FORCE_POSTED;
78 		if (GRAPHICS_VER(engine->i915) >= 11)
79 			*regs |= MI_LRI_LRM_CS_MMIO;
80 		regs++;
81 
82 		GEM_BUG_ON(!count);
83 		do {
84 			u32 offset = 0;
85 			u8 v;
86 
87 			do {
88 				v = *data++;
89 				offset <<= 7;
90 				offset |= v & ~BIT(7);
91 			} while (v & BIT(7));
92 
93 			regs[0] = base + (offset << 2);
94 			regs += 2;
95 		} while (--count);
96 	}
97 
98 	if (close) {
99 		/* Close the batch; used mainly by live_lrc_layout() */
100 		*regs = MI_BATCH_BUFFER_END;
101 		if (GRAPHICS_VER(engine->i915) >= 11)
102 			*regs |= BIT(0);
103 	}
104 }
105 
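/*
 * Illustrative decode walk-through (an added example, not part of the
 * original driver source): the gen8_xcs table below begins with
 * NOP(1), LRI(11, 0), REG16(0x244), REG(0x034), ...
 * set_offsets() consumes this as follows: NOP(1) skips one dword of @regs;
 * LRI(11, 0) emits MI_LOAD_REGISTER_IMM(11) (no MI_LRI_FORCE_POSTED, and
 * MI_LRI_LRM_CS_MMIO is OR'ed in on gen11+); REG16(0x244) is stored as the
 * two bytes 0x81, 0x11, which reassemble to ((0x01 << 7) | 0x11) << 2 ==
 * 0x244, while REG(0x034) is the single byte 0x0d (0x0d << 2 == 0x34).
 * Each decoded offset is written as engine->mmio_base + offset into regs[0],
 * with regs advancing by two so the following dword is left for the value.
 */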
106 static const u8 gen8_xcs_offsets[] = {
107 	NOP(1),
108 	LRI(11, 0),
109 	REG16(0x244),
110 	REG(0x034),
111 	REG(0x030),
112 	REG(0x038),
113 	REG(0x03c),
114 	REG(0x168),
115 	REG(0x140),
116 	REG(0x110),
117 	REG(0x11c),
118 	REG(0x114),
119 	REG(0x118),
120 
121 	NOP(9),
122 	LRI(9, 0),
123 	REG16(0x3a8),
124 	REG16(0x28c),
125 	REG16(0x288),
126 	REG16(0x284),
127 	REG16(0x280),
128 	REG16(0x27c),
129 	REG16(0x278),
130 	REG16(0x274),
131 	REG16(0x270),
132 
133 	NOP(13),
134 	LRI(2, 0),
135 	REG16(0x200),
136 	REG(0x028),
137 
138 	END
139 };
140 
141 static const u8 gen9_xcs_offsets[] = {
142 	NOP(1),
143 	LRI(14, POSTED),
144 	REG16(0x244),
145 	REG(0x034),
146 	REG(0x030),
147 	REG(0x038),
148 	REG(0x03c),
149 	REG(0x168),
150 	REG(0x140),
151 	REG(0x110),
152 	REG(0x11c),
153 	REG(0x114),
154 	REG(0x118),
155 	REG(0x1c0),
156 	REG(0x1c4),
157 	REG(0x1c8),
158 
159 	NOP(3),
160 	LRI(9, POSTED),
161 	REG16(0x3a8),
162 	REG16(0x28c),
163 	REG16(0x288),
164 	REG16(0x284),
165 	REG16(0x280),
166 	REG16(0x27c),
167 	REG16(0x278),
168 	REG16(0x274),
169 	REG16(0x270),
170 
171 	NOP(13),
172 	LRI(1, POSTED),
173 	REG16(0x200),
174 
175 	NOP(13),
176 	LRI(44, POSTED),
177 	REG(0x028),
178 	REG(0x09c),
179 	REG(0x0c0),
180 	REG(0x178),
181 	REG(0x17c),
182 	REG16(0x358),
183 	REG(0x170),
184 	REG(0x150),
185 	REG(0x154),
186 	REG(0x158),
187 	REG16(0x41c),
188 	REG16(0x600),
189 	REG16(0x604),
190 	REG16(0x608),
191 	REG16(0x60c),
192 	REG16(0x610),
193 	REG16(0x614),
194 	REG16(0x618),
195 	REG16(0x61c),
196 	REG16(0x620),
197 	REG16(0x624),
198 	REG16(0x628),
199 	REG16(0x62c),
200 	REG16(0x630),
201 	REG16(0x634),
202 	REG16(0x638),
203 	REG16(0x63c),
204 	REG16(0x640),
205 	REG16(0x644),
206 	REG16(0x648),
207 	REG16(0x64c),
208 	REG16(0x650),
209 	REG16(0x654),
210 	REG16(0x658),
211 	REG16(0x65c),
212 	REG16(0x660),
213 	REG16(0x664),
214 	REG16(0x668),
215 	REG16(0x66c),
216 	REG16(0x670),
217 	REG16(0x674),
218 	REG16(0x678),
219 	REG16(0x67c),
220 	REG(0x068),
221 
222 	END
223 };
224 
225 static const u8 gen12_xcs_offsets[] = {
226 	NOP(1),
227 	LRI(13, POSTED),
228 	REG16(0x244),
229 	REG(0x034),
230 	REG(0x030),
231 	REG(0x038),
232 	REG(0x03c),
233 	REG(0x168),
234 	REG(0x140),
235 	REG(0x110),
236 	REG(0x1c0),
237 	REG(0x1c4),
238 	REG(0x1c8),
239 	REG(0x180),
240 	REG16(0x2b4),
241 
242 	NOP(5),
243 	LRI(9, POSTED),
244 	REG16(0x3a8),
245 	REG16(0x28c),
246 	REG16(0x288),
247 	REG16(0x284),
248 	REG16(0x280),
249 	REG16(0x27c),
250 	REG16(0x278),
251 	REG16(0x274),
252 	REG16(0x270),
253 
254 	END
255 };
256 
257 static const u8 dg2_xcs_offsets[] = {
258 	NOP(1),
259 	LRI(15, POSTED),
260 	REG16(0x244),
261 	REG(0x034),
262 	REG(0x030),
263 	REG(0x038),
264 	REG(0x03c),
265 	REG(0x168),
266 	REG(0x140),
267 	REG(0x110),
268 	REG(0x1c0),
269 	REG(0x1c4),
270 	REG(0x1c8),
271 	REG(0x180),
272 	REG16(0x2b4),
273 	REG(0x120),
274 	REG(0x124),
275 
276 	NOP(1),
277 	LRI(9, POSTED),
278 	REG16(0x3a8),
279 	REG16(0x28c),
280 	REG16(0x288),
281 	REG16(0x284),
282 	REG16(0x280),
283 	REG16(0x27c),
284 	REG16(0x278),
285 	REG16(0x274),
286 	REG16(0x270),
287 
288 	END
289 };
290 
291 static const u8 gen8_rcs_offsets[] = {
292 	NOP(1),
293 	LRI(14, POSTED),
294 	REG16(0x244),
295 	REG(0x034),
296 	REG(0x030),
297 	REG(0x038),
298 	REG(0x03c),
299 	REG(0x168),
300 	REG(0x140),
301 	REG(0x110),
302 	REG(0x11c),
303 	REG(0x114),
304 	REG(0x118),
305 	REG(0x1c0),
306 	REG(0x1c4),
307 	REG(0x1c8),
308 
309 	NOP(3),
310 	LRI(9, POSTED),
311 	REG16(0x3a8),
312 	REG16(0x28c),
313 	REG16(0x288),
314 	REG16(0x284),
315 	REG16(0x280),
316 	REG16(0x27c),
317 	REG16(0x278),
318 	REG16(0x274),
319 	REG16(0x270),
320 
321 	NOP(13),
322 	LRI(1, 0),
323 	REG(0x0c8),
324 
325 	END
326 };
327 
328 static const u8 gen9_rcs_offsets[] = {
329 	NOP(1),
330 	LRI(14, POSTED),
331 	REG16(0x244),
332 	REG(0x34),
333 	REG(0x30),
334 	REG(0x38),
335 	REG(0x3c),
336 	REG(0x168),
337 	REG(0x140),
338 	REG(0x110),
339 	REG(0x11c),
340 	REG(0x114),
341 	REG(0x118),
342 	REG(0x1c0),
343 	REG(0x1c4),
344 	REG(0x1c8),
345 
346 	NOP(3),
347 	LRI(9, POSTED),
348 	REG16(0x3a8),
349 	REG16(0x28c),
350 	REG16(0x288),
351 	REG16(0x284),
352 	REG16(0x280),
353 	REG16(0x27c),
354 	REG16(0x278),
355 	REG16(0x274),
356 	REG16(0x270),
357 
358 	NOP(13),
359 	LRI(1, 0),
360 	REG(0xc8),
361 
362 	NOP(13),
363 	LRI(44, POSTED),
364 	REG(0x28),
365 	REG(0x9c),
366 	REG(0xc0),
367 	REG(0x178),
368 	REG(0x17c),
369 	REG16(0x358),
370 	REG(0x170),
371 	REG(0x150),
372 	REG(0x154),
373 	REG(0x158),
374 	REG16(0x41c),
375 	REG16(0x600),
376 	REG16(0x604),
377 	REG16(0x608),
378 	REG16(0x60c),
379 	REG16(0x610),
380 	REG16(0x614),
381 	REG16(0x618),
382 	REG16(0x61c),
383 	REG16(0x620),
384 	REG16(0x624),
385 	REG16(0x628),
386 	REG16(0x62c),
387 	REG16(0x630),
388 	REG16(0x634),
389 	REG16(0x638),
390 	REG16(0x63c),
391 	REG16(0x640),
392 	REG16(0x644),
393 	REG16(0x648),
394 	REG16(0x64c),
395 	REG16(0x650),
396 	REG16(0x654),
397 	REG16(0x658),
398 	REG16(0x65c),
399 	REG16(0x660),
400 	REG16(0x664),
401 	REG16(0x668),
402 	REG16(0x66c),
403 	REG16(0x670),
404 	REG16(0x674),
405 	REG16(0x678),
406 	REG16(0x67c),
407 	REG(0x68),
408 
409 	END
410 };
411 
412 static const u8 gen11_rcs_offsets[] = {
413 	NOP(1),
414 	LRI(15, POSTED),
415 	REG16(0x244),
416 	REG(0x034),
417 	REG(0x030),
418 	REG(0x038),
419 	REG(0x03c),
420 	REG(0x168),
421 	REG(0x140),
422 	REG(0x110),
423 	REG(0x11c),
424 	REG(0x114),
425 	REG(0x118),
426 	REG(0x1c0),
427 	REG(0x1c4),
428 	REG(0x1c8),
429 	REG(0x180),
430 
431 	NOP(1),
432 	LRI(9, POSTED),
433 	REG16(0x3a8),
434 	REG16(0x28c),
435 	REG16(0x288),
436 	REG16(0x284),
437 	REG16(0x280),
438 	REG16(0x27c),
439 	REG16(0x278),
440 	REG16(0x274),
441 	REG16(0x270),
442 
443 	LRI(1, POSTED),
444 	REG(0x1b0),
445 
446 	NOP(10),
447 	LRI(1, 0),
448 	REG(0x0c8),
449 
450 	END
451 };
452 
453 static const u8 gen12_rcs_offsets[] = {
454 	NOP(1),
455 	LRI(13, POSTED),
456 	REG16(0x244),
457 	REG(0x034),
458 	REG(0x030),
459 	REG(0x038),
460 	REG(0x03c),
461 	REG(0x168),
462 	REG(0x140),
463 	REG(0x110),
464 	REG(0x1c0),
465 	REG(0x1c4),
466 	REG(0x1c8),
467 	REG(0x180),
468 	REG16(0x2b4),
469 
470 	NOP(5),
471 	LRI(9, POSTED),
472 	REG16(0x3a8),
473 	REG16(0x28c),
474 	REG16(0x288),
475 	REG16(0x284),
476 	REG16(0x280),
477 	REG16(0x27c),
478 	REG16(0x278),
479 	REG16(0x274),
480 	REG16(0x270),
481 
482 	LRI(3, POSTED),
483 	REG(0x1b0),
484 	REG16(0x5a8),
485 	REG16(0x5ac),
486 
487 	NOP(6),
488 	LRI(1, 0),
489 	REG(0x0c8),
490 	NOP(3 + 9 + 1),
491 
492 	LRI(51, POSTED),
493 	REG16(0x588),
494 	REG16(0x588),
495 	REG16(0x588),
496 	REG16(0x588),
497 	REG16(0x588),
498 	REG16(0x588),
499 	REG(0x028),
500 	REG(0x09c),
501 	REG(0x0c0),
502 	REG(0x178),
503 	REG(0x17c),
504 	REG16(0x358),
505 	REG(0x170),
506 	REG(0x150),
507 	REG(0x154),
508 	REG(0x158),
509 	REG16(0x41c),
510 	REG16(0x600),
511 	REG16(0x604),
512 	REG16(0x608),
513 	REG16(0x60c),
514 	REG16(0x610),
515 	REG16(0x614),
516 	REG16(0x618),
517 	REG16(0x61c),
518 	REG16(0x620),
519 	REG16(0x624),
520 	REG16(0x628),
521 	REG16(0x62c),
522 	REG16(0x630),
523 	REG16(0x634),
524 	REG16(0x638),
525 	REG16(0x63c),
526 	REG16(0x640),
527 	REG16(0x644),
528 	REG16(0x648),
529 	REG16(0x64c),
530 	REG16(0x650),
531 	REG16(0x654),
532 	REG16(0x658),
533 	REG16(0x65c),
534 	REG16(0x660),
535 	REG16(0x664),
536 	REG16(0x668),
537 	REG16(0x66c),
538 	REG16(0x670),
539 	REG16(0x674),
540 	REG16(0x678),
541 	REG16(0x67c),
542 	REG(0x068),
543 	REG(0x084),
544 	NOP(1),
545 
546 	END
547 };
548 
549 static const u8 dg2_rcs_offsets[] = {
550 	NOP(1),
551 	LRI(15, POSTED),
552 	REG16(0x244),
553 	REG(0x034),
554 	REG(0x030),
555 	REG(0x038),
556 	REG(0x03c),
557 	REG(0x168),
558 	REG(0x140),
559 	REG(0x110),
560 	REG(0x1c0),
561 	REG(0x1c4),
562 	REG(0x1c8),
563 	REG(0x180),
564 	REG16(0x2b4),
565 	REG(0x120),
566 	REG(0x124),
567 
568 	NOP(1),
569 	LRI(9, POSTED),
570 	REG16(0x3a8),
571 	REG16(0x28c),
572 	REG16(0x288),
573 	REG16(0x284),
574 	REG16(0x280),
575 	REG16(0x27c),
576 	REG16(0x278),
577 	REG16(0x274),
578 	REG16(0x270),
579 
580 	LRI(3, POSTED),
581 	REG(0x1b0),
582 	REG16(0x5a8),
583 	REG16(0x5ac),
584 
585 	NOP(6),
586 	LRI(1, 0),
587 	REG(0x0c8),
588 
589 	END
590 };
591 
592 static const u8 mtl_rcs_offsets[] = {
593 	NOP(1),
594 	LRI(15, POSTED),
595 	REG16(0x244),
596 	REG(0x034),
597 	REG(0x030),
598 	REG(0x038),
599 	REG(0x03c),
600 	REG(0x168),
601 	REG(0x140),
602 	REG(0x110),
603 	REG(0x1c0),
604 	REG(0x1c4),
605 	REG(0x1c8),
606 	REG(0x180),
607 	REG16(0x2b4),
608 	REG(0x120),
609 	REG(0x124),
610 
611 	NOP(1),
612 	LRI(9, POSTED),
613 	REG16(0x3a8),
614 	REG16(0x28c),
615 	REG16(0x288),
616 	REG16(0x284),
617 	REG16(0x280),
618 	REG16(0x27c),
619 	REG16(0x278),
620 	REG16(0x274),
621 	REG16(0x270),
622 
623 	NOP(2),
624 	LRI(2, POSTED),
625 	REG16(0x5a8),
626 	REG16(0x5ac),
627 
628 	NOP(6),
629 	LRI(1, 0),
630 	REG(0x0c8),
631 
632 	END
633 };
634 
635 #undef END
636 #undef REG16
637 #undef REG
638 #undef LRI
639 #undef NOP
640 
641 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
642 {
643 	/*
644 	 * The gen12+ lists only have the registers we program in the basic
645 	 * default state. We rely on the context image using relative
646 	 * addressing to automatically fix up the register state between the
647 	 * physical engines of a virtual engine.
648 	 */
649 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
650 		   !intel_engine_has_relative_mmio(engine));
651 
652 	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
653 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
654 			return mtl_rcs_offsets;
655 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
656 			return dg2_rcs_offsets;
657 		else if (GRAPHICS_VER(engine->i915) >= 12)
658 			return gen12_rcs_offsets;
659 		else if (GRAPHICS_VER(engine->i915) >= 11)
660 			return gen11_rcs_offsets;
661 		else if (GRAPHICS_VER(engine->i915) >= 9)
662 			return gen9_rcs_offsets;
663 		else
664 			return gen8_rcs_offsets;
665 	} else {
666 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
667 			return dg2_xcs_offsets;
668 		else if (GRAPHICS_VER(engine->i915) >= 12)
669 			return gen12_xcs_offsets;
670 		else if (GRAPHICS_VER(engine->i915) >= 9)
671 			return gen9_xcs_offsets;
672 		else
673 			return gen8_xcs_offsets;
674 	}
675 }
676 
677 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
678 {
679 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
680 		return 0x70;
681 	else if (GRAPHICS_VER(engine->i915) >= 12)
682 		return 0x60;
683 	else if (GRAPHICS_VER(engine->i915) >= 9)
684 		return 0x54;
685 	else if (engine->class == RENDER_CLASS)
686 		return 0x58;
687 	else
688 		return -1;
689 }
690 
691 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
692 {
693 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
694 		return 0x80;
695 	else if (GRAPHICS_VER(engine->i915) >= 12)
696 		return 0x70;
697 	else if (GRAPHICS_VER(engine->i915) >= 9)
698 		return 0x64;
699 	else if (GRAPHICS_VER(engine->i915) >= 8 &&
700 		 engine->class == RENDER_CLASS)
701 		return 0xc4;
702 	else
703 		return -1;
704 }
705 
706 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
707 {
708 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
709 		return 0x84;
710 	else if (GRAPHICS_VER(engine->i915) >= 12)
711 		return 0x74;
712 	else if (GRAPHICS_VER(engine->i915) >= 9)
713 		return 0x68;
714 	else if (engine->class == RENDER_CLASS)
715 		return 0xd8;
716 	else
717 		return -1;
718 }
719 
720 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
721 {
722 	if (GRAPHICS_VER(engine->i915) >= 12)
723 		return 0x12;
724 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
725 		return 0x18;
726 	else
727 		return -1;
728 }
729 
730 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
731 {
732 	int x;
733 
734 	x = lrc_ring_wa_bb_per_ctx(engine);
735 	if (x < 0)
736 		return x;
737 
738 	return x + 2;
739 }
740 
741 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
742 {
743 	int x;
744 
745 	x = lrc_ring_indirect_ptr(engine);
746 	if (x < 0)
747 		return x;
748 
749 	return x + 2;
750 }
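/*
 * Added note for orientation: on Gen12+ the three helpers above describe
 * consecutive (reg, value) pairs in the context image - the per-context WA
 * batch pointer at dword 0x12, the indirect context pointer at 0x14 and the
 * indirect context offset at 0x16 - which is why the latter two are simply
 * derived as "+ 2" from lrc_ring_wa_bb_per_ctx().
 */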
751 
752 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
753 {
754 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
755 		/*
756 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
757 		 * simply to match the RCS context image layout.
758 		 */
759 		return 0xc6;
760 	else if (engine->class != RENDER_CLASS)
761 		return -1;
762 	else if (GRAPHICS_VER(engine->i915) >= 12)
763 		return 0xb6;
764 	else if (GRAPHICS_VER(engine->i915) >= 11)
765 		return 0xaa;
766 	else
767 		return -1;
768 }
769 
770 static u32
771 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
772 {
773 	if (GRAPHICS_VER(engine->i915) >= 12)
774 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
775 	else if (GRAPHICS_VER(engine->i915) >= 11)
776 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
777 	else if (GRAPHICS_VER(engine->i915) >= 9)
778 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
779 	else if (GRAPHICS_VER(engine->i915) >= 8)
780 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
781 
782 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
783 
784 	return 0;
785 }
786 
787 static void
788 lrc_setup_bb_per_ctx(u32 *regs,
789 		     const struct intel_engine_cs *engine,
790 		     u32 ctx_bb_ggtt_addr)
791 {
792 	GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
793 	regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
794 		ctx_bb_ggtt_addr |
795 		PER_CTX_BB_FORCE |
796 		PER_CTX_BB_VALID;
797 }
798 
799 static void
800 lrc_setup_indirect_ctx(u32 *regs,
801 		       const struct intel_engine_cs *engine,
802 		       u32 ctx_bb_ggtt_addr,
803 		       u32 size)
804 {
805 	GEM_BUG_ON(!size);
806 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
807 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
808 	regs[lrc_ring_indirect_ptr(engine) + 1] =
809 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
810 
811 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
812 	regs[lrc_ring_indirect_offset(engine) + 1] =
813 		lrc_ring_indirect_offset_default(engine) << 6;
814 }
815 
816 static bool ctx_needs_runalone(const struct intel_context *ce)
817 {
818 	struct i915_gem_context *gem_ctx;
819 	bool ctx_is_protected = false;
820 
821 	/*
822 	 * Wa_14019159160 - Case 2.
823 	 * On some platforms, protected contexts require setting
824 	 * the LRC run-alone bit or else the encryption/decryption will not happen.
825 	 * NOTE: Case 2 only applies to PXP use-case of said workaround.
826 	 */
827 	if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
828 	    (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
829 		rcu_read_lock();
830 		gem_ctx = rcu_dereference(ce->gem_context);
831 		if (gem_ctx)
832 			ctx_is_protected = gem_ctx->uses_protected_content;
833 		rcu_read_unlock();
834 	}
835 
836 	return ctx_is_protected;
837 }
838 
839 static void init_common_regs(u32 * const regs,
840 			     const struct intel_context *ce,
841 			     const struct intel_engine_cs *engine,
842 			     bool inhibit)
843 {
844 	u32 ctl;
845 	int loc;
846 
847 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
848 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
849 	if (inhibit)
850 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
851 	if (GRAPHICS_VER(engine->i915) < 11)
852 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
853 					   CTX_CTRL_RS_CTX_ENABLE);
854 	/* Wa_14019159160 - Case 2.*/
855 	if (ctx_needs_runalone(ce))
856 		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
857 	regs[CTX_CONTEXT_CONTROL] = ctl;
858 
859 	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
860 
861 	loc = lrc_ring_bb_offset(engine);
862 	if (loc != -1)
863 		regs[loc + 1] = 0;
864 }
865 
866 static void init_wa_bb_regs(u32 * const regs,
867 			    const struct intel_engine_cs *engine)
868 {
869 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
870 
871 	if (wa_ctx->per_ctx.size) {
872 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
873 
874 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
875 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
876 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
877 	}
878 
879 	if (wa_ctx->indirect_ctx.size) {
880 		lrc_setup_indirect_ctx(regs, engine,
881 				       i915_ggtt_offset(wa_ctx->vma) +
882 				       wa_ctx->indirect_ctx.offset,
883 				       wa_ctx->indirect_ctx.size);
884 	}
885 }
886 
887 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
888 {
889 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
890 		/* 64b PPGTT (48bit canonical)
891 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
892 		 * other PDP Descriptors are ignored.
893 		 */
894 		ASSIGN_CTX_PML4(ppgtt, regs);
895 	} else {
896 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
897 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
898 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
899 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
900 	}
901 }
902 
903 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
904 {
905 	if (i915_is_ggtt(vm))
906 		return i915_vm_to_ggtt(vm)->alias;
907 	else
908 		return i915_vm_to_ppgtt(vm);
909 }
910 
911 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
912 {
913 	int x;
914 
915 	x = lrc_ring_mi_mode(engine);
916 	if (x != -1) {
917 		regs[x + 1] &= ~STOP_RING;
918 		regs[x + 1] |= STOP_RING << 16;
919 	}
920 }
921 
922 static void __lrc_init_regs(u32 *regs,
923 			    const struct intel_context *ce,
924 			    const struct intel_engine_cs *engine,
925 			    bool inhibit)
926 {
927 	/*
928 	 * A context is actually a big batch buffer with several
929 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
930 	 * values we are setting here are only for the first context restore:
931 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
932 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
933 	 * we are not initializing here).
934 	 *
935 	 * Must keep consistent with virtual_update_register_offsets().
936 	 */
937 
938 	if (inhibit)
939 		memset(regs, 0, PAGE_SIZE);
940 
941 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
942 
943 	init_common_regs(regs, ce, engine, inhibit);
944 	init_ppgtt_regs(regs, vm_alias(ce->vm));
945 
946 	init_wa_bb_regs(regs, engine);
947 
948 	__reset_stop_ring(regs, engine);
949 }
950 
951 void lrc_init_regs(const struct intel_context *ce,
952 		   const struct intel_engine_cs *engine,
953 		   bool inhibit)
954 {
955 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
956 }
957 
958 void lrc_reset_regs(const struct intel_context *ce,
959 		    const struct intel_engine_cs *engine)
960 {
961 	__reset_stop_ring(ce->lrc_reg_state, engine);
962 }
963 
964 static void
965 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
966 {
967 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
968 		return;
969 
970 	vaddr += engine->context_size;
971 
972 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
973 }
974 
975 static void
976 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
977 {
978 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
979 		return;
980 
981 	vaddr += engine->context_size;
982 
983 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
984 		drm_err_once(&engine->i915->drm,
985 			     "%s context redzone overwritten!\n",
986 			     engine->name);
987 }
988 
989 static u32 context_wa_bb_offset(const struct intel_context *ce)
990 {
991 	return PAGE_SIZE * ce->wa_bb_page;
992 }
993 
994 /*
995  * per_ctx below determines which WABB section is used.
996  * When true, the function returns the location of the
997  * PER_CTX_BB.  When false, the function returns the
998  * location of the INDIRECT_CTX.
999  */
1000 static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
1001 {
1002 	void *ptr;
1003 
1004 	GEM_BUG_ON(!ce->wa_bb_page);
1005 
1006 	ptr = ce->lrc_reg_state;
1007 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1008 	ptr += context_wa_bb_offset(ce);
1009 	ptr += per_ctx ? PAGE_SIZE : 0;
1010 
1011 	return ptr;
1012 }
1013 
1014 void lrc_init_state(struct intel_context *ce,
1015 		    struct intel_engine_cs *engine,
1016 		    void *state)
1017 {
1018 	bool inhibit = true;
1019 
1020 	set_redzone(state, engine);
1021 
1022 	if (ce->default_state) {
1023 		shmem_read(ce->default_state, 0, state, engine->context_size);
1024 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
1025 		inhibit = false;
1026 	}
1027 
1028 	/* Clear the ppHWSP (inc. per-context counters) */
1029 	memset(state, 0, PAGE_SIZE);
1030 
1031 	/* Clear the indirect wa and storage */
1032 	if (ce->wa_bb_page)
1033 		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1034 
1035 	/*
1036 	 * The second page of the context object contains some registers which
1037 	 * must be set up prior to the first execution.
1038 	 */
1039 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1040 }
1041 
1042 u32 lrc_indirect_bb(const struct intel_context *ce)
1043 {
1044 	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1045 }
1046 
1047 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1048 {
1049 	/* If predication is active, this will be noop'ed */
1050 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1051 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1052 	*cs++ = 0;
1053 	*cs++ = 0; /* No predication */
1054 
1055 	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1056 	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
1057 	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1058 
1059 	/* Instructions are no longer predicated (disabled), we can proceed */
1060 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1061 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1062 	*cs++ = 0;
1063 	*cs++ = 1; /* enable predication before the next BB */
1064 
1065 	*cs++ = MI_BATCH_BUFFER_END;
1066 	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1067 
1068 	return cs;
1069 }
1070 
1071 static struct i915_vma *
1072 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1073 {
1074 	struct drm_i915_gem_object *obj;
1075 	struct i915_vma *vma;
1076 	u32 context_size;
1077 
1078 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1079 
1080 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1081 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1082 
1083 	if (GRAPHICS_VER(engine->i915) >= 12) {
1084 		ce->wa_bb_page = context_size / PAGE_SIZE;
1085 		/* INDIRECT_CTX and PER_CTX_BB need separate pages. */
1086 		context_size += PAGE_SIZE * 2;
1087 	}
1088 
1089 	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1090 		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1091 		context_size += PARENT_SCRATCH_SIZE;
1092 	}
1093 
1094 	obj = i915_gem_object_create_lmem(engine->i915, context_size,
1095 					  I915_BO_ALLOC_PM_VOLATILE);
1096 	if (IS_ERR(obj)) {
1097 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
1098 		if (IS_ERR(obj))
1099 			return ERR_CAST(obj);
1100 
1101 		/*
1102 		 * Wa_22016122933: For Media version 13.0, all Media GT shared
1103 		 * memory needs to be mapped as WC on CPU side and UC (PAT
1104 		 * index 2) on GPU side.
1105 		 */
1106 		if (intel_gt_needs_wa_22016122933(engine->gt))
1107 			i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1108 	}
1109 
1110 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1111 	if (IS_ERR(vma)) {
1112 		i915_gem_object_put(obj);
1113 		return vma;
1114 	}
1115 
1116 	return vma;
1117 }
1118 
1119 static struct intel_timeline *
1120 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1121 {
1122 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1123 
1124 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1125 }
1126 
1127 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1128 {
1129 	struct intel_ring *ring;
1130 	struct i915_vma *vma;
1131 	int err;
1132 
1133 	GEM_BUG_ON(ce->state);
1134 
1135 	if (!intel_context_has_own_state(ce))
1136 		ce->default_state = engine->default_state;
1137 
1138 	vma = __lrc_alloc_state(ce, engine);
1139 	if (IS_ERR(vma))
1140 		return PTR_ERR(vma);
1141 
1142 	ring = intel_engine_create_ring(engine, ce->ring_size);
1143 	if (IS_ERR(ring)) {
1144 		err = PTR_ERR(ring);
1145 		goto err_vma;
1146 	}
1147 
1148 	if (!page_mask_bits(ce->timeline)) {
1149 		struct intel_timeline *tl;
1150 
1151 		/*
1152 		 * Use the static global HWSP for the kernel context, and
1153 		 * a dynamically allocated cacheline for everyone else.
1154 		 */
1155 		if (unlikely(ce->timeline))
1156 			tl = pinned_timeline(ce, engine);
1157 		else
1158 			tl = intel_timeline_create(engine->gt);
1159 		if (IS_ERR(tl)) {
1160 			err = PTR_ERR(tl);
1161 			goto err_ring;
1162 		}
1163 
1164 		ce->timeline = tl;
1165 	}
1166 
1167 	ce->ring = ring;
1168 	ce->state = vma;
1169 
1170 	return 0;
1171 
1172 err_ring:
1173 	intel_ring_put(ring);
1174 err_vma:
1175 	i915_vma_put(vma);
1176 	return err;
1177 }
1178 
1179 void lrc_reset(struct intel_context *ce)
1180 {
1181 	GEM_BUG_ON(!intel_context_is_pinned(ce));
1182 
1183 	intel_ring_reset(ce->ring, ce->ring->emit);
1184 
1185 	/* Scrub away the garbage */
1186 	lrc_init_regs(ce, ce->engine, true);
1187 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1188 }
1189 
1190 int
1191 lrc_pre_pin(struct intel_context *ce,
1192 	    struct intel_engine_cs *engine,
1193 	    struct i915_gem_ww_ctx *ww,
1194 	    void **vaddr)
1195 {
1196 	GEM_BUG_ON(!ce->state);
1197 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1198 
1199 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1200 					 intel_gt_coherent_map_type(ce->engine->gt,
1201 								    ce->state->obj,
1202 								    false) |
1203 					 I915_MAP_OVERRIDE);
1204 
1205 	return PTR_ERR_OR_ZERO(*vaddr);
1206 }
1207 
1208 int
1209 lrc_pin(struct intel_context *ce,
1210 	struct intel_engine_cs *engine,
1211 	void *vaddr)
1212 {
1213 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1214 
1215 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1216 		lrc_init_state(ce, engine, vaddr);
1217 
1218 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1219 	return 0;
1220 }
1221 
1222 void lrc_unpin(struct intel_context *ce)
1223 {
1224 	if (unlikely(ce->parallel.last_rq)) {
1225 		i915_request_put(ce->parallel.last_rq);
1226 		ce->parallel.last_rq = NULL;
1227 	}
1228 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1229 		      ce->engine);
1230 }
1231 
1232 void lrc_post_unpin(struct intel_context *ce)
1233 {
1234 	i915_gem_object_unpin_map(ce->state->obj);
1235 }
1236 
1237 void lrc_fini(struct intel_context *ce)
1238 {
1239 	if (!ce->state)
1240 		return;
1241 
1242 	intel_ring_put(fetch_and_zero(&ce->ring));
1243 	i915_vma_put(fetch_and_zero(&ce->state));
1244 }
1245 
1246 void lrc_destroy(struct kref *kref)
1247 {
1248 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1249 
1250 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1251 	GEM_BUG_ON(intel_context_is_pinned(ce));
1252 
1253 	lrc_fini(ce);
1254 
1255 	intel_context_fini(ce);
1256 	intel_context_free(ce);
1257 }
1258 
1259 static u32 *
1260 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1261 {
1262 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1263 		MI_SRM_LRM_GLOBAL_GTT |
1264 		MI_LRI_LRM_CS_MMIO;
1265 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1266 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1267 		CTX_TIMESTAMP * sizeof(u32);
1268 	*cs++ = 0;
1269 
1270 	*cs++ = MI_LOAD_REGISTER_REG |
1271 		MI_LRR_SOURCE_CS_MMIO |
1272 		MI_LRI_LRM_CS_MMIO;
1273 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1274 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1275 
1276 	*cs++ = MI_LOAD_REGISTER_REG |
1277 		MI_LRR_SOURCE_CS_MMIO |
1278 		MI_LRI_LRM_CS_MMIO;
1279 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1280 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1281 
1282 	return cs;
1283 }
1284 
1285 static u32 *
1286 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1287 {
1288 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1289 
1290 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1291 		MI_SRM_LRM_GLOBAL_GTT |
1292 		MI_LRI_LRM_CS_MMIO;
1293 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1294 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1295 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1296 	*cs++ = 0;
1297 
1298 	return cs;
1299 }
1300 
1301 static u32 *
1302 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1303 {
1304 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1305 
1306 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1307 		MI_SRM_LRM_GLOBAL_GTT |
1308 		MI_LRI_LRM_CS_MMIO;
1309 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1310 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1311 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1312 	*cs++ = 0;
1313 
1314 	*cs++ = MI_LOAD_REGISTER_REG |
1315 		MI_LRR_SOURCE_CS_MMIO |
1316 		MI_LRI_LRM_CS_MMIO;
1317 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1318 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1319 
1320 	return cs;
1321 }
1322 
1323 /*
1324  * The bspec's tuning guide asks us to program a vertical watermark value of
1325  * 0x3FF.  However this register is not saved/restored properly by the
1326  * hardware, so we're required to apply the desired value via INDIRECT_CTX
1327  * batch buffer to ensure the value takes effect properly.  All other bits
1328  * in this register should remain at 0 (the hardware default).
1329  */
1330 static u32 *
1331 dg2_emit_draw_watermark_setting(u32 *cs)
1332 {
1333 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1334 	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1335 	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1336 
1337 	return cs;
1338 }
1339 
1340 static u32 *
1341 gen12_invalidate_state_cache(u32 *cs)
1342 {
1343 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1344 	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
1345 	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1346 	return cs;
1347 }
1348 
1349 static u32 *
1350 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1351 {
1352 	cs = gen12_emit_timestamp_wa(ce, cs);
1353 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1354 	cs = gen12_emit_restore_scratch(ce, cs);
1355 
1356 	/* Wa_16013000631:dg2 */
1357 	if (IS_DG2_G11(ce->engine->i915))
1358 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1359 
1360 	cs = gen12_emit_aux_table_inv(ce->engine, cs);
1361 
1362 	/* Wa_18022495364 */
1363 	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
1364 		cs = gen12_invalidate_state_cache(cs);
1365 
1366 	/* Wa_16014892111 */
1367 	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1368 	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
1369 	    IS_DG2(ce->engine->i915))
1370 		cs = dg2_emit_draw_watermark_setting(cs);
1371 
1372 	return cs;
1373 }
1374 
1375 static u32 *
1376 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1377 {
1378 	cs = gen12_emit_timestamp_wa(ce, cs);
1379 	cs = gen12_emit_restore_scratch(ce, cs);
1380 
1381 	/* Wa_16013000631:dg2 */
1382 	if (IS_DG2_G11(ce->engine->i915))
1383 		if (ce->engine->class == COMPUTE_CLASS)
1384 			cs = gen8_emit_pipe_control(cs,
1385 						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1386 						    0);
1387 
1388 	return gen12_emit_aux_table_inv(ce->engine, cs);
1389 }
1390 
1391 static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
1392 {
1393 	struct intel_gt *gt = ce->engine->gt;
1394 	int mocs = gt->mocs.uc_index << 1;
1395 
1396 	/*
1397 	 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
1398 	 * main copy engine arbitration into round robin mode.  We
1399 	 * additionally need to submit the following WABB blt command
1400 	 * to produce 4 subblits with each subblit generating 0 byte
1401 	 * write requests as WABB:
1402 	 *
1403 	 * XY_FASTCOLOR_BLT
1404 	 *  BG0    -> 5100000E
1405 	 *  BG1    -> 0000003F (Dest pitch)
1406 	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
1407 	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
1408 	 *  BG4    -> scratch
1409 	 *  BG5    -> scratch
1410 	 *  BG6-12 -> 00000000
1411 	 *  BG13   -> 20004004 (Surf. Width= 2,Surf. Height = 5 )
1412 	 *  BG14   -> 00000010 (Qpitch = 4)
1413 	 *  BG15   -> 00000000
1414 	 */
1415 	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
1416 	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
1417 	*cs++ = 0;
1418 	*cs++ = 4 << 16 | 1;
1419 	*cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1420 	*cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1421 	*cs++ = 0;
1422 	*cs++ = 0;
1423 	*cs++ = 0;
1424 	*cs++ = 0;
1425 	*cs++ = 0;
1426 	*cs++ = 0;
1427 	*cs++ = 0;
1428 	*cs++ = 0x20004004;
1429 	*cs++ = 0x10;
1430 	*cs++ = 0;
1431 
1432 	return cs;
1433 }
1434 
1435 static u32 *
1436 xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
1437 {
1438 	/* Wa_16018031267, Wa_16018063123 */
1439 	if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
1440 		cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
1441 
1442 	return cs;
1443 }
1444 
1445 static void
1446 setup_per_ctx_bb(const struct intel_context *ce,
1447 		 const struct intel_engine_cs *engine,
1448 		 u32 *(*emit)(const struct intel_context *, u32 *))
1449 {
1450 	/* Place PER_CTX_BB on next page after INDIRECT_CTX */
1451 	u32 * const start = context_wabb(ce, true);
1452 	u32 *cs;
1453 
1454 	cs = emit(ce, start);
1455 
1456 	/* PER_CTX_BB must manually terminate */
1457 	*cs++ = MI_BATCH_BUFFER_END;
1458 
1459 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1460 	lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
1461 			     lrc_indirect_bb(ce) + PAGE_SIZE);
1462 }
1463 
1464 static void
1465 setup_indirect_ctx_bb(const struct intel_context *ce,
1466 		      const struct intel_engine_cs *engine,
1467 		      u32 *(*emit)(const struct intel_context *, u32 *))
1468 {
1469 	u32 * const start = context_wabb(ce, false);
1470 	u32 *cs;
1471 
1472 	cs = emit(ce, start);
1473 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1474 	while ((unsigned long)cs % CACHELINE_BYTES)
1475 		*cs++ = MI_NOOP;
1476 
1477 	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1478 	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1479 
1480 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1481 			       lrc_indirect_bb(ce),
1482 			       (cs - start) * sizeof(*cs));
1483 }
1484 
1485 /*
1486  * The context descriptor encodes various attributes of a context,
1487  * including its GTT address and some flags. Because it's fairly
1488  * expensive to calculate, we'll just do it once and cache the result,
1489  * which remains valid until the context is unpinned.
1490  *
1491  * This is what a descriptor looks like, from LSB to MSB::
1492  *
1493  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1494  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1495  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1496  *      bits 53-54:    mbz, reserved for use by hardware
1497  *      bits 55-63:    group ID, currently unused and set to 0
1498  *
1499  * Starting from Gen11, the upper dword of the descriptor has a new format:
1500  *
1501  *      bits 32-36:    reserved
1502  *      bits 37-47:    SW context ID
1503  *      bits 48-53:    engine instance
1504  *      bit 54:        mbz, reserved for use by hardware
1505  *      bits 55-60:    SW counter
1506  *      bits 61-63:    engine class
1507  *
1508  * On Xe_HP, the upper dword of the descriptor has a new format:
1509  *
1510  *      bits 32-37:    virtual function number
1511  *      bit 38:        mbz, reserved for use by hardware
1512  *      bits 39-54:    SW context ID
1513  *      bits 55-57:    reserved
1514  *      bits 58-63:    SW counter
1515  *
1516  * engine info, SW context ID and SW counter need to form a unique number
1517  * (Context ID) per lrc.
1518  */
1519 static u32 lrc_descriptor(const struct intel_context *ce)
1520 {
1521 	u32 desc;
1522 
1523 	desc = INTEL_LEGACY_32B_CONTEXT;
1524 	if (i915_vm_is_4lvl(ce->vm))
1525 		desc = INTEL_LEGACY_64B_CONTEXT;
1526 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1527 
1528 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1529 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1530 		desc |= GEN8_CTX_L3LLC_COHERENT;
1531 
1532 	return i915_ggtt_offset(ce->state) | desc;
1533 }
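/*
 * Worked example (added for illustration, using the layout documented above):
 * for a context on a 4-level ppgtt whose state object sits at GGTT offset X,
 * lrc_descriptor() returns
 *   X | (INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT) |
 *   GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE
 * (Gen8 additionally ORs in GEN8_CTX_L3LLC_COHERENT). The GEN8_CTX_* flags
 * occupy bits 0-11 and the page-aligned X supplies the LRCA in bits 12-31.
 */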
1534 
1535 u32 lrc_update_regs(const struct intel_context *ce,
1536 		    const struct intel_engine_cs *engine,
1537 		    u32 head)
1538 {
1539 	struct intel_ring *ring = ce->ring;
1540 	u32 *regs = ce->lrc_reg_state;
1541 
1542 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1543 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1544 
1545 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1546 	regs[CTX_RING_HEAD] = head;
1547 	regs[CTX_RING_TAIL] = ring->tail;
1548 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1549 
1550 	/* RPCS */
1551 	if (engine->class == RENDER_CLASS) {
1552 		regs[CTX_R_PWR_CLK_STATE] =
1553 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1554 
1555 		i915_oa_init_reg_state(ce, engine);
1556 	}
1557 
1558 	if (ce->wa_bb_page) {
1559 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1560 
1561 		fn = gen12_emit_indirect_ctx_xcs;
1562 		if (ce->engine->class == RENDER_CLASS)
1563 			fn = gen12_emit_indirect_ctx_rcs;
1564 
1565 		/* Mutually exclusive wrt the global indirect bb */
1566 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1567 		setup_indirect_ctx_bb(ce, engine, fn);
1568 		setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
1569 	}
1570 
1571 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1572 }
1573 
1574 void lrc_update_offsets(struct intel_context *ce,
1575 			struct intel_engine_cs *engine)
1576 {
1577 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1578 }
1579 
1580 void lrc_check_regs(const struct intel_context *ce,
1581 		    const struct intel_engine_cs *engine,
1582 		    const char *when)
1583 {
1584 	const struct intel_ring *ring = ce->ring;
1585 	u32 *regs = ce->lrc_reg_state;
1586 	bool valid = true;
1587 	int x;
1588 
1589 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1590 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1591 		       engine->name,
1592 		       regs[CTX_RING_START],
1593 		       i915_ggtt_offset(ring->vma));
1594 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1595 		valid = false;
1596 	}
1597 
1598 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1599 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1600 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1601 		       engine->name,
1602 		       regs[CTX_RING_CTL],
1603 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1604 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1605 		valid = false;
1606 	}
1607 
1608 	x = lrc_ring_mi_mode(engine);
1609 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1610 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1611 		       engine->name, regs[x + 1]);
1612 		regs[x + 1] &= ~STOP_RING;
1613 		regs[x + 1] |= STOP_RING << 16;
1614 		valid = false;
1615 	}
1616 
1617 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1618 }
1619 
1620 /*
1621  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1622  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1623  * but there is a slight complication as this is applied in WA batch where the
1624  * values are only initialized once so we cannot take register value at the
1625  * beginning and reuse it further; hence we save its value to memory, upload a
1626  * constant value with bit21 set and then we restore it back with the saved value.
1627  * To simplify the WA, a constant value is formed by using the default value
1628  * of this register. This shouldn't be a problem because we are only modifying
1629  * it for a short period and this batch is non-preemptible. We can of course
1630  * use additional instructions that read the actual value of the register
1631  * at that time and set our bit of interest but it makes the WA complicated.
1632  *
1633  * This WA is also required for Gen9 so extracting as a function avoids
1634  * code duplication.
1635  */
1636 static u32 *
1637 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1638 {
1639 	/* NB no one else is allowed to scribble over scratch + 256! */
1640 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1641 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1642 	*batch++ = intel_gt_scratch_offset(engine->gt,
1643 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1644 	*batch++ = 0;
1645 
1646 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1647 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1648 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1649 
1650 	batch = gen8_emit_pipe_control(batch,
1651 				       PIPE_CONTROL_CS_STALL |
1652 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1653 				       0);
1654 
1655 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1656 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1657 	*batch++ = intel_gt_scratch_offset(engine->gt,
1658 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1659 	*batch++ = 0;
1660 
1661 	return batch;
1662 }
1663 
1664 /*
1665  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1666  * initialized at the beginning and shared across all contexts but this field
1667  * helps us to have multiple batches at different offsets and select them based
1668  * on a criteria. At the moment this batch always start at the beginning of the page
1669  * on some criterion. At the moment this batch always starts at the beginning of the page
1670  *
1671  * The number of WAs applied is not known at the beginning; we use this field
1672  * to return the number of DWORDs written.
1673  *
1674  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1675  * so it adds NOOPs as padding to make it cacheline aligned.
1676  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
1677  * make a complete batch buffer.
1678  */
1679 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1680 {
1681 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1682 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1683 
1684 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1685 	if (IS_BROADWELL(engine->i915))
1686 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1687 
1688 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1689 	/* Actual scratch location is at 128 bytes offset */
1690 	batch = gen8_emit_pipe_control(batch,
1691 				       PIPE_CONTROL_FLUSH_L3 |
1692 				       PIPE_CONTROL_STORE_DATA_INDEX |
1693 				       PIPE_CONTROL_CS_STALL |
1694 				       PIPE_CONTROL_QW_WRITE,
1695 				       LRC_PPHWSP_SCRATCH_ADDR);
1696 
1697 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1698 
1699 	/* Pad to end of cacheline */
1700 	while ((unsigned long)batch % CACHELINE_BYTES)
1701 		*batch++ = MI_NOOP;
1702 
1703 	/*
1704 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1705 	 * execution depends on the length specified in terms of cache lines
1706 	 * in the register CTX_RCS_INDIRECT_CTX
1707 	 */
1708 
1709 	return batch;
1710 }
1711 
1712 struct lri {
1713 	i915_reg_t reg;
1714 	u32 value;
1715 };
1716 
1717 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1718 {
1719 	GEM_BUG_ON(!count || count > 63);
1720 
1721 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1722 	do {
1723 		*batch++ = i915_mmio_reg_offset(lri->reg);
1724 		*batch++ = lri->value;
1725 	} while (lri++, --count);
1726 	*batch++ = MI_NOOP;
1727 
1728 	return batch;
1729 }
1730 
1731 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1732 {
1733 	static const struct lri lri[] = {
1734 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1735 		{
1736 			COMMON_SLICE_CHICKEN2,
1737 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1738 				       0),
1739 		},
1740 
1741 		/* BSpec: 11391 */
1742 		{
1743 			FF_SLICE_CHICKEN,
1744 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1745 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1746 		},
1747 
1748 		/* BSpec: 11299 */
1749 		{
1750 			_3D_CHICKEN3,
1751 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1752 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1753 		}
1754 	};
1755 
1756 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1757 
1758 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1759 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1760 
1761 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1762 	batch = gen8_emit_pipe_control(batch,
1763 				       PIPE_CONTROL_FLUSH_L3 |
1764 				       PIPE_CONTROL_STORE_DATA_INDEX |
1765 				       PIPE_CONTROL_CS_STALL |
1766 				       PIPE_CONTROL_QW_WRITE,
1767 				       LRC_PPHWSP_SCRATCH_ADDR);
1768 
1769 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1770 
1771 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1772 	if (HAS_POOLED_EU(engine->i915)) {
1773 		/*
1774 		 * EU pool configuration is set up along with the golden context
1775 		 * during context initialization. This value depends on
1776 		 * device type (2x6 or 3x6) and needs to be updated based
1777 		 * on which subslice is disabled especially for 2x6
1778 		 * devices, however it is safe to load default
1779 		 * configuration of 3x6 device instead of masking off
1780 		 * corresponding bits because HW ignores bits of a disabled
1781 		 * subslice and drops down to appropriate config. Please
1782 		 * see render_state_setup() in i915_gem_render_state.c for
1783 		 * possible configurations, to avoid duplication they are
1784 		 * not shown here again.
1785 		 */
1786 		*batch++ = GEN9_MEDIA_POOL_STATE;
1787 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1788 		*batch++ = 0x00777000;
1789 		*batch++ = 0;
1790 		*batch++ = 0;
1791 		*batch++ = 0;
1792 	}
1793 
1794 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1795 
1796 	/* Pad to end of cacheline */
1797 	while ((unsigned long)batch % CACHELINE_BYTES)
1798 		*batch++ = MI_NOOP;
1799 
1800 	return batch;
1801 }
1802 
1803 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1804 
1805 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1806 {
1807 	struct drm_i915_gem_object *obj;
1808 	struct i915_vma *vma;
1809 	int err;
1810 
1811 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1812 	if (IS_ERR(obj))
1813 		return PTR_ERR(obj);
1814 
1815 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1816 	if (IS_ERR(vma)) {
1817 		err = PTR_ERR(vma);
1818 		goto err;
1819 	}
1820 
1821 	engine->wa_ctx.vma = vma;
1822 	return 0;
1823 
1824 err:
1825 	i915_gem_object_put(obj);
1826 	return err;
1827 }
1828 
1829 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1830 {
1831 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1832 }
1833 
1834 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1835 
1836 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1837 {
1838 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1839 	struct i915_wa_ctx_bb *wa_bb[] = {
1840 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1841 	};
1842 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1843 	struct i915_gem_ww_ctx ww;
1844 	void *batch, *batch_ptr;
1845 	unsigned int i;
1846 	int err;
1847 
1848 	if (GRAPHICS_VER(engine->i915) >= 11 ||
1849 	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1850 		return;
1851 
1852 	if (GRAPHICS_VER(engine->i915) == 9) {
1853 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1854 		wa_bb_fn[1] = NULL;
1855 	} else if (GRAPHICS_VER(engine->i915) == 8) {
1856 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1857 		wa_bb_fn[1] = NULL;
1858 	}
1859 
1860 	err = lrc_create_wa_ctx(engine);
1861 	if (err) {
1862 		/*
1863 		 * We continue even if we fail to initialize the WA batch
1864 		 * because we only expect rare glitches, nothing critical
1865 		 * enough to prevent us from using the GPU
1866 		 */
1867 		drm_err(&engine->i915->drm,
1868 			"Ignoring context switch w/a allocation error:%d\n",
1869 			err);
1870 		return;
1871 	}
1872 
1873 	if (!engine->wa_ctx.vma)
1874 		return;
1875 
1876 	i915_gem_ww_ctx_init(&ww, true);
1877 retry:
1878 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1879 	if (!err)
1880 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1881 	if (err)
1882 		goto err;
1883 
1884 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1885 	if (IS_ERR(batch)) {
1886 		err = PTR_ERR(batch);
1887 		goto err_unpin;
1888 	}
1889 
1890 	/*
1891 	 * Emit the two workaround batch buffers, recording the offset from the
1892 	 * start of the workaround batch buffer object for each and their
1893 	 * respective sizes.
1894 	 */
1895 	batch_ptr = batch;
1896 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1897 		wa_bb[i]->offset = batch_ptr - batch;
1898 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1899 						  CACHELINE_BYTES))) {
1900 			err = -EINVAL;
1901 			break;
1902 		}
1903 		if (wa_bb_fn[i])
1904 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1905 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1906 	}
1907 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1908 
1909 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1910 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1911 
1912 	/* Verify that we can handle failure to setup the wa_ctx */
1913 	if (!err)
1914 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1915 
1916 err_unpin:
1917 	if (err)
1918 		i915_vma_unpin(wa_ctx->vma);
1919 err:
1920 	if (err == -EDEADLK) {
1921 		err = i915_gem_ww_ctx_backoff(&ww);
1922 		if (!err)
1923 			goto retry;
1924 	}
1925 	i915_gem_ww_ctx_fini(&ww);
1926 
1927 	if (err) {
1928 		i915_vma_put(engine->wa_ctx.vma);
1929 
1930 		/* Clear all flags to prevent further use */
1931 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1932 	}
1933 }
1934 
1935 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1936 {
1937 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1938 	stats->runtime.num_underflow++;
1939 	stats->runtime.max_underflow =
1940 		max_t(u32, stats->runtime.max_underflow, -dt);
1941 #endif
1942 }
1943 
1944 static u32 lrc_get_runtime(const struct intel_context *ce)
1945 {
1946 	/*
1947 	 * We can use either ppHWSP[16] which is recorded before the context
1948 	 * switch (and so excludes the cost of context switches) or use the
1949 	 * value from the context image itself, which is saved/restored earlier
1950 	 * and so includes the cost of the save.
1951 	 */
1952 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1953 }
1954 
1955 void lrc_update_runtime(struct intel_context *ce)
1956 {
1957 	struct intel_context_stats *stats = &ce->stats;
1958 	u32 old;
1959 	s32 dt;
1960 
1961 	old = stats->runtime.last;
1962 	stats->runtime.last = lrc_get_runtime(ce);
1963 	dt = stats->runtime.last - old;
1964 	if (!dt)
1965 		return;
1966 
1967 	if (unlikely(dt < 0)) {
1968 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1969 			 old, stats->runtime.last, dt);
1970 		st_runtime_underflow(stats, dt);
1971 		return;
1972 	}
1973 
1974 	ewma_runtime_add(&stats->runtime.avg, dt);
1975 	stats->runtime.total += dt;
1976 }
1977 
1978 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1979 #include "selftest_lrc.c"
1980 #endif
1981