1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2014 Intel Corporation
4 */
5
6 #include "gem/i915_gem_lmem.h"
7
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22
23 /*
24 * The per-platform tables are u8-encoded in @data. Decode @data and set the
25 * addresses' offset and commands in @regs. The following encoding is used
26 * for each byte. There are 2 steps: decoding commands and decoding addresses.
27 *
28 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating an MI_LOAD_REGISTER_IMM command, allow setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to set values for in the
 *        case of MI_LOAD_REGISTER_IMM
34 *
35 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
36 * number of registers. They are set by using the REG/REG16 macros: the former
37 * is used for offsets smaller than 0x200 while the latter is for values bigger
38 * than that. Those macros already set all the bits documented below correctly:
39 *
 * [7]: when a register offset needs more than 6 bits, additional bytes
 *      follow for the lower bits
42 * [6:0]: Register offset, without considering the engine base.
43 *
44 * This function only tweaks the commands and register offsets. Values are not
45 * filled out.
46 */
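/*
 * As an illustrative example of the encoding above: in gen8_xcs_offsets
 * below, NOP(1) makes the decoder skip one dword in @regs, LRI(11, 0)
 * emits MI_LOAD_REGISTER_IMM(11), and each following REG()/REG16() entry
 * fills in one register offset rebased onto engine->mmio_base, leaving
 * the value dwords untouched. REG16(0x244) encodes as the two bytes
 * 0x81, 0x11: the first carries the high bits plus the "more bytes
 * follow" flag, the second the low bits, and the decoder reassembles
 * them as base + (0x91 << 2) = base + 0x244.
 */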
static void set_offsets(u32 *regs,
48 const u8 *data,
49 const struct intel_engine_cs *engine,
50 bool close)
51 #define NOP(x) (BIT(7) | (x))
52 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53 #define POSTED BIT(0)
54 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55 #define REG16(x) \
56 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57 (((x) >> 2) & 0x7f)
58 #define END 0
59 {
60 const u32 base = engine->mmio_base;
61
62 while (*data) {
63 u8 count, flags;
64
65 if (*data & BIT(7)) { /* skip */
66 count = *data++ & ~BIT(7);
67 regs += count;
68 continue;
69 }
70
71 count = *data & 0x3f;
72 flags = *data >> 6;
73 data++;
74
75 *regs = MI_LOAD_REGISTER_IMM(count);
76 if (flags & POSTED)
77 *regs |= MI_LRI_FORCE_POSTED;
78 if (GRAPHICS_VER(engine->i915) >= 11)
79 *regs |= MI_LRI_LRM_CS_MMIO;
80 regs++;
81
82 GEM_BUG_ON(!count);
83 do {
84 u32 offset = 0;
85 u8 v;
86
87 do {
88 v = *data++;
89 offset <<= 7;
90 offset |= v & ~BIT(7);
91 } while (v & BIT(7));
92
93 regs[0] = base + (offset << 2);
94 regs += 2;
95 } while (--count);
96 }
97
98 if (close) {
99 /* Close the batch; used mainly by live_lrc_layout() */
100 *regs = MI_BATCH_BUFFER_END;
101 if (GRAPHICS_VER(engine->i915) >= 11)
102 *regs |= BIT(0);
103 }
104 }
105
106 static const u8 gen8_xcs_offsets[] = {
107 NOP(1),
108 LRI(11, 0),
109 REG16(0x244),
110 REG(0x034),
111 REG(0x030),
112 REG(0x038),
113 REG(0x03c),
114 REG(0x168),
115 REG(0x140),
116 REG(0x110),
117 REG(0x11c),
118 REG(0x114),
119 REG(0x118),
120
121 NOP(9),
122 LRI(9, 0),
123 REG16(0x3a8),
124 REG16(0x28c),
125 REG16(0x288),
126 REG16(0x284),
127 REG16(0x280),
128 REG16(0x27c),
129 REG16(0x278),
130 REG16(0x274),
131 REG16(0x270),
132
133 NOP(13),
134 LRI(2, 0),
135 REG16(0x200),
136 REG(0x028),
137
138 END
139 };
140
141 static const u8 gen9_xcs_offsets[] = {
142 NOP(1),
143 LRI(14, POSTED),
144 REG16(0x244),
145 REG(0x034),
146 REG(0x030),
147 REG(0x038),
148 REG(0x03c),
149 REG(0x168),
150 REG(0x140),
151 REG(0x110),
152 REG(0x11c),
153 REG(0x114),
154 REG(0x118),
155 REG(0x1c0),
156 REG(0x1c4),
157 REG(0x1c8),
158
159 NOP(3),
160 LRI(9, POSTED),
161 REG16(0x3a8),
162 REG16(0x28c),
163 REG16(0x288),
164 REG16(0x284),
165 REG16(0x280),
166 REG16(0x27c),
167 REG16(0x278),
168 REG16(0x274),
169 REG16(0x270),
170
171 NOP(13),
172 LRI(1, POSTED),
173 REG16(0x200),
174
175 NOP(13),
176 LRI(44, POSTED),
177 REG(0x028),
178 REG(0x09c),
179 REG(0x0c0),
180 REG(0x178),
181 REG(0x17c),
182 REG16(0x358),
183 REG(0x170),
184 REG(0x150),
185 REG(0x154),
186 REG(0x158),
187 REG16(0x41c),
188 REG16(0x600),
189 REG16(0x604),
190 REG16(0x608),
191 REG16(0x60c),
192 REG16(0x610),
193 REG16(0x614),
194 REG16(0x618),
195 REG16(0x61c),
196 REG16(0x620),
197 REG16(0x624),
198 REG16(0x628),
199 REG16(0x62c),
200 REG16(0x630),
201 REG16(0x634),
202 REG16(0x638),
203 REG16(0x63c),
204 REG16(0x640),
205 REG16(0x644),
206 REG16(0x648),
207 REG16(0x64c),
208 REG16(0x650),
209 REG16(0x654),
210 REG16(0x658),
211 REG16(0x65c),
212 REG16(0x660),
213 REG16(0x664),
214 REG16(0x668),
215 REG16(0x66c),
216 REG16(0x670),
217 REG16(0x674),
218 REG16(0x678),
219 REG16(0x67c),
220 REG(0x068),
221
222 END
223 };
224
225 static const u8 gen12_xcs_offsets[] = {
226 NOP(1),
227 LRI(13, POSTED),
228 REG16(0x244),
229 REG(0x034),
230 REG(0x030),
231 REG(0x038),
232 REG(0x03c),
233 REG(0x168),
234 REG(0x140),
235 REG(0x110),
236 REG(0x1c0),
237 REG(0x1c4),
238 REG(0x1c8),
239 REG(0x180),
240 REG16(0x2b4),
241
242 NOP(5),
243 LRI(9, POSTED),
244 REG16(0x3a8),
245 REG16(0x28c),
246 REG16(0x288),
247 REG16(0x284),
248 REG16(0x280),
249 REG16(0x27c),
250 REG16(0x278),
251 REG16(0x274),
252 REG16(0x270),
253
254 END
255 };
256
257 static const u8 dg2_xcs_offsets[] = {
258 NOP(1),
259 LRI(15, POSTED),
260 REG16(0x244),
261 REG(0x034),
262 REG(0x030),
263 REG(0x038),
264 REG(0x03c),
265 REG(0x168),
266 REG(0x140),
267 REG(0x110),
268 REG(0x1c0),
269 REG(0x1c4),
270 REG(0x1c8),
271 REG(0x180),
272 REG16(0x2b4),
273 REG(0x120),
274 REG(0x124),
275
276 NOP(1),
277 LRI(9, POSTED),
278 REG16(0x3a8),
279 REG16(0x28c),
280 REG16(0x288),
281 REG16(0x284),
282 REG16(0x280),
283 REG16(0x27c),
284 REG16(0x278),
285 REG16(0x274),
286 REG16(0x270),
287
288 END
289 };
290
291 static const u8 gen8_rcs_offsets[] = {
292 NOP(1),
293 LRI(14, POSTED),
294 REG16(0x244),
295 REG(0x034),
296 REG(0x030),
297 REG(0x038),
298 REG(0x03c),
299 REG(0x168),
300 REG(0x140),
301 REG(0x110),
302 REG(0x11c),
303 REG(0x114),
304 REG(0x118),
305 REG(0x1c0),
306 REG(0x1c4),
307 REG(0x1c8),
308
309 NOP(3),
310 LRI(9, POSTED),
311 REG16(0x3a8),
312 REG16(0x28c),
313 REG16(0x288),
314 REG16(0x284),
315 REG16(0x280),
316 REG16(0x27c),
317 REG16(0x278),
318 REG16(0x274),
319 REG16(0x270),
320
321 NOP(13),
322 LRI(1, 0),
323 REG(0x0c8),
324
325 END
326 };
327
328 static const u8 gen9_rcs_offsets[] = {
329 NOP(1),
330 LRI(14, POSTED),
331 REG16(0x244),
332 REG(0x34),
333 REG(0x30),
334 REG(0x38),
335 REG(0x3c),
336 REG(0x168),
337 REG(0x140),
338 REG(0x110),
339 REG(0x11c),
340 REG(0x114),
341 REG(0x118),
342 REG(0x1c0),
343 REG(0x1c4),
344 REG(0x1c8),
345
346 NOP(3),
347 LRI(9, POSTED),
348 REG16(0x3a8),
349 REG16(0x28c),
350 REG16(0x288),
351 REG16(0x284),
352 REG16(0x280),
353 REG16(0x27c),
354 REG16(0x278),
355 REG16(0x274),
356 REG16(0x270),
357
358 NOP(13),
359 LRI(1, 0),
360 REG(0xc8),
361
362 NOP(13),
363 LRI(44, POSTED),
364 REG(0x28),
365 REG(0x9c),
366 REG(0xc0),
367 REG(0x178),
368 REG(0x17c),
369 REG16(0x358),
370 REG(0x170),
371 REG(0x150),
372 REG(0x154),
373 REG(0x158),
374 REG16(0x41c),
375 REG16(0x600),
376 REG16(0x604),
377 REG16(0x608),
378 REG16(0x60c),
379 REG16(0x610),
380 REG16(0x614),
381 REG16(0x618),
382 REG16(0x61c),
383 REG16(0x620),
384 REG16(0x624),
385 REG16(0x628),
386 REG16(0x62c),
387 REG16(0x630),
388 REG16(0x634),
389 REG16(0x638),
390 REG16(0x63c),
391 REG16(0x640),
392 REG16(0x644),
393 REG16(0x648),
394 REG16(0x64c),
395 REG16(0x650),
396 REG16(0x654),
397 REG16(0x658),
398 REG16(0x65c),
399 REG16(0x660),
400 REG16(0x664),
401 REG16(0x668),
402 REG16(0x66c),
403 REG16(0x670),
404 REG16(0x674),
405 REG16(0x678),
406 REG16(0x67c),
407 REG(0x68),
408
409 END
410 };
411
412 static const u8 gen11_rcs_offsets[] = {
413 NOP(1),
414 LRI(15, POSTED),
415 REG16(0x244),
416 REG(0x034),
417 REG(0x030),
418 REG(0x038),
419 REG(0x03c),
420 REG(0x168),
421 REG(0x140),
422 REG(0x110),
423 REG(0x11c),
424 REG(0x114),
425 REG(0x118),
426 REG(0x1c0),
427 REG(0x1c4),
428 REG(0x1c8),
429 REG(0x180),
430
431 NOP(1),
432 LRI(9, POSTED),
433 REG16(0x3a8),
434 REG16(0x28c),
435 REG16(0x288),
436 REG16(0x284),
437 REG16(0x280),
438 REG16(0x27c),
439 REG16(0x278),
440 REG16(0x274),
441 REG16(0x270),
442
443 LRI(1, POSTED),
444 REG(0x1b0),
445
446 NOP(10),
447 LRI(1, 0),
448 REG(0x0c8),
449
450 END
451 };
452
453 static const u8 gen12_rcs_offsets[] = {
454 NOP(1),
455 LRI(13, POSTED),
456 REG16(0x244),
457 REG(0x034),
458 REG(0x030),
459 REG(0x038),
460 REG(0x03c),
461 REG(0x168),
462 REG(0x140),
463 REG(0x110),
464 REG(0x1c0),
465 REG(0x1c4),
466 REG(0x1c8),
467 REG(0x180),
468 REG16(0x2b4),
469
470 NOP(5),
471 LRI(9, POSTED),
472 REG16(0x3a8),
473 REG16(0x28c),
474 REG16(0x288),
475 REG16(0x284),
476 REG16(0x280),
477 REG16(0x27c),
478 REG16(0x278),
479 REG16(0x274),
480 REG16(0x270),
481
482 LRI(3, POSTED),
483 REG(0x1b0),
484 REG16(0x5a8),
485 REG16(0x5ac),
486
487 NOP(6),
488 LRI(1, 0),
489 REG(0x0c8),
490 NOP(3 + 9 + 1),
491
492 LRI(51, POSTED),
493 REG16(0x588),
494 REG16(0x588),
495 REG16(0x588),
496 REG16(0x588),
497 REG16(0x588),
498 REG16(0x588),
499 REG(0x028),
500 REG(0x09c),
501 REG(0x0c0),
502 REG(0x178),
503 REG(0x17c),
504 REG16(0x358),
505 REG(0x170),
506 REG(0x150),
507 REG(0x154),
508 REG(0x158),
509 REG16(0x41c),
510 REG16(0x600),
511 REG16(0x604),
512 REG16(0x608),
513 REG16(0x60c),
514 REG16(0x610),
515 REG16(0x614),
516 REG16(0x618),
517 REG16(0x61c),
518 REG16(0x620),
519 REG16(0x624),
520 REG16(0x628),
521 REG16(0x62c),
522 REG16(0x630),
523 REG16(0x634),
524 REG16(0x638),
525 REG16(0x63c),
526 REG16(0x640),
527 REG16(0x644),
528 REG16(0x648),
529 REG16(0x64c),
530 REG16(0x650),
531 REG16(0x654),
532 REG16(0x658),
533 REG16(0x65c),
534 REG16(0x660),
535 REG16(0x664),
536 REG16(0x668),
537 REG16(0x66c),
538 REG16(0x670),
539 REG16(0x674),
540 REG16(0x678),
541 REG16(0x67c),
542 REG(0x068),
543 REG(0x084),
544 NOP(1),
545
546 END
547 };
548
549 static const u8 dg2_rcs_offsets[] = {
550 NOP(1),
551 LRI(15, POSTED),
552 REG16(0x244),
553 REG(0x034),
554 REG(0x030),
555 REG(0x038),
556 REG(0x03c),
557 REG(0x168),
558 REG(0x140),
559 REG(0x110),
560 REG(0x1c0),
561 REG(0x1c4),
562 REG(0x1c8),
563 REG(0x180),
564 REG16(0x2b4),
565 REG(0x120),
566 REG(0x124),
567
568 NOP(1),
569 LRI(9, POSTED),
570 REG16(0x3a8),
571 REG16(0x28c),
572 REG16(0x288),
573 REG16(0x284),
574 REG16(0x280),
575 REG16(0x27c),
576 REG16(0x278),
577 REG16(0x274),
578 REG16(0x270),
579
580 LRI(3, POSTED),
581 REG(0x1b0),
582 REG16(0x5a8),
583 REG16(0x5ac),
584
585 NOP(6),
586 LRI(1, 0),
587 REG(0x0c8),
588
589 END
590 };
591
592 static const u8 mtl_rcs_offsets[] = {
593 NOP(1),
594 LRI(15, POSTED),
595 REG16(0x244),
596 REG(0x034),
597 REG(0x030),
598 REG(0x038),
599 REG(0x03c),
600 REG(0x168),
601 REG(0x140),
602 REG(0x110),
603 REG(0x1c0),
604 REG(0x1c4),
605 REG(0x1c8),
606 REG(0x180),
607 REG16(0x2b4),
608 REG(0x120),
609 REG(0x124),
610
611 NOP(1),
612 LRI(9, POSTED),
613 REG16(0x3a8),
614 REG16(0x28c),
615 REG16(0x288),
616 REG16(0x284),
617 REG16(0x280),
618 REG16(0x27c),
619 REG16(0x278),
620 REG16(0x274),
621 REG16(0x270),
622
623 NOP(2),
624 LRI(2, POSTED),
625 REG16(0x5a8),
626 REG16(0x5ac),
627
628 NOP(6),
629 LRI(1, 0),
630 REG(0x0c8),
631
632 END
633 };
634
635 #undef END
636 #undef REG16
637 #undef REG
638 #undef LRI
639 #undef NOP
640
static const u8 *reg_offsets(const struct intel_engine_cs *engine)
642 {
643 /*
644 * The gen12+ lists only have the registers we program in the basic
645 * default state. We rely on the context image using relative
 * addressing to automatically fix up the register state between the
 * physical engines for a virtual engine.
648 */
649 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
650 !intel_engine_has_relative_mmio(engine));
651
652 if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
653 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
654 return mtl_rcs_offsets;
655 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
656 return dg2_rcs_offsets;
657 else if (GRAPHICS_VER(engine->i915) >= 12)
658 return gen12_rcs_offsets;
659 else if (GRAPHICS_VER(engine->i915) >= 11)
660 return gen11_rcs_offsets;
661 else if (GRAPHICS_VER(engine->i915) >= 9)
662 return gen9_rcs_offsets;
663 else
664 return gen8_rcs_offsets;
665 } else {
666 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
667 return dg2_xcs_offsets;
668 else if (GRAPHICS_VER(engine->i915) >= 12)
669 return gen12_xcs_offsets;
670 else if (GRAPHICS_VER(engine->i915) >= 9)
671 return gen9_xcs_offsets;
672 else
673 return gen8_xcs_offsets;
674 }
675 }
676
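/*
 * The lrc_ring_*() helpers below return the dword index of a register's
 * MI_LOAD_REGISTER_IMM slot within the context image for this engine and
 * platform (callers read/write the value at index + 1), or -1 if that
 * register is not part of the default context layout.
 */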
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
678 {
679 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
680 return 0x70;
681 else if (GRAPHICS_VER(engine->i915) >= 12)
682 return 0x60;
683 else if (GRAPHICS_VER(engine->i915) >= 9)
684 return 0x54;
685 else if (engine->class == RENDER_CLASS)
686 return 0x58;
687 else
688 return -1;
689 }
690
static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
692 {
693 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
694 return 0x80;
695 else if (GRAPHICS_VER(engine->i915) >= 12)
696 return 0x70;
697 else if (GRAPHICS_VER(engine->i915) >= 9)
698 return 0x64;
699 else if (GRAPHICS_VER(engine->i915) >= 8 &&
700 engine->class == RENDER_CLASS)
701 return 0xc4;
702 else
703 return -1;
704 }
705
static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
707 {
708 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
709 return 0x84;
710 else if (GRAPHICS_VER(engine->i915) >= 12)
711 return 0x74;
712 else if (GRAPHICS_VER(engine->i915) >= 9)
713 return 0x68;
714 else if (engine->class == RENDER_CLASS)
715 return 0xd8;
716 else
717 return -1;
718 }
719
static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
721 {
722 if (GRAPHICS_VER(engine->i915) >= 12)
723 return 0x12;
724 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
725 return 0x18;
726 else
727 return -1;
728 }
729
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
731 {
732 int x;
733
734 x = lrc_ring_wa_bb_per_ctx(engine);
735 if (x < 0)
736 return x;
737
738 return x + 2;
739 }
740
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
742 {
743 int x;
744
745 x = lrc_ring_indirect_ptr(engine);
746 if (x < 0)
747 return x;
748
749 return x + 2;
750 }
751
static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
753 {
754 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
755 /*
756 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
757 * simply to match the RCS context image layout.
758 */
759 return 0xc6;
760 else if (engine->class != RENDER_CLASS)
761 return -1;
762 else if (GRAPHICS_VER(engine->i915) >= 12)
763 return 0xb6;
764 else if (GRAPHICS_VER(engine->i915) >= 11)
765 return 0xaa;
766 else
767 return -1;
768 }
769
770 static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
772 {
773 if (GRAPHICS_VER(engine->i915) >= 12)
774 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
775 else if (GRAPHICS_VER(engine->i915) >= 11)
776 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
777 else if (GRAPHICS_VER(engine->i915) >= 9)
778 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
779 else if (GRAPHICS_VER(engine->i915) >= 8)
780 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
781
782 GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
783
784 return 0;
785 }
786
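/*
 * Helpers to point the PER_CTX_BB and INDIRECT_CTX slots in the context
 * image at a workaround batch in the GGTT; the indirect variant also
 * encodes the batch size in cachelines, hence the alignment assert.
 */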
787 static void
lrc_setup_bb_per_ctx(u32 *regs,
789 const struct intel_engine_cs *engine,
790 u32 ctx_bb_ggtt_addr)
791 {
792 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
793 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
794 ctx_bb_ggtt_addr |
795 PER_CTX_BB_FORCE |
796 PER_CTX_BB_VALID;
797 }
798
799 static void
lrc_setup_indirect_ctx(u32 *regs,
801 const struct intel_engine_cs *engine,
802 u32 ctx_bb_ggtt_addr,
803 u32 size)
804 {
805 GEM_BUG_ON(!size);
806 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
807 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
808 regs[lrc_ring_indirect_ptr(engine) + 1] =
809 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
810
811 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
812 regs[lrc_ring_indirect_offset(engine) + 1] =
813 lrc_ring_indirect_offset_default(engine) << 6;
814 }
815
static bool ctx_needs_runalone(const struct intel_context *ce)
817 {
818 struct i915_gem_context *gem_ctx;
819 bool ctx_is_protected = false;
820
821 /*
822 * Wa_14019159160 - Case 2.
823 * On some platforms, protected contexts require setting
824 * the LRC run-alone bit or else the encryption/decryption will not happen.
825 * NOTE: Case 2 only applies to PXP use-case of said workaround.
826 */
827 if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
828 (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
829 rcu_read_lock();
830 gem_ctx = rcu_dereference(ce->gem_context);
831 if (gem_ctx)
832 ctx_is_protected = gem_ctx->uses_protected_content;
833 rcu_read_unlock();
834 }
835
836 return ctx_is_protected;
837 }
838
static void init_common_regs(u32 * const regs,
840 const struct intel_context *ce,
841 const struct intel_engine_cs *engine,
842 bool inhibit)
843 {
844 u32 ctl;
845 int loc;
846
847 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
848 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
849 if (inhibit)
850 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
851 if (GRAPHICS_VER(engine->i915) < 11)
852 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
853 CTX_CTRL_RS_CTX_ENABLE);
854 /* Wa_14019159160 - Case 2.*/
855 if (ctx_needs_runalone(ce))
856 ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
857 regs[CTX_CONTEXT_CONTROL] = ctl;
858
859 regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
860
861 loc = lrc_ring_bb_offset(engine);
862 if (loc != -1)
863 regs[loc + 1] = 0;
864 }
865
static void init_wa_bb_regs(u32 * const regs,
867 const struct intel_engine_cs *engine)
868 {
869 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
870
871 if (wa_ctx->per_ctx.size) {
872 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
873
874 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
875 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
876 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
877 }
878
879 if (wa_ctx->indirect_ctx.size) {
880 lrc_setup_indirect_ctx(regs, engine,
881 i915_ggtt_offset(wa_ctx->vma) +
882 wa_ctx->indirect_ctx.offset,
883 wa_ctx->indirect_ctx.size);
884 }
885 }
886
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
888 {
889 if (i915_vm_is_4lvl(&ppgtt->vm)) {
890 /* 64b PPGTT (48bit canonical)
891 * PDP0_DESCRIPTOR contains the base address to PML4 and
892 * other PDP Descriptors are ignored.
893 */
894 ASSIGN_CTX_PML4(ppgtt, regs);
895 } else {
896 ASSIGN_CTX_PDP(ppgtt, regs, 3);
897 ASSIGN_CTX_PDP(ppgtt, regs, 2);
898 ASSIGN_CTX_PDP(ppgtt, regs, 1);
899 ASSIGN_CTX_PDP(ppgtt, regs, 0);
900 }
901 }
902
static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
904 {
905 if (i915_is_ggtt(vm))
906 return i915_vm_to_ggtt(vm)->alias;
907 else
908 return i915_vm_to_ppgtt(vm);
909 }
910
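/*
 * Clear any stale STOP_RING request from the context image copy of
 * RING_MI_MODE; the register is masked, so the matching bit in the upper
 * half must be set for the cleared value to take effect on restore.
 */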
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
912 {
913 int x;
914
915 x = lrc_ring_mi_mode(engine);
916 if (x != -1) {
917 regs[x + 1] &= ~STOP_RING;
918 regs[x + 1] |= STOP_RING << 16;
919 }
920 }
921
static void __lrc_init_regs(u32 *regs,
923 const struct intel_context *ce,
924 const struct intel_engine_cs *engine,
925 bool inhibit)
926 {
927 /*
928 * A context is actually a big batch buffer with several
929 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
930 * values we are setting here are only for the first context restore:
931 * on a subsequent save, the GPU will recreate this batchbuffer with new
932 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
933 * we are not initializing here).
934 *
 * Must be kept consistent with virtual_update_register_offsets().
936 */
937
938 if (inhibit)
939 memset(regs, 0, PAGE_SIZE);
940
941 set_offsets(regs, reg_offsets(engine), engine, inhibit);
942
943 init_common_regs(regs, ce, engine, inhibit);
944 init_ppgtt_regs(regs, vm_alias(ce->vm));
945
946 init_wa_bb_regs(regs, engine);
947
948 __reset_stop_ring(regs, engine);
949 }
950
void lrc_init_regs(const struct intel_context *ce,
952 const struct intel_engine_cs *engine,
953 bool inhibit)
954 {
955 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
956 }
957
void lrc_reset_regs(const struct intel_context *ce,
959 const struct intel_engine_cs *engine)
960 {
961 __reset_stop_ring(ce->lrc_reg_state, engine);
962 }
963
964 static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
966 {
967 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
968 return;
969
970 vaddr += engine->context_size;
971
972 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
973 }
974
975 static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
977 {
978 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
979 return;
980
981 vaddr += engine->context_size;
982
983 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
984 drm_err_once(&engine->i915->drm,
985 "%s context redzone overwritten!\n",
986 engine->name);
987 }
988
static u32 context_wa_bb_offset(const struct intel_context *ce)
990 {
991 return PAGE_SIZE * ce->wa_bb_page;
992 }
993
994 /*
995 * per_ctx below determines which WABB section is used.
996 * When true, the function returns the location of the
997 * PER_CTX_BB. When false, the function returns the
998 * location of the INDIRECT_CTX.
999 */
static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
1001 {
1002 void *ptr;
1003
1004 GEM_BUG_ON(!ce->wa_bb_page);
1005
1006 ptr = ce->lrc_reg_state;
1007 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1008 ptr += context_wa_bb_offset(ce);
1009 ptr += per_ctx ? PAGE_SIZE : 0;
1010
1011 return ptr;
1012 }
1013
void lrc_init_state(struct intel_context *ce,
1015 struct intel_engine_cs *engine,
1016 void *state)
1017 {
1018 bool inhibit = true;
1019
1020 set_redzone(state, engine);
1021
1022 if (ce->default_state) {
1023 shmem_read(ce->default_state, 0, state, engine->context_size);
1024 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
1025 inhibit = false;
1026 }
1027
1028 /* Clear the ppHWSP (inc. per-context counters) */
1029 memset(state, 0, PAGE_SIZE);
1030
1031 /* Clear the indirect wa and storage */
1032 if (ce->wa_bb_page)
1033 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1034
1035 /*
1036 * The second page of the context object contains some registers which
1037 * must be set up prior to the first execution.
1038 */
1039 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1040 }
1041
u32 lrc_indirect_bb(const struct intel_context *ce)
1043 {
1044 return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1045 }
1046
static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1048 {
1049 /* If predication is active, this will be noop'ed */
1050 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1051 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1052 *cs++ = 0;
1053 *cs++ = 0; /* No predication */
1054
1055 /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1056 *cs++ = MI_BATCH_BUFFER_END | BIT(15);
1057 *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1058
1059 /* Instructions are no longer predicated (disabled), we can proceed */
1060 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1061 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1062 *cs++ = 0;
1063 *cs++ = 1; /* enable predication before the next BB */
1064
1065 *cs++ = MI_BATCH_BUFFER_END;
1066 GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1067
1068 return cs;
1069 }
1070
1071 static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1073 {
1074 struct drm_i915_gem_object *obj;
1075 struct i915_vma *vma;
1076 u32 context_size;
1077
1078 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1079
1080 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1081 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1082
1083 if (GRAPHICS_VER(engine->i915) >= 12) {
1084 ce->wa_bb_page = context_size / PAGE_SIZE;
1085 /* INDIRECT_CTX and PER_CTX_BB need separate pages. */
1086 context_size += PAGE_SIZE * 2;
1087 }
1088
1089 if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1090 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1091 context_size += PARENT_SCRATCH_SIZE;
1092 }
1093
1094 obj = i915_gem_object_create_lmem(engine->i915, context_size,
1095 I915_BO_ALLOC_PM_VOLATILE);
1096 if (IS_ERR(obj)) {
1097 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1098 if (IS_ERR(obj))
1099 return ERR_CAST(obj);
1100
1101 /*
1102 * Wa_22016122933: For Media version 13.0, all Media GT shared
1103 * memory needs to be mapped as WC on CPU side and UC (PAT
1104 * index 2) on GPU side.
1105 */
1106 if (intel_gt_needs_wa_22016122933(engine->gt))
1107 i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1108 }
1109
1110 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1111 if (IS_ERR(vma)) {
1112 i915_gem_object_put(obj);
1113 return vma;
1114 }
1115
1116 return vma;
1117 }
1118
1119 static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1121 {
1122 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1123
1124 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1125 }
1126
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1128 {
1129 struct intel_ring *ring;
1130 struct i915_vma *vma;
1131 int err;
1132
1133 GEM_BUG_ON(ce->state);
1134
1135 if (!intel_context_has_own_state(ce))
1136 ce->default_state = engine->default_state;
1137
1138 vma = __lrc_alloc_state(ce, engine);
1139 if (IS_ERR(vma))
1140 return PTR_ERR(vma);
1141
1142 ring = intel_engine_create_ring(engine, ce->ring_size);
1143 if (IS_ERR(ring)) {
1144 err = PTR_ERR(ring);
1145 goto err_vma;
1146 }
1147
1148 if (!page_mask_bits(ce->timeline)) {
1149 struct intel_timeline *tl;
1150
1151 /*
1152 * Use the static global HWSP for the kernel context, and
1153 * a dynamically allocated cacheline for everyone else.
1154 */
1155 if (unlikely(ce->timeline))
1156 tl = pinned_timeline(ce, engine);
1157 else
1158 tl = intel_timeline_create(engine->gt);
1159 if (IS_ERR(tl)) {
1160 err = PTR_ERR(tl);
1161 goto err_ring;
1162 }
1163
1164 ce->timeline = tl;
1165 }
1166
1167 ce->ring = ring;
1168 ce->state = vma;
1169
1170 return 0;
1171
1172 err_ring:
1173 intel_ring_put(ring);
1174 err_vma:
1175 i915_vma_put(vma);
1176 return err;
1177 }
1178
void lrc_reset(struct intel_context *ce)
1180 {
1181 GEM_BUG_ON(!intel_context_is_pinned(ce));
1182
1183 intel_ring_reset(ce->ring, ce->ring->emit);
1184
1185 /* Scrub away the garbage */
1186 lrc_init_regs(ce, ce->engine, true);
1187 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1188 }
1189
1190 int
lrc_pre_pin(struct intel_context *ce,
1192 struct intel_engine_cs *engine,
1193 struct i915_gem_ww_ctx *ww,
1194 void **vaddr)
1195 {
1196 GEM_BUG_ON(!ce->state);
1197 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1198
1199 *vaddr = i915_gem_object_pin_map(ce->state->obj,
1200 intel_gt_coherent_map_type(ce->engine->gt,
1201 ce->state->obj,
1202 false) |
1203 I915_MAP_OVERRIDE);
1204
1205 return PTR_ERR_OR_ZERO(*vaddr);
1206 }
1207
1208 int
lrc_pin(struct intel_context *ce,
1210 struct intel_engine_cs *engine,
1211 void *vaddr)
1212 {
1213 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1214
1215 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1216 lrc_init_state(ce, engine, vaddr);
1217
1218 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1219 return 0;
1220 }
1221
void lrc_unpin(struct intel_context *ce)
1223 {
1224 if (unlikely(ce->parallel.last_rq)) {
1225 i915_request_put(ce->parallel.last_rq);
1226 ce->parallel.last_rq = NULL;
1227 }
1228 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1229 ce->engine);
1230 }
1231
void lrc_post_unpin(struct intel_context *ce)
1233 {
1234 i915_gem_object_unpin_map(ce->state->obj);
1235 }
1236
void lrc_fini(struct intel_context *ce)
1238 {
1239 if (!ce->state)
1240 return;
1241
1242 intel_ring_put(fetch_and_zero(&ce->ring));
1243 i915_vma_put(fetch_and_zero(&ce->state));
1244 }
1245
void lrc_destroy(struct kref *kref)
1247 {
1248 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1249
1250 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1251 GEM_BUG_ON(intel_context_is_pinned(ce));
1252
1253 lrc_fini(ce);
1254
1255 intel_context_fini(ce);
1256 intel_context_free(ce);
1257 }
1258
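/*
 * Read the saved CTX_TIMESTAMP value from the context image into CS GPR0,
 * then copy it into RING_CTX_TIMESTAMP via MI_LOAD_REGISTER_REG (the copy
 * is issued twice), so the context resumes with the timestamp it saved.
 */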
1259 static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1261 {
1262 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1263 MI_SRM_LRM_GLOBAL_GTT |
1264 MI_LRI_LRM_CS_MMIO;
1265 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1266 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1267 CTX_TIMESTAMP * sizeof(u32);
1268 *cs++ = 0;
1269
1270 *cs++ = MI_LOAD_REGISTER_REG |
1271 MI_LRR_SOURCE_CS_MMIO |
1272 MI_LRI_LRM_CS_MMIO;
1273 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1274 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1275
1276 *cs++ = MI_LOAD_REGISTER_REG |
1277 MI_LRR_SOURCE_CS_MMIO |
1278 MI_LRI_LRM_CS_MMIO;
1279 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1280 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1281
1282 return cs;
1283 }
1284
1285 static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1287 {
1288 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1289
1290 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1291 MI_SRM_LRM_GLOBAL_GTT |
1292 MI_LRI_LRM_CS_MMIO;
1293 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1294 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1295 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1296 *cs++ = 0;
1297
1298 return cs;
1299 }
1300
1301 static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1303 {
1304 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1305
1306 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1307 MI_SRM_LRM_GLOBAL_GTT |
1308 MI_LRI_LRM_CS_MMIO;
1309 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1310 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1311 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1312 *cs++ = 0;
1313
1314 *cs++ = MI_LOAD_REGISTER_REG |
1315 MI_LRR_SOURCE_CS_MMIO |
1316 MI_LRI_LRM_CS_MMIO;
1317 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1318 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1319
1320 return cs;
1321 }
1322
1323 /*
1324 * The bspec's tuning guide asks us to program a vertical watermark value of
1325 * 0x3FF. However this register is not saved/restored properly by the
1326 * hardware, so we're required to apply the desired value via INDIRECT_CTX
1327 * batch buffer to ensure the value takes effect properly. All other bits
1328 * in this register should remain at 0 (the hardware default).
1329 */
1330 static u32 *
dg2_emit_draw_watermark_setting(u32 *cs)
1332 {
1333 *cs++ = MI_LOAD_REGISTER_IMM(1);
1334 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1335 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1336
1337 return cs;
1338 }
1339
1340 static u32 *
gen12_invalidate_state_cache(u32 *cs)
1342 {
1343 *cs++ = MI_LOAD_REGISTER_IMM(1);
1344 *cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
1345 *cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1346 return cs;
1347 }
1348
1349 static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1351 {
1352 cs = gen12_emit_timestamp_wa(ce, cs);
1353 cs = gen12_emit_cmd_buf_wa(ce, cs);
1354 cs = gen12_emit_restore_scratch(ce, cs);
1355
1356 /* Wa_16013000631:dg2 */
1357 if (IS_DG2_G11(ce->engine->i915))
1358 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1359
1360 cs = gen12_emit_aux_table_inv(ce->engine, cs);
1361
1362 /* Wa_18022495364 */
1363 if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
1364 cs = gen12_invalidate_state_cache(cs);
1365
1366 /* Wa_16014892111 */
1367 if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1368 IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
1369 IS_DG2(ce->engine->i915))
1370 cs = dg2_emit_draw_watermark_setting(cs);
1371
1372 return cs;
1373 }
1374
1375 static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1377 {
1378 cs = gen12_emit_timestamp_wa(ce, cs);
1379 cs = gen12_emit_restore_scratch(ce, cs);
1380
1381 /* Wa_16013000631:dg2 */
1382 if (IS_DG2_G11(ce->engine->i915))
1383 if (ce->engine->class == COMPUTE_CLASS)
1384 cs = gen8_emit_pipe_control(cs,
1385 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1386 0);
1387
1388 return gen12_emit_aux_table_inv(ce->engine, cs);
1389 }
1390
static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
1392 {
1393 struct intel_gt *gt = ce->engine->gt;
1394 int mocs = gt->mocs.uc_index << 1;
1395
1396 /**
1397 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
1398 * main copy engine arbitration into round robin mode. We
1399 * additionally need to submit the following WABB blt command
1400 * to produce 4 subblits with each subblit generating 0 byte
1401 * write requests as WABB:
1402 *
1403 * XY_FASTCOLOR_BLT
1404 * BG0 -> 5100000E
1405 * BG1 -> 0000003F (Dest pitch)
1406 * BG2 -> 00000000 (X1, Y1) = (0, 0)
1407 * BG3 -> 00040001 (X2, Y2) = (1, 4)
1408 * BG4 -> scratch
1409 * BG5 -> scratch
1410 * BG6-12 -> 00000000
1411 * BG13 -> 20004004 (Surf. Width= 2,Surf. Height = 5 )
1412 * BG14 -> 00000010 (Qpitch = 4)
1413 * BG15 -> 00000000
1414 */
1415 *cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
1416 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
1417 *cs++ = 0;
1418 *cs++ = 4 << 16 | 1;
1419 *cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1420 *cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1421 *cs++ = 0;
1422 *cs++ = 0;
1423 *cs++ = 0;
1424 *cs++ = 0;
1425 *cs++ = 0;
1426 *cs++ = 0;
1427 *cs++ = 0;
1428 *cs++ = 0x20004004;
1429 *cs++ = 0x10;
1430 *cs++ = 0;
1431
1432 return cs;
1433 }
1434
1435 static u32 *
xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
1437 {
1438 /* Wa_16018031267, Wa_16018063123 */
1439 if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
1440 cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
1441
1442 return cs;
1443 }
1444
1445 static void
setup_per_ctx_bb(const struct intel_context *ce,
1447 const struct intel_engine_cs *engine,
1448 u32 *(*emit)(const struct intel_context *, u32 *))
1449 {
1450 /* Place PER_CTX_BB on next page after INDIRECT_CTX */
1451 u32 * const start = context_wabb(ce, true);
1452 u32 *cs;
1453
1454 cs = emit(ce, start);
1455
1456 /* PER_CTX_BB must manually terminate */
1457 *cs++ = MI_BATCH_BUFFER_END;
1458
1459 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1460 lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
1461 lrc_indirect_bb(ce) + PAGE_SIZE);
1462 }
1463
1464 static void
setup_indirect_ctx_bb(const struct intel_context *ce,
1466 const struct intel_engine_cs *engine,
1467 u32 *(*emit)(const struct intel_context *, u32 *))
1468 {
1469 u32 * const start = context_wabb(ce, false);
1470 u32 *cs;
1471
1472 cs = emit(ce, start);
1473 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1474 while ((unsigned long)cs % CACHELINE_BYTES)
1475 *cs++ = MI_NOOP;
1476
1477 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1478 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1479
1480 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1481 lrc_indirect_bb(ce),
1482 (cs - start) * sizeof(*cs));
1483 }
1484
1485 /*
1486 * The context descriptor encodes various attributes of a context,
1487 * including its GTT address and some flags. Because it's fairly
1488 * expensive to calculate, we'll just do it once and cache the result,
1489 * which remains valid until the context is unpinned.
1490 *
1491 * This is what a descriptor looks like, from LSB to MSB::
1492 *
1493 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
1494 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
1495 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
1496 * bits 53-54: mbz, reserved for use by hardware
1497 * bits 55-63: group ID, currently unused and set to 0
1498 *
1499 * Starting from Gen11, the upper dword of the descriptor has a new format:
1500 *
1501 * bits 32-36: reserved
1502 * bits 37-47: SW context ID
 * bits 48-53: engine instance
1504 * bit 54: mbz, reserved for use by hardware
1505 * bits 55-60: SW counter
1506 * bits 61-63: engine class
1507 *
1508 * On Xe_HP, the upper dword of the descriptor has a new format:
1509 *
1510 * bits 32-37: virtual function number
1511 * bit 38: mbz, reserved for use by hardware
1512 * bits 39-54: SW context ID
1513 * bits 55-57: reserved
1514 * bits 58-63: SW counter
1515 *
1516 * engine info, SW context ID and SW counter need to form a unique number
1517 * (Context ID) per lrc.
1518 */
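/*
 * lrc_descriptor() below only assembles the lower dword: the GGTT offset
 * of the context state ORed with GEN8_CTX_VALID, GEN8_CTX_PRIVILEGE and
 * the addressing mode; the upper dword described above is filled in
 * elsewhere by the submission code.
 */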
static u32 lrc_descriptor(const struct intel_context *ce)
1520 {
1521 u32 desc;
1522
1523 desc = INTEL_LEGACY_32B_CONTEXT;
1524 if (i915_vm_is_4lvl(ce->vm))
1525 desc = INTEL_LEGACY_64B_CONTEXT;
1526 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1527
1528 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1529 if (GRAPHICS_VER(ce->vm->i915) == 8)
1530 desc |= GEN8_CTX_L3LLC_COHERENT;
1531
1532 return i915_ggtt_offset(ce->state) | desc;
1533 }
1534
u32 lrc_update_regs(const struct intel_context *ce,
1536 const struct intel_engine_cs *engine,
1537 u32 head)
1538 {
1539 struct intel_ring *ring = ce->ring;
1540 u32 *regs = ce->lrc_reg_state;
1541
1542 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1543 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1544
1545 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1546 regs[CTX_RING_HEAD] = head;
1547 regs[CTX_RING_TAIL] = ring->tail;
1548 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1549
1550 /* RPCS */
1551 if (engine->class == RENDER_CLASS) {
1552 regs[CTX_R_PWR_CLK_STATE] =
1553 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1554
1555 i915_oa_init_reg_state(ce, engine);
1556 }
1557
1558 if (ce->wa_bb_page) {
1559 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1560
1561 fn = gen12_emit_indirect_ctx_xcs;
1562 if (ce->engine->class == RENDER_CLASS)
1563 fn = gen12_emit_indirect_ctx_rcs;
1564
1565 /* Mutually exclusive wrt to global indirect bb */
1566 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1567 setup_indirect_ctx_bb(ce, engine, fn);
1568 setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
1569 }
1570
1571 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1572 }
1573
void lrc_update_offsets(struct intel_context *ce,
1575 struct intel_engine_cs *engine)
1576 {
1577 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1578 }
1579
void lrc_check_regs(const struct intel_context *ce,
1581 const struct intel_engine_cs *engine,
1582 const char *when)
1583 {
1584 const struct intel_ring *ring = ce->ring;
1585 u32 *regs = ce->lrc_reg_state;
1586 bool valid = true;
1587 int x;
1588
1589 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1590 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1591 engine->name,
1592 regs[CTX_RING_START],
1593 i915_ggtt_offset(ring->vma));
1594 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1595 valid = false;
1596 }
1597
1598 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1599 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1600 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1601 engine->name,
1602 regs[CTX_RING_CTL],
1603 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1604 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1605 valid = false;
1606 }
1607
1608 x = lrc_ring_mi_mode(engine);
1609 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1610 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1611 engine->name, regs[x + 1]);
1612 regs[x + 1] &= ~STOP_RING;
1613 regs[x + 1] |= STOP_RING << 16;
1614 valid = false;
1615 }
1616
1617 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1618 }
1619
1620 /*
1621 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1622 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1623 * but there is a slight complication as this is applied in WA batch where the
1624 * values are only initialized once so we cannot take register value at the
1625 * beginning and reuse it further; hence we save its value to memory, upload a
1626 * constant value with bit21 set and then we restore it back with the saved value.
1627 * To simplify the WA, a constant value is formed by using the default value
1628 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
1630 * use additional instructions that read the actual value of the register
1631 * at that time and set our bit of interest but it makes the WA complicated.
1632 *
1633 * This WA is also required for Gen9 so extracting as a function avoids
1634 * code duplication.
1635 */
1636 static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1638 {
1639 /* NB no one else is allowed to scribble over scratch + 256! */
1640 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1641 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1642 *batch++ = intel_gt_scratch_offset(engine->gt,
1643 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1644 *batch++ = 0;
1645
1646 *batch++ = MI_LOAD_REGISTER_IMM(1);
1647 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1648 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1649
1650 batch = gen8_emit_pipe_control(batch,
1651 PIPE_CONTROL_CS_STALL |
1652 PIPE_CONTROL_DC_FLUSH_ENABLE,
1653 0);
1654
1655 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1656 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1657 *batch++ = intel_gt_scratch_offset(engine->gt,
1658 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1659 *batch++ = 0;
1660
1661 return batch;
1662 }
1663
1664 /*
1665 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1666 * initialized at the beginning and shared across all contexts but this field
1667 * helps us to have multiple batches at different offsets and select them based
 * on a criterion. At the moment this batch always starts at the beginning of the page
1669 * and at this point we don't have multiple wa_ctx batch buffers.
1670 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
1673 *
1674 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1675 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
 * together make a complete batch buffer.
1678 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1680 {
1681 /* WaDisableCtxRestoreArbitration:bdw,chv */
1682 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1683
1684 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1685 if (IS_BROADWELL(engine->i915))
1686 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1687
1688 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1689 /* Actual scratch location is at 128 bytes offset */
1690 batch = gen8_emit_pipe_control(batch,
1691 PIPE_CONTROL_FLUSH_L3 |
1692 PIPE_CONTROL_STORE_DATA_INDEX |
1693 PIPE_CONTROL_CS_STALL |
1694 PIPE_CONTROL_QW_WRITE,
1695 LRC_PPHWSP_SCRATCH_ADDR);
1696
1697 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1698
1699 /* Pad to end of cacheline */
1700 while ((unsigned long)batch % CACHELINE_BYTES)
1701 *batch++ = MI_NOOP;
1702
1703 /*
1704 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1705 * execution depends on the length specified in terms of cache lines
1706 * in the register CTX_RCS_INDIRECT_CTX
1707 */
1708
1709 return batch;
1710 }
1711
1712 struct lri {
1713 i915_reg_t reg;
1714 u32 value;
1715 };
1716
static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1718 {
1719 GEM_BUG_ON(!count || count > 63);
1720
1721 *batch++ = MI_LOAD_REGISTER_IMM(count);
1722 do {
1723 *batch++ = i915_mmio_reg_offset(lri->reg);
1724 *batch++ = lri->value;
1725 } while (lri++, --count);
1726 *batch++ = MI_NOOP;
1727
1728 return batch;
1729 }
1730
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1732 {
1733 static const struct lri lri[] = {
1734 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1735 {
1736 COMMON_SLICE_CHICKEN2,
1737 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1738 0),
1739 },
1740
1741 /* BSpec: 11391 */
1742 {
1743 FF_SLICE_CHICKEN,
1744 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1745 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1746 },
1747
1748 /* BSpec: 11299 */
1749 {
1750 _3D_CHICKEN3,
1751 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1752 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1753 }
1754 };
1755
1756 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1757
1758 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1759 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1760
1761 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1762 batch = gen8_emit_pipe_control(batch,
1763 PIPE_CONTROL_FLUSH_L3 |
1764 PIPE_CONTROL_STORE_DATA_INDEX |
1765 PIPE_CONTROL_CS_STALL |
1766 PIPE_CONTROL_QW_WRITE,
1767 LRC_PPHWSP_SCRATCH_ADDR);
1768
1769 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1770
1771 /* WaMediaPoolStateCmdInWABB:bxt,glk */
1772 if (HAS_POOLED_EU(engine->i915)) {
1773 /*
1774 * EU pool configuration is setup along with golden context
1775 * during context initialization. This value depends on
1776 * device type (2x6 or 3x6) and needs to be updated based
1777 * on which subslice is disabled especially for 2x6
1778 * devices, however it is safe to load default
1779 * configuration of 3x6 device instead of masking off
1780 * corresponding bits because HW ignores bits of a disabled
1781 * subslice and drops down to appropriate config. Please
1782 * see render_state_setup() in i915_gem_render_state.c for
1783 * possible configurations, to avoid duplication they are
1784 * not shown here again.
1785 */
1786 *batch++ = GEN9_MEDIA_POOL_STATE;
1787 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1788 *batch++ = 0x00777000;
1789 *batch++ = 0;
1790 *batch++ = 0;
1791 *batch++ = 0;
1792 }
1793
1794 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1795
1796 /* Pad to end of cacheline */
1797 while ((unsigned long)batch % CACHELINE_BYTES)
1798 *batch++ = MI_NOOP;
1799
1800 return batch;
1801 }
1802
1803 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1804
static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1806 {
1807 struct drm_i915_gem_object *obj;
1808 struct i915_vma *vma;
1809 int err;
1810
1811 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1812 if (IS_ERR(obj))
1813 return PTR_ERR(obj);
1814
1815 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1816 if (IS_ERR(vma)) {
1817 err = PTR_ERR(vma);
1818 goto err;
1819 }
1820
1821 engine->wa_ctx.vma = vma;
1822 return 0;
1823
1824 err:
1825 i915_gem_object_put(obj);
1826 return err;
1827 }
1828
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1830 {
1831 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1832 }
1833
1834 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1835
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1837 {
1838 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1839 struct i915_wa_ctx_bb *wa_bb[] = {
1840 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1841 };
1842 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1843 struct i915_gem_ww_ctx ww;
1844 void *batch, *batch_ptr;
1845 unsigned int i;
1846 int err;
1847
1848 if (GRAPHICS_VER(engine->i915) >= 11 ||
1849 !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1850 return;
1851
1852 if (GRAPHICS_VER(engine->i915) == 9) {
1853 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1854 wa_bb_fn[1] = NULL;
1855 } else if (GRAPHICS_VER(engine->i915) == 8) {
1856 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1857 wa_bb_fn[1] = NULL;
1858 }
1859
1860 err = lrc_create_wa_ctx(engine);
1861 if (err) {
1862 /*
 * We continue even if we fail to initialize the WA batch
 * because we only expect rare glitches but nothing
 * critical to prevent us from using the GPU
1866 */
1867 drm_err(&engine->i915->drm,
1868 "Ignoring context switch w/a allocation error:%d\n",
1869 err);
1870 return;
1871 }
1872
1873 if (!engine->wa_ctx.vma)
1874 return;
1875
1876 i915_gem_ww_ctx_init(&ww, true);
1877 retry:
1878 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1879 if (!err)
1880 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1881 if (err)
1882 goto err;
1883
1884 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1885 if (IS_ERR(batch)) {
1886 err = PTR_ERR(batch);
1887 goto err_unpin;
1888 }
1889
1890 /*
1891 * Emit the two workaround batch buffers, recording the offset from the
1892 * start of the workaround batch buffer object for each and their
1893 * respective sizes.
1894 */
1895 batch_ptr = batch;
1896 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1897 wa_bb[i]->offset = batch_ptr - batch;
1898 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1899 CACHELINE_BYTES))) {
1900 err = -EINVAL;
1901 break;
1902 }
1903 if (wa_bb_fn[i])
1904 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1905 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1906 }
1907 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1908
1909 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1910 __i915_gem_object_release_map(wa_ctx->vma->obj);
1911
1912 /* Verify that we can handle failure to setup the wa_ctx */
1913 if (!err)
1914 err = i915_inject_probe_error(engine->i915, -ENODEV);
1915
1916 err_unpin:
1917 if (err)
1918 i915_vma_unpin(wa_ctx->vma);
1919 err:
1920 if (err == -EDEADLK) {
1921 err = i915_gem_ww_ctx_backoff(&ww);
1922 if (!err)
1923 goto retry;
1924 }
1925 i915_gem_ww_ctx_fini(&ww);
1926
1927 if (err) {
1928 i915_vma_put(engine->wa_ctx.vma);
1929
1930 /* Clear all flags to prevent further use */
1931 memset(wa_ctx, 0, sizeof(*wa_ctx));
1932 }
1933 }
1934
static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1936 {
1937 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1938 stats->runtime.num_underflow++;
1939 stats->runtime.max_underflow =
1940 max_t(u32, stats->runtime.max_underflow, -dt);
1941 #endif
1942 }
1943
static u32 lrc_get_runtime(const struct intel_context *ce)
1945 {
1946 /*
1947 * We can use either ppHWSP[16] which is recorded before the context
1948 * switch (and so excludes the cost of context switches) or use the
1949 * value from the context image itself, which is saved/restored earlier
1950 * and so includes the cost of the save.
1951 */
1952 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1953 }
1954
void lrc_update_runtime(struct intel_context *ce)
1956 {
1957 struct intel_context_stats *stats = &ce->stats;
1958 u32 old;
1959 s32 dt;
1960
1961 old = stats->runtime.last;
1962 stats->runtime.last = lrc_get_runtime(ce);
1963 dt = stats->runtime.last - old;
1964 if (!dt)
1965 return;
1966
1967 if (unlikely(dt < 0)) {
1968 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1969 old, stats->runtime.last, dt);
1970 st_runtime_underflow(stats, dt);
1971 return;
1972 }
1973
1974 ewma_runtime_add(&stats->runtime.avg, dt);
1975 stats->runtime.total += dt;
1976 }
1977
1978 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1979 #include "selftest_lrc.c"
1980 #endif
1981