1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2017 - Linaro Ltd
4 * Author: Jintack Lim <jintack.lim@linaro.org>
5 */
6
7 #include <linux/kvm_host.h>
8
9 #include <asm/esr.h>
10 #include <asm/kvm_hyp.h>
11 #include <asm/kvm_mmu.h>
12 #include <asm/lsui.h>
13
static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
15 {
16 wr->fst = fst;
17 wr->ptw = s1ptw;
18 wr->s2 = s1ptw;
19 wr->failed = true;
20 }
21
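/*
 * Sentinel stored in wr->level when stage-1 translation is not in
 * effect (SCTLR_ELx.M clear, or HCR_EL2.{DC,TGE} forcing a flat
 * mapping), so later code can tell this case from a completed walk.
 */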
22 #define S1_MMU_DISABLED (-127)
23
static int get_ia_size(struct s1_walk_info *wi)
25 {
26 return 64 - wi->txsz;
27 }
28
29 /* Return true if the IPA is out of the OA range */
static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
31 {
32 if (wi->pa52bit)
33 return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
34 return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
35 }
36
static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
38 {
39 switch (BIT(wi->pgshift)) {
40 case SZ_64K:
41 default: /* IMPDEF: treat any other value as 64k */
42 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
43 return false;
44 return ((wi->regime == TR_EL2 ?
45 FIELD_GET(TCR_EL2_PS_MASK, tcr) :
46 FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
47 case SZ_16K:
48 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
49 return false;
50 break;
51 case SZ_4K:
52 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
53 return false;
54 break;
55 }
56
57 return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
58 }
59
static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
61 {
62 u64 addr;
63
64 if (!wi->pa52bit)
65 return desc & GENMASK_ULL(47, wi->pgshift);
66
67 switch (BIT(wi->pgshift)) {
68 case SZ_4K:
69 case SZ_16K:
70 addr = desc & GENMASK_ULL(49, wi->pgshift);
71 addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
72 break;
73 case SZ_64K:
74 default: /* IMPDEF: treat any other value as 64k */
75 addr = desc & GENMASK_ULL(47, wi->pgshift);
76 addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
77 break;
78 }
79
80 return addr;
81 }
82
83 /* Return the translation regime that applies to an AT instruction */
static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
85 {
86 /*
87 * We only get here from guest EL2, so the translation
88 * regime AT applies to is solely defined by {E2H,TGE}.
89 */
90 switch (op) {
91 case OP_AT_S1E2R:
92 case OP_AT_S1E2W:
93 case OP_AT_S1E2A:
94 return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
95 default:
96 return (vcpu_el2_e2h_is_set(vcpu) &&
97 vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
98 }
99 }
100
static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
102 {
103 if (regime == TR_EL10) {
104 if (vcpu_has_nv(vcpu) &&
105 !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
106 return 0;
107
108 return vcpu_read_sys_reg(vcpu, TCR2_EL1);
109 }
110
111 return vcpu_read_sys_reg(vcpu, TCR2_EL2);
112 }
113
static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
115 {
116 if (!kvm_has_s1pie(vcpu->kvm))
117 return false;
118
119 /* Abuse TCR2_EL1_PIE and use it for EL2 as well */
120 return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
121 }
122
static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
124 {
125 u64 val;
126
127 if (!kvm_has_s1poe(vcpu->kvm)) {
128 wi->poe = wi->e0poe = false;
129 return;
130 }
131
132 val = effective_tcr2(vcpu, wi->regime);
133
134 /* Abuse TCR2_EL1_* for EL2 */
135 wi->poe = val & TCR2_EL1_POE;
136 wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
137 }
138
static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
	struct s1_walk_result *wr, u64 va)
141 {
142 u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
143 unsigned int stride, x;
144 bool va55, tbi, lva;
145
146 va55 = va & BIT(55);
147
148 if (vcpu_has_nv(vcpu)) {
149 hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
150 wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
151 } else {
152 WARN_ON_ONCE(wi->regime != TR_EL10);
153 wi->s2 = false;
154 hcr = 0;
155 }
156
157 switch (wi->regime) {
158 case TR_EL10:
159 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
160 tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
161 ttbr = (va55 ?
162 vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
163 vcpu_read_sys_reg(vcpu, TTBR0_EL1));
164 break;
165 case TR_EL2:
166 case TR_EL20:
167 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
168 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
169 ttbr = (va55 ?
170 vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
171 vcpu_read_sys_reg(vcpu, TTBR0_EL2));
172 break;
173 default:
174 BUG();
175 }
176
177 /* Someone was silly enough to encode TG0/TG1 differently */
178 if (va55 && wi->regime != TR_EL2) {
179 wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
180 tg = FIELD_GET(TCR_TG1_MASK, tcr);
181
182 switch (tg << TCR_TG1_SHIFT) {
183 case TCR_TG1_4K:
184 wi->pgshift = 12; break;
185 case TCR_TG1_16K:
186 wi->pgshift = 14; break;
187 case TCR_TG1_64K:
188 default: /* IMPDEF: treat any other value as 64k */
189 wi->pgshift = 16; break;
190 }
191 } else {
192 wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
193 tg = FIELD_GET(TCR_TG0_MASK, tcr);
194
195 switch (tg << TCR_TG0_SHIFT) {
196 case TCR_TG0_4K:
197 wi->pgshift = 12; break;
198 case TCR_TG0_16K:
199 wi->pgshift = 14; break;
200 case TCR_TG0_64K:
201 default: /* IMPDEF: treat any other value as 64k */
202 wi->pgshift = 16; break;
203 }
204 }
205
206 wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);
207
208 ia_bits = get_ia_size(wi);
209
210 /* AArch64.S1StartLevel() */
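/*
 * Illustrative example: a 4kB granule (pgshift = 12) gives a stride of
 * 9 bits per level; with TxSZ = 16 (48-bit IA),
 * sl = 3 - ((48 - 1 - 12) / 9) = 0, i.e. a four-level walk starting at
 * level 0.
 */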
211 stride = wi->pgshift - 3;
212 wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
213
214 if (wi->regime == TR_EL2 && va55)
215 goto addrsz;
216
217 tbi = (wi->regime == TR_EL2 ?
218 FIELD_GET(TCR_EL2_TBI, tcr) :
219 (va55 ?
220 FIELD_GET(TCR_TBI1, tcr) :
221 FIELD_GET(TCR_TBI0, tcr)));
222
223 if (!tbi && (u64)sign_extend64(va, 55) != va)
224 goto addrsz;
225
226 wi->sh = (wi->regime == TR_EL2 ?
227 FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
228 (va55 ?
229 FIELD_GET(TCR_SH1_MASK, tcr) :
230 FIELD_GET(TCR_SH0_MASK, tcr)));
231
232 va = (u64)sign_extend64(va, 55);
233
234 /* Let's put the MMU disabled case aside immediately */
235 switch (wi->regime) {
236 case TR_EL10:
237 /*
238 * If dealing with the EL1&0 translation regime, 3 things
239 * can disable the S1 translation:
240 *
241 * - HCR_EL2.DC = 1
242 * - HCR_EL2.{E2H,TGE} = {0,1}
243 * - SCTLR_EL1.M = 0
244 *
245 * The TGE part is interesting. If we have decided that this
246 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
247 * {0,x}, and we only need to test for TGE == 1.
248 */
249 if (hcr & (HCR_DC | HCR_TGE)) {
250 wr->level = S1_MMU_DISABLED;
251 break;
252 }
253 fallthrough;
254 case TR_EL2:
255 case TR_EL20:
256 if (!(sctlr & SCTLR_ELx_M))
257 wr->level = S1_MMU_DISABLED;
258 break;
259 }
260
261 if (wr->level == S1_MMU_DISABLED) {
262 if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
263 goto addrsz;
264
265 wr->pa = va;
266 return 0;
267 }
268
269 wi->be = sctlr & SCTLR_ELx_EE;
270
271 wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
272 wi->hpd &= (wi->regime == TR_EL2 ?
273 FIELD_GET(TCR_EL2_HPD, tcr) :
274 (va55 ?
275 FIELD_GET(TCR_HPD1, tcr) :
276 FIELD_GET(TCR_HPD0, tcr)));
277 /* R_JHSVW */
278 wi->hpd |= s1pie_enabled(vcpu, wi->regime);
279
280 /* Do we have POE? */
281 compute_s1poe(vcpu, wi);
282
283 /* R_BVXDG */
284 wi->hpd |= (wi->poe || wi->e0poe);
285
286 /* R_PLCGL, R_YXNYW */
287 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
288 if (wi->txsz > 39)
289 goto transfault;
290 } else {
291 if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
292 goto transfault;
293 }
294
295 /* R_GTJBY, R_SXWGM */
296 switch (BIT(wi->pgshift)) {
297 case SZ_4K:
298 case SZ_16K:
299 lva = wi->pa52bit;
300 break;
301 case SZ_64K:
302 lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
303 break;
304 }
305
306 if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
307 goto transfault;
308
309 /* R_YYVYV, I_THCZK */
310 if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
311 (va55 && va < GENMASK(63, ia_bits)))
312 goto transfault;
313
314 /* I_ZFSYQ */
315 if (wi->regime != TR_EL2 &&
316 (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
317 goto transfault;
318
319 /* R_BNDVG and following statements */
320 if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
321 wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
322 goto transfault;
323
324 ps = (wi->regime == TR_EL2 ?
325 FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
326
327 wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));
328
329 /* Compute minimal alignment */
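/*
 * For the 4kB/48-bit example above (sl = 0, stride = 9), this gives
 * x = 3 + 48 - (3 * 9 + 12) = 12, i.e. the level-0 table must be at
 * least page (4kB) aligned.
 */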
330 x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
331
332 wi->baddr = ttbr & TTBRx_EL1_BADDR;
333 if (wi->pa52bit) {
334 /*
335 * Force the alignment on 64 bytes for top-level tables
336 * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
337 * store bits [51:48] of the first level of lookup.
338 */
339 x = max(x, 6);
340
341 wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
342 }
343
344 /* R_VPBBF */
345 if (check_output_size(wi->baddr, wi))
346 goto addrsz;
347
348 wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
349
350 wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
351 wi->ha &= (wi->regime == TR_EL2 ?
352 FIELD_GET(TCR_EL2_HA, tcr) :
353 FIELD_GET(TCR_HA, tcr));
354
355 return 0;
356
357 addrsz:
358 /*
359 * Address Size Fault level 0 to indicate it comes from TTBR.
360 * yes, this is an oddity.
361 */
362 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
363 return -EFAULT;
364
365 transfault:
366 /* Translation Fault on start level */
367 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
368 return -EFAULT;
369 }
370
static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
	struct s1_walk_info *wi)
373 {
374 u64 val;
375 int r;
376
377 r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
378 if (r)
379 return r;
380
381 if (wi->be)
382 *desc = be64_to_cpu((__force __be64)val);
383 else
384 *desc = le64_to_cpu((__force __le64)val);
385
386 return 0;
387 }
388
static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
	struct s1_walk_info *wi)
391 {
392 if (wi->be) {
393 old = (__force u64)cpu_to_be64(old);
394 new = (__force u64)cpu_to_be64(new);
395 } else {
396 old = (__force u64)cpu_to_le64(old);
397 new = (__force u64)cpu_to_le64(new);
398 }
399
400 return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
401 }
402
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
	struct s1_walk_result *wr, u64 va)
405 {
406 u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
407 struct kvm_s2_trans s2_trans = {};
408 int level, stride, ret;
409
410 level = wi->sl;
411 stride = wi->pgshift - 3;
412 baddr = wi->baddr;
413
414 va_top = get_ia_size(wi) - 1;
415
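/*
 * At each level, bits [va_top:va_bottom] of the VA select an entry in
 * the current table. Descriptors are 8 bytes, hence the shift by
 * (va_bottom - 3) below to turn that VA slice into a byte offset
 * within the table.
 */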
416 while (1) {
417 u64 index;
418
419 va_bottom = (3 - level) * stride + wi->pgshift;
420 index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
421
422 ipa = baddr | index;
423
424 if (wi->s2) {
425 ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
426 if (ret) {
427 fail_s1_walk(wr,
428 (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
429 true);
430 return ret;
431 }
432
433 if (!kvm_s2_trans_readable(&s2_trans)) {
434 fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
435 true);
436
437 return -EPERM;
438 }
439
440 ipa = kvm_s2_trans_output(&s2_trans);
441 }
442
443 if (wi->filter) {
444 ret = wi->filter->fn(&(struct s1_walk_context)
445 {
446 .wi = wi,
447 .table_ipa = baddr,
448 .level = level,
449 }, wi->filter->priv);
450 if (ret)
451 return ret;
452 }
453
454 ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
455 if (ret) {
456 fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
457 return ret;
458 }
459
460 new_desc = desc;
461
462 /* Invalid descriptor */
463 if (!(desc & BIT(0)))
464 goto transfault;
465
466 /* Block mapping, check validity down the line */
467 if (!(desc & BIT(1)))
468 break;
469
470 /* Page mapping */
471 if (level == 3)
472 break;
473
474 /* Table handling */
475 if (!wi->hpd) {
476 wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
477 wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
478 wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
479 }
480
481 baddr = desc_to_oa(wi, desc);
482
483 /* Check for out-of-range OA */
484 if (check_output_size(baddr, wi))
485 goto addrsz;
486
487 /* Prepare for next round */
488 va_top = va_bottom - 1;
489 level++;
490 }
491
492 /* Block mapping, check the validity of the level */
493 if (!(desc & BIT(1))) {
494 bool valid_block = false;
495
496 switch (BIT(wi->pgshift)) {
497 case SZ_4K:
498 valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
499 break;
500 case SZ_16K:
501 case SZ_64K:
502 valid_block = level == 2 || (wi->pa52bit && level == 1);
503 break;
504 }
505
506 if (!valid_block)
507 goto transfault;
508 }
509
510 baddr = desc_to_oa(wi, desc);
511 if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
512 goto addrsz;
513
514 if (wi->ha)
515 new_desc |= PTE_AF;
516
517 if (new_desc != desc) {
518 if (wi->s2 && !kvm_s2_trans_writable(&s2_trans)) {
519 fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true);
520 return -EPERM;
521 }
522
523 ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
524 if (ret)
525 return ret;
526
527 desc = new_desc;
528 }
529
530 if (!(desc & PTE_AF)) {
531 fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
532 return -EACCES;
533 }
534
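/*
 * Account for the Contiguous bit: a contiguous run of entries maps one
 * larger, naturally aligned block, so more low-order VA bits are
 * copied straight into the reported PA below.
 */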
535 va_bottom += contiguous_bit_shift(desc, wi, level);
536
537 wr->failed = false;
538 wr->level = level;
539 wr->desc = desc;
540 wr->pa = baddr & GENMASK(52, va_bottom);
541 wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
542
543 wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
544 if (wr->nG)
545 wr->asid = get_asid_by_regime(vcpu, wi->regime);
546
547 return 0;
548
549 addrsz:
550 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
551 return -EINVAL;
552 transfault:
553 fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
554 return -ENOENT;
555 }
556
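/*
 * Snapshot of the host's EL1&0 translation state (plus VTTBR/VTCR),
 * allowing the fast AT path to temporarily install the guest's context
 * and put everything back afterwards.
 */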
557 struct mmu_config {
558 u64 ttbr0;
559 u64 ttbr1;
560 u64 tcr;
561 u64 mair;
562 u64 tcr2;
563 u64 pir;
564 u64 pire0;
565 u64 por_el0;
566 u64 por_el1;
567 u64 sctlr;
568 u64 vttbr;
569 u64 vtcr;
570 };
571
static void __mmu_config_save(struct mmu_config *config)
573 {
574 config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
575 config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
576 config->tcr = read_sysreg_el1(SYS_TCR);
577 config->mair = read_sysreg_el1(SYS_MAIR);
578 if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
579 config->tcr2 = read_sysreg_el1(SYS_TCR2);
580 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
581 config->pir = read_sysreg_el1(SYS_PIR);
582 config->pire0 = read_sysreg_el1(SYS_PIRE0);
583 }
584 if (system_supports_poe()) {
585 config->por_el1 = read_sysreg_el1(SYS_POR);
586 config->por_el0 = read_sysreg_s(SYS_POR_EL0);
587 }
588 }
589 config->sctlr = read_sysreg_el1(SYS_SCTLR);
590 config->vttbr = read_sysreg(vttbr_el2);
591 config->vtcr = read_sysreg(vtcr_el2);
592 }
593
static void __mmu_config_restore(struct mmu_config *config)
595 {
596 /*
597 * ARM errata 1165522 and 1530923 require TGE to be 1 before
598 * we update the guest state.
599 */
600 asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
601
602 write_sysreg_el1(config->ttbr0, SYS_TTBR0);
603 write_sysreg_el1(config->ttbr1, SYS_TTBR1);
604 write_sysreg_el1(config->tcr, SYS_TCR);
605 write_sysreg_el1(config->mair, SYS_MAIR);
606 if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
607 write_sysreg_el1(config->tcr2, SYS_TCR2);
608 if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
609 write_sysreg_el1(config->pir, SYS_PIR);
610 write_sysreg_el1(config->pire0, SYS_PIRE0);
611 }
612 if (system_supports_poe()) {
613 write_sysreg_el1(config->por_el1, SYS_POR);
614 write_sysreg_s(config->por_el0, SYS_POR_EL0);
615 }
616 }
617 write_sysreg_el1(config->sctlr, SYS_SCTLR);
618 write_sysreg(config->vttbr, vttbr_el2);
619 write_sysreg(config->vtcr, vtcr_el2);
620 }
621
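/*
 * AT S1E1RP/S1E1WP must honour PSTATE.PAN, so temporarily mirror the
 * guest's PAN bit into the host PSTATE around the AT instruction.
 */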
static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
623 {
624 u64 host_pan;
625 bool fail;
626
627 host_pan = read_sysreg_s(SYS_PSTATE_PAN);
628 write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
629
630 switch (op) {
631 case OP_AT_S1E1RP:
632 fail = __kvm_at(OP_AT_S1E1RP, vaddr);
633 break;
634 case OP_AT_S1E1WP:
635 fail = __kvm_at(OP_AT_S1E1WP, vaddr);
636 break;
637 }
638
639 write_sysreg_s(host_pan, SYS_PSTATE_PAN);
640
641 return fail;
642 }
643
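/*
 * These follow the MAIR_ELx encoding for Normal memory: outer
 * attributes in bits [7:4], inner attributes in bits [3:0], with a
 * zero outer nibble denoting Device memory. For example,
 * MEMATTR(NC, Wb) is 0b1100_0100, i.e. outer Write-Back, inner
 * Non-Cacheable.
 */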
644 #define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
645 #define MEMATTR_NC 0b0100
646 #define MEMATTR_Wt 0b1000
647 #define MEMATTR_Wb 0b1100
648 #define MEMATTR_WbRaWa 0b1111
649
650 #define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
651
static u8 s2_memattr_to_attr(u8 memattr)
653 {
654 memattr &= 0b1111;
655
656 switch (memattr) {
657 case 0b0000:
658 case 0b0001:
659 case 0b0010:
660 case 0b0011:
661 return memattr << 2;
662 case 0b0100:
663 return MEMATTR(Wb, Wb);
664 case 0b0101:
665 return MEMATTR(NC, NC);
666 case 0b0110:
667 return MEMATTR(Wt, NC);
668 case 0b0111:
669 return MEMATTR(Wb, NC);
670 case 0b1000:
671 /* Reserved, assume NC */
672 return MEMATTR(NC, NC);
673 case 0b1001:
674 return MEMATTR(NC, Wt);
675 case 0b1010:
676 return MEMATTR(Wt, Wt);
677 case 0b1011:
678 return MEMATTR(Wb, Wt);
679 case 0b1100:
680 /* Reserved, assume NC */
681 return MEMATTR(NC, NC);
682 case 0b1101:
683 return MEMATTR(NC, Wb);
684 case 0b1110:
685 return MEMATTR(Wt, Wb);
686 case 0b1111:
687 return MEMATTR(Wb, Wb);
688 default:
689 unreachable();
690 }
691 }
692
static u8 combine_s1_s2_attr(u8 s1, u8 s2)
694 {
695 bool transient;
696 u8 final = 0;
697
698 /* Upgrade transient s1 to non-transient to simplify things */
699 switch (s1) {
700 case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
701 transient = true;
702 s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
703 break;
704 case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
705 transient = true;
706 s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
707 break;
708 default:
709 transient = false;
710 }
711
712 /* S2CombineS1AttrHints() */
713 if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
714 (s2 & GENMASK(3, 2)) == MEMATTR_NC)
715 final = MEMATTR_NC;
716 else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
717 (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
718 final = MEMATTR_Wt;
719 else
720 final = MEMATTR_Wb;
721
722 if (final != MEMATTR_NC) {
/* Inherit RaWa hints from S1 */
724 if (transient) {
725 switch (s1 & GENMASK(3, 2)) {
726 case MEMATTR_Wt:
727 final = 0;
728 break;
729 case MEMATTR_Wb:
730 final = MEMATTR_NC;
731 break;
732 }
733 }
734
735 final |= s1 & GENMASK(1, 0);
736 }
737
738 return final;
739 }
740
741 #define ATTR_NSH 0b00
742 #define ATTR_RSV 0b01
743 #define ATTR_OSH 0b10
744 #define ATTR_ISH 0b11
745
static u8 compute_final_sh(u8 attr, u8 sh)
747 {
/* Any form of device, as well as NC, has SH[1:0]=0b10 */
749 if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
750 return ATTR_OSH;
751
752 if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
753 sh = ATTR_NSH;
754
755 return sh;
756 }
757
static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
	u8 attr)
760 {
761 u8 sh;
762
763 /*
764 * non-52bit and LPA have their basic shareability described in the
765 * descriptor. LPA2 gets it from the corresponding field in TCR,
766 * conveniently recorded in the walk info.
767 */
768 if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
769 sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
770 else
771 sh = wi->sh;
772
773 return compute_final_sh(attr, sh);
774 }
775
static u8 combine_sh(u8 s1_sh, u8 s2_sh)
777 {
778 if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
779 return ATTR_OSH;
780 if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
781 return ATTR_ISH;
782
783 return ATTR_NSH;
784 }
785
static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
	struct kvm_s2_trans *tr)
788 {
789 u8 s1_parattr, s2_memattr, final_attr, s2_sh;
790 u64 par;
791
792 /* If S2 has failed to translate, report the damage */
793 if (tr->esr) {
794 par = SYS_PAR_EL1_RES1;
795 par |= SYS_PAR_EL1_F;
796 par |= SYS_PAR_EL1_S;
797 par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
798 return par;
799 }
800
801 s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
802 s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
803
804 if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
805 if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
806 s2_memattr &= ~BIT(3);
807
808 /* Combination of R_VRJSW and R_RHWZM */
809 switch (s2_memattr) {
810 case 0b0101:
811 if (MEMATTR_IS_DEVICE(s1_parattr))
812 final_attr = s1_parattr;
813 else
814 final_attr = MEMATTR(NC, NC);
815 break;
816 case 0b0110:
817 case 0b1110:
818 final_attr = MEMATTR(WbRaWa, WbRaWa);
819 break;
820 case 0b0111:
821 case 0b1111:
822 /* Preserve S1 attribute */
823 final_attr = s1_parattr;
824 break;
825 case 0b0100:
826 case 0b1100:
827 case 0b1101:
828 /* Reserved, do something non-silly */
829 final_attr = s1_parattr;
830 break;
831 default:
832 /*
833 * MemAttr[2]=0, Device from S2.
834 *
835 * FWB does not influence the way that stage 1
836 * memory types and attributes are combined
837 * with stage 2 Device type and attributes.
838 */
839 final_attr = min(s2_memattr_to_attr(s2_memattr),
840 s1_parattr);
841 }
842 } else {
843 /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
844 u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
845
846 if (MEMATTR_IS_DEVICE(s1_parattr) ||
847 MEMATTR_IS_DEVICE(s2_parattr)) {
848 final_attr = min(s1_parattr, s2_parattr);
849 } else {
850 /* At this stage, this is memory vs memory */
851 final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
852 s2_parattr & 0xf);
853 final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
854 s2_parattr >> 4) << 4;
855 }
856 }
857
858 if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
859 !MEMATTR_IS_DEVICE(final_attr))
860 final_attr = MEMATTR(NC, NC);
861
862 s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);
863
864 par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
865 par |= tr->output & GENMASK(47, 12);
866 par |= FIELD_PREP(SYS_PAR_EL1_SH,
867 combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
868 compute_final_sh(final_attr, s2_sh)));
869
870 return par;
871 }
872
static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
	struct s1_walk_result *wr)
875 {
876 u64 par;
877
878 if (wr->failed) {
879 par = SYS_PAR_EL1_RES1;
880 par |= SYS_PAR_EL1_F;
881 par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
882 par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
883 par |= wr->s2 ? SYS_PAR_EL1_S : 0;
884 } else if (wr->level == S1_MMU_DISABLED) {
885 /* MMU off or HCR_EL2.DC == 1 */
886 par = SYS_PAR_EL1_NSE;
887 par |= wr->pa & SYS_PAR_EL1_PA;
888
889 if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
890 (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
891 par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
892 MEMATTR(WbRaWa, WbRaWa));
893 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
894 } else {
895 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
896 par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
897 }
898 } else {
899 u64 mair, sctlr;
900 u8 sh;
901
902 par = SYS_PAR_EL1_NSE;
903
904 mair = (wi->regime == TR_EL10 ?
905 vcpu_read_sys_reg(vcpu, MAIR_EL1) :
906 vcpu_read_sys_reg(vcpu, MAIR_EL2));
907
908 mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
909 mair &= 0xff;
910
911 sctlr = (wi->regime == TR_EL10 ?
912 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
913 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
914
915 /* Force NC for memory if SCTLR_ELx.C is clear */
916 if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
917 mair = MEMATTR(NC, NC);
918
919 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
920 par |= wr->pa & SYS_PAR_EL1_PA;
921
922 sh = compute_s1_sh(wi, wr, mair);
923 par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
924 }
925
926 return par;
927 }
928
static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
930 {
931 u64 sctlr;
932
933 if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
934 return false;
935
936 if (s1pie_enabled(vcpu, regime))
937 return true;
938
939 if (regime == TR_EL10)
940 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
941 else
942 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
943
944 return sctlr & SCTLR_EL1_EPAN;
945 }
946
static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
	struct s1_walk_info *wi,
	struct s1_walk_result *wr)
950 {
951 bool wxn;
952
953 /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
954 if (wi->regime != TR_EL2) {
955 switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
956 case 0b00:
957 wr->pr = wr->pw = true;
958 wr->ur = wr->uw = false;
959 break;
960 case 0b01:
961 wr->pr = wr->pw = wr->ur = wr->uw = true;
962 break;
963 case 0b10:
964 wr->pr = true;
965 wr->pw = wr->ur = wr->uw = false;
966 break;
967 case 0b11:
968 wr->pr = wr->ur = true;
969 wr->pw = wr->uw = false;
970 break;
971 }
972
973 /* We don't use px for anything yet, but hey... */
974 wr->px = !((wr->desc & PTE_PXN) || wr->uw);
975 wr->ux = !(wr->desc & PTE_UXN);
976 } else {
977 wr->ur = wr->uw = wr->ux = false;
978
979 if (!(wr->desc & PTE_RDONLY)) {
980 wr->pr = wr->pw = true;
981 } else {
982 wr->pr = true;
983 wr->pw = false;
984 }
985
986 /* XN maps to UXN */
987 wr->px = !(wr->desc & PTE_UXN);
988 }
989
990 switch (wi->regime) {
991 case TR_EL2:
992 case TR_EL20:
993 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
994 break;
995 case TR_EL10:
996 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
997 break;
998 }
999
1000 wr->pwxn = wr->uwxn = wxn;
1001 wr->pov = wi->poe;
1002 wr->uov = wi->e0poe;
1003 }
1004
static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
	struct s1_walk_info *wi,
	struct s1_walk_result *wr)
1008 {
1009 /* Hierarchical part of AArch64.S1DirectBasePermissions() */
1010 if (wi->regime != TR_EL2) {
1011 switch (wr->APTable) {
1012 case 0b00:
1013 break;
1014 case 0b01:
1015 wr->ur = wr->uw = false;
1016 break;
1017 case 0b10:
1018 wr->pw = wr->uw = false;
1019 break;
1020 case 0b11:
1021 wr->pw = wr->ur = wr->uw = false;
1022 break;
1023 }
1024
1025 wr->px &= !wr->PXNTable;
1026 wr->ux &= !wr->UXNTable;
1027 } else {
1028 if (wr->APTable & BIT(1))
1029 wr->pw = false;
1030
1031 /* XN maps to UXN */
1032 wr->px &= !wr->UXNTable;
1033 }
1034 }
1035
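/*
 * PIRx_ELx/PORx_ELx pack sixteen 4-bit permission fields; perm_idx()
 * extracts the field selected by the index decoded from the
 * descriptor.
 */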
1036 #define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf)
1037
1038 #define set_priv_perms(wr, r, w, x) \
1039 do { \
1040 (wr)->pr = (r); \
1041 (wr)->pw = (w); \
1042 (wr)->px = (x); \
1043 } while (0)
1044
1045 #define set_unpriv_perms(wr, r, w, x) \
1046 do { \
1047 (wr)->ur = (r); \
1048 (wr)->uw = (w); \
1049 (wr)->ux = (x); \
1050 } while (0)
1051
1052 #define set_priv_wxn(wr, v) \
1053 do { \
1054 (wr)->pwxn = (v); \
1055 } while (0)
1056
1057 #define set_unpriv_wxn(wr, v) \
1058 do { \
1059 (wr)->uwxn = (v); \
1060 } while (0)
1061
1062 /* Similar to AArch64.S1IndirectBasePermissions(), without GCS */
1063 #define set_perms(w, wr, ip) \
1064 do { \
1065 /* R_LLZDZ */ \
1066 switch ((ip)) { \
1067 case 0b0000: \
1068 set_ ## w ## _perms((wr), false, false, false); \
1069 break; \
1070 case 0b0001: \
1071 set_ ## w ## _perms((wr), true , false, false); \
1072 break; \
1073 case 0b0010: \
1074 set_ ## w ## _perms((wr), false, false, true ); \
1075 break; \
1076 case 0b0011: \
1077 set_ ## w ## _perms((wr), true , false, true ); \
1078 break; \
1079 case 0b0100: \
1080 set_ ## w ## _perms((wr), false, false, false); \
1081 break; \
1082 case 0b0101: \
1083 set_ ## w ## _perms((wr), true , true , false); \
1084 break; \
1085 case 0b0110: \
1086 set_ ## w ## _perms((wr), true , true , true ); \
1087 break; \
1088 case 0b0111: \
1089 set_ ## w ## _perms((wr), true , true , true ); \
1090 break; \
1091 case 0b1000: \
1092 set_ ## w ## _perms((wr), true , false, false); \
1093 break; \
1094 case 0b1001: \
1095 set_ ## w ## _perms((wr), true , false, false); \
1096 break; \
1097 case 0b1010: \
1098 set_ ## w ## _perms((wr), true , false, true ); \
1099 break; \
1100 case 0b1011: \
1101 set_ ## w ## _perms((wr), false, false, false); \
1102 break; \
1103 case 0b1100: \
1104 set_ ## w ## _perms((wr), true , true , false); \
1105 break; \
1106 case 0b1101: \
1107 set_ ## w ## _perms((wr), false, false, false); \
1108 break; \
1109 case 0b1110: \
1110 set_ ## w ## _perms((wr), true , true , true ); \
1111 break; \
1112 case 0b1111: \
1113 set_ ## w ## _perms((wr), false, false, false); \
1114 break; \
1115 } \
1116 \
1117 /* R_HJYGR */ \
1118 set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
1119 \
1120 } while (0)
1121
static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
	struct s1_walk_info *wi,
	struct s1_walk_result *wr)
1125 {
1126 u8 up, pp, idx;
1127
1128 idx = pte_pi_index(wr->desc);
1129
1130 switch (wi->regime) {
1131 case TR_EL10:
1132 pp = perm_idx(vcpu, PIR_EL1, idx);
1133 up = perm_idx(vcpu, PIRE0_EL1, idx);
1134 break;
1135 case TR_EL20:
1136 pp = perm_idx(vcpu, PIR_EL2, idx);
1137 up = perm_idx(vcpu, PIRE0_EL2, idx);
1138 break;
1139 case TR_EL2:
1140 pp = perm_idx(vcpu, PIR_EL2, idx);
1141 up = 0;
1142 break;
1143 }
1144
1145 set_perms(priv, wr, pp);
1146
1147 if (wi->regime != TR_EL2)
1148 set_perms(unpriv, wr, up);
1149 else
1150 set_unpriv_perms(wr, false, false, false);
1151
1152 wr->pov = wi->poe && !(pp & BIT(3));
1153 wr->uov = wi->e0poe && !(up & BIT(3));
1154
1155 /* R_VFPJF */
1156 if (wr->px && wr->uw) {
1157 set_priv_perms(wr, false, false, false);
1158 set_unpriv_perms(wr, false, false, false);
1159 }
1160 }
1161
static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
	struct s1_walk_info *wi,
	struct s1_walk_result *wr)
1165 {
1166 u8 idx, pov_perms, uov_perms;
1167
1168 idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
1169
1170 if (wr->pov) {
1171 switch (wi->regime) {
1172 case TR_EL10:
1173 pov_perms = perm_idx(vcpu, POR_EL1, idx);
1174 break;
1175 case TR_EL20:
1176 pov_perms = perm_idx(vcpu, POR_EL2, idx);
1177 break;
1178 case TR_EL2:
1179 pov_perms = perm_idx(vcpu, POR_EL2, idx);
1180 break;
1181 }
1182
1183 if (pov_perms & ~POE_RWX)
1184 pov_perms = POE_NONE;
1185
/* R_QXXPC, S1PrivOverlay enabled */
1187 if (wr->pwxn && (pov_perms & POE_X))
1188 pov_perms &= ~POE_W;
1189
1190 wr->pr &= pov_perms & POE_R;
1191 wr->pw &= pov_perms & POE_W;
1192 wr->px &= pov_perms & POE_X;
1193 }
1194
1195 if (wr->uov) {
1196 switch (wi->regime) {
1197 case TR_EL10:
1198 uov_perms = perm_idx(vcpu, POR_EL0, idx);
1199 break;
1200 case TR_EL20:
1201 uov_perms = perm_idx(vcpu, POR_EL0, idx);
1202 break;
1203 case TR_EL2:
1204 uov_perms = 0;
1205 break;
1206 }
1207
1208 if (uov_perms & ~POE_RWX)
1209 uov_perms = POE_NONE;
1210
1211 /* R_NPBXC, S1UnprivOverlay enabled */
1212 if (wr->uwxn && (uov_perms & POE_X))
1213 uov_perms &= ~POE_W;
1214
1215 wr->ur &= uov_perms & POE_R;
1216 wr->uw &= uov_perms & POE_W;
1217 wr->ux &= uov_perms & POE_X;
1218 }
1219 }
1220
static void compute_s1_permissions(struct kvm_vcpu *vcpu,
	struct s1_walk_info *wi,
	struct s1_walk_result *wr)
1224 {
1225 bool pan;
1226
1227 if (!s1pie_enabled(vcpu, wi->regime))
1228 compute_s1_direct_permissions(vcpu, wi, wr);
1229 else
1230 compute_s1_indirect_permissions(vcpu, wi, wr);
1231
1232 if (!wi->hpd)
1233 compute_s1_hierarchical_permissions(vcpu, wi, wr);
1234
1235 compute_s1_overlay_permissions(vcpu, wi, wr);
1236
1237 /* R_QXXPC, S1PrivOverlay disabled */
1238 if (!wr->pov)
1239 wr->px &= !(wr->pwxn && wr->pw);
1240
1241 /* R_NPBXC, S1UnprivOverlay disabled */
1242 if (!wr->uov)
1243 wr->ux &= !(wr->uwxn && wr->uw);
1244
1245 pan = wi->pan && (wr->ur || wr->uw ||
1246 (pan3_enabled(vcpu, wi->regime) && wr->ux));
1247 wr->pw &= !pan;
1248 wr->pr &= !pan;
1249 }
1250
static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
1252 {
1253 struct s1_walk_result wr = {};
1254 struct s1_walk_info wi = {};
1255 bool perm_fail = false;
1256 int ret, idx;
1257
1258 wi.regime = compute_translation_regime(vcpu, op);
1259 wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
1260 wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
1261 (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
1262
1263 ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
1264 if (ret)
1265 goto compute_par;
1266
1267 if (wr.level == S1_MMU_DISABLED)
1268 goto compute_par;
1269
1270 idx = srcu_read_lock(&vcpu->kvm->srcu);
1271
1272 ret = walk_s1(vcpu, &wi, &wr, vaddr);
1273
1274 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1275
1276 /*
1277 * Race to update a descriptor -- restart the walk.
1278 */
1279 if (ret == -EAGAIN)
1280 return ret;
1281 if (ret)
1282 goto compute_par;
1283
1284 compute_s1_permissions(vcpu, &wi, &wr);
1285
1286 switch (op) {
1287 case OP_AT_S1E1RP:
1288 case OP_AT_S1E1R:
1289 case OP_AT_S1E2R:
1290 perm_fail = !wr.pr;
1291 break;
1292 case OP_AT_S1E1WP:
1293 case OP_AT_S1E1W:
1294 case OP_AT_S1E2W:
1295 perm_fail = !wr.pw;
1296 break;
1297 case OP_AT_S1E0R:
1298 perm_fail = !wr.ur;
1299 break;
1300 case OP_AT_S1E0W:
1301 perm_fail = !wr.uw;
1302 break;
1303 case OP_AT_S1E1A:
1304 case OP_AT_S1E2A:
1305 break;
1306 default:
1307 BUG();
1308 }
1309
1310 if (perm_fail)
1311 fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
1312
1313 compute_par:
1314 *par = compute_par_s1(vcpu, &wi, &wr);
1315 return 0;
1316 }
1317
1318 /*
1319 * Return the PAR_EL1 value as the result of a valid translation.
1320 *
1321 * If the translation is unsuccessful, the value may only contain
1322 * PAR_EL1.F, and cannot be taken at face value. It isn't an
1323 * indication of the translation having failed, only that the fast
* path did not succeed, *unless* it indicates an S1 permission or
1325 * access fault.
1326 */
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1328 {
1329 struct mmu_config config;
1330 struct kvm_s2_mmu *mmu;
1331 bool fail, mmu_cs;
1332 u64 par;
1333
1334 par = SYS_PAR_EL1_F;
1335
1336 /*
1337 * We've trapped, so everything is live on the CPU. As we will
1338 * be switching contexts behind everybody's back, disable
1339 * interrupts while holding the mmu lock.
1340 */
1341 guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
1342
1343 /*
1344 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
1345 * the right one (as we trapped from vEL2). If not, save the
1346 * full MMU context.
1347 *
1348 * We are also guaranteed to be in the correct context if
1349 * we're not in a nested VM.
1350 */
1351 mmu_cs = (vcpu_has_nv(vcpu) &&
1352 !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
1353 if (!mmu_cs)
1354 goto skip_mmu_switch;
1355
1356 /*
1357 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
1358 * find it (recycled by another vcpu, for example). When this
1359 * happens, admit defeat immediately and use the SW (slow) path.
1360 */
1361 mmu = lookup_s2_mmu(vcpu);
1362 if (!mmu)
1363 return par;
1364
1365 __mmu_config_save(&config);
1366
1367 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
1368 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
1369 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
1370 write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
1371 if (kvm_has_tcr2(vcpu->kvm)) {
1372 write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
1373 if (kvm_has_s1pie(vcpu->kvm)) {
1374 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
1375 write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
1376 }
1377 if (kvm_has_s1poe(vcpu->kvm)) {
1378 write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
1379 write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
1380 }
1381 }
1382 write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
1383 __load_stage2(mmu, mmu->arch);
1384
1385 skip_mmu_switch:
1386 /* Temporarily switch back to guest context */
1387 write_sysreg_hcr(vcpu->arch.hcr_el2);
1388 isb();
1389
1390 switch (op) {
1391 case OP_AT_S1E1RP:
1392 case OP_AT_S1E1WP:
1393 fail = at_s1e1p_fast(vcpu, op, vaddr);
1394 break;
1395 case OP_AT_S1E1R:
1396 fail = __kvm_at(OP_AT_S1E1R, vaddr);
1397 break;
1398 case OP_AT_S1E1W:
1399 fail = __kvm_at(OP_AT_S1E1W, vaddr);
1400 break;
1401 case OP_AT_S1E0R:
1402 fail = __kvm_at(OP_AT_S1E0R, vaddr);
1403 break;
1404 case OP_AT_S1E0W:
1405 fail = __kvm_at(OP_AT_S1E0W, vaddr);
1406 break;
1407 case OP_AT_S1E1A:
1408 fail = __kvm_at(OP_AT_S1E1A, vaddr);
1409 break;
1410 default:
1411 WARN_ON_ONCE(1);
1412 fail = true;
1413 break;
1414 }
1415
1416 if (!fail)
1417 par = read_sysreg_par();
1418
1419 write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
1420
1421 if (mmu_cs)
1422 __mmu_config_restore(&config);
1423
1424 return par;
1425 }
1426
static bool par_check_s1_perm_fault(u64 par)
1428 {
1429 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1430
1431 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
1432 !(par & SYS_PAR_EL1_S));
1433 }
1434
static bool par_check_s1_access_fault(u64 par)
1436 {
1437 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
1438
1439 return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
1440 !(par & SYS_PAR_EL1_S));
1441 }
1442
int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1444 {
1445 u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
1446 int ret;
1447
1448 /*
* If PAR_EL1 reports that AT failed on an S1 permission or access
1450 * fault, we know for sure that the PTW was able to walk the S1
1451 * tables and there's nothing else to do.
1452 *
1453 * If AT failed for any other reason, then we must walk the guest S1
1454 * to emulate the instruction.
1455 */
1456 if ((par & SYS_PAR_EL1_F) &&
1457 !par_check_s1_perm_fault(par) &&
1458 !par_check_s1_access_fault(par)) {
1459 ret = handle_at_slow(vcpu, op, vaddr, &par);
1460 if (ret)
1461 return ret;
1462 }
1463
1464 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1465 return 0;
1466 }
1467
int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1469 {
1470 u64 par;
1471 int ret;
1472
1473 /*
1474 * We've trapped, so everything is live on the CPU. As we will be
1475 * switching context behind everybody's back, disable interrupts...
1476 */
1477 scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
1478 u64 val, hcr;
1479 bool fail;
1480
1481 val = hcr = read_sysreg(hcr_el2);
1482 val &= ~HCR_TGE;
1483 val |= HCR_VM;
1484
1485 if (!vcpu_el2_e2h_is_set(vcpu))
1486 val |= HCR_NV | HCR_NV1;
1487
1488 write_sysreg_hcr(val);
1489 isb();
1490
1491 par = SYS_PAR_EL1_F;
1492
1493 switch (op) {
1494 case OP_AT_S1E2R:
1495 fail = __kvm_at(OP_AT_S1E1R, vaddr);
1496 break;
1497 case OP_AT_S1E2W:
1498 fail = __kvm_at(OP_AT_S1E1W, vaddr);
1499 break;
1500 case OP_AT_S1E2A:
1501 fail = __kvm_at(OP_AT_S1E1A, vaddr);
1502 break;
1503 default:
1504 WARN_ON_ONCE(1);
1505 fail = true;
1506 }
1507
1508 if (!fail)
1509 par = read_sysreg_par();
1510
1511 write_sysreg_hcr(hcr);
1512 isb();
1513 }
1514
1515 /* We failed the translation, let's replay it in slow motion */
1516 if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
1517 ret = handle_at_slow(vcpu, op, vaddr, &par);
1518 if (ret)
1519 return ret;
1520 }
1521
1522 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1523 return 0;
1524 }
1525
int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
1527 {
1528 struct kvm_s2_trans out = {};
1529 u64 ipa, par;
1530 bool write;
1531 int ret;
1532
1533 /* Do the stage-1 translation */
1534 switch (op) {
1535 case OP_AT_S12E1R:
1536 op = OP_AT_S1E1R;
1537 write = false;
1538 break;
1539 case OP_AT_S12E1W:
1540 op = OP_AT_S1E1W;
1541 write = true;
1542 break;
1543 case OP_AT_S12E0R:
1544 op = OP_AT_S1E0R;
1545 write = false;
1546 break;
1547 case OP_AT_S12E0W:
1548 op = OP_AT_S1E0W;
1549 write = true;
1550 break;
1551 default:
1552 WARN_ON_ONCE(1);
1553 return 0;
1554 }
1555
1556 __kvm_at_s1e01(vcpu, op, vaddr);
1557 par = vcpu_read_sys_reg(vcpu, PAR_EL1);
1558 if (par & SYS_PAR_EL1_F)
1559 return 0;
1560
1561 /*
1562 * If we only have a single stage of translation (EL2&0), exit
1563 * early. Same thing if {VM,DC}=={0,0}.
1564 */
1565 if (compute_translation_regime(vcpu, op) == TR_EL20 ||
1566 !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
1567 return 0;
1568
1569 /* Do the stage-2 translation */
1570 ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
1571 out.esr = 0;
1572 ret = kvm_walk_nested_s2(vcpu, ipa, &out);
1573 if (ret < 0)
1574 return ret;
1575
1576 /* Check the access permission */
1577 if (!out.esr &&
1578 ((!write && !out.readable) || (write && !out.writable)))
1579 out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
1580
1581 par = compute_par_s12(vcpu, par, &out);
1582 vcpu_write_sys_reg(vcpu, par, PAR_EL1);
1583 return 0;
1584 }
1585
1586 /*
1587 * Translate a VA for a given EL in a given translation regime, with
1588 * or without PAN. This requires wi->{regime, as_el0, pan} to be
1589 * set. The rest of the wi and wr should be 0-initialised.
1590 */
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
	struct s1_walk_result *wr, u64 va)
1593 {
1594 int ret;
1595
1596 ret = setup_s1_walk(vcpu, wi, wr, va);
1597 if (ret)
1598 return ret;
1599
1600 if (wr->level == S1_MMU_DISABLED) {
1601 wr->ur = wr->uw = wr->ux = true;
1602 wr->pr = wr->pw = wr->px = true;
1603 } else {
1604 ret = walk_s1(vcpu, wi, wr, va);
1605 if (ret)
1606 return ret;
1607
1608 compute_s1_permissions(vcpu, wi, wr);
1609 }
1610
1611 return 0;
1612 }
1613
1614 struct desc_match {
1615 u64 ipa;
1616 int level;
1617 };
1618
static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
1620 {
1621 struct desc_match *dm = priv;
1622 u64 ipa = dm->ipa;
1623
1624 /* Use S1 granule alignment */
1625 ipa &= GENMASK(51, ctxt->wi->pgshift);
1626
1627 /* Not the IPA we're looking for? Continue. */
1628 if (ipa != ctxt->table_ipa)
1629 return 0;
1630
1631 /* Note the level and interrupt the walk */
1632 dm->level = ctxt->level;
1633 return -EINTR;
1634 }
1635
int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
1637 {
1638 struct desc_match dm = {
1639 .ipa = ipa,
1640 };
1641 struct s1_walk_info wi = {
1642 .filter = &(struct s1_walk_filter){
1643 .fn = match_s1_desc,
1644 .priv = &dm,
1645 },
1646 .as_el0 = false,
1647 .pan = false,
1648 };
1649 struct s1_walk_result wr = {};
1650 int ret;
1651
1652 if (is_hyp_ctxt(vcpu))
1653 wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
1654 else
1655 wi.regime = TR_EL10;
1656
1657 ret = setup_s1_walk(vcpu, &wi, &wr, va);
1658 if (ret)
1659 return ret;
1660
1661 /* We really expect the S1 MMU to be on here... */
1662 if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
1663 *level = 0;
1664 return 0;
1665 }
1666
1667 /* Walk the guest's PT, looking for a match along the way */
1668 ret = walk_s1(vcpu, &wi, &wr, va);
1669 switch (ret) {
1670 case -EINTR:
1671 /* We interrupted the walk on a match, return the level */
1672 *level = dm.level;
1673 return 0;
1674 case 0:
1675 /* The walk completed, we failed to find the entry */
1676 return -ENOENT;
1677 default:
1678 /* Any other error... */
1679 return ret;
1680 }
1681 }
1682
static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new)
1684 {
1685 u64 tmp = old;
1686 int ret = 0;
1687
/*
 * Only uaccess_ttbr0_enable()/disable() is needed around the LSUI
 * instruction: it performs an unprivileged access, so no PAN toggling
 * is required.
 */
1692 uaccess_ttbr0_enable();
1693
1694 asm volatile(__LSUI_PREAMBLE
1695 "1: cast %[old], %[new], %[addr]\n"
1696 "2:\n"
1697 _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
1698 : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
1699 : [new] "r" (new)
1700 : "memory");
1701
1702 uaccess_ttbr0_disable();
1703
1704 if (ret)
1705 return ret;
1706 if (tmp != old)
1707 return -EAGAIN;
1708
1709 return ret;
1710 }
1711
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
1713 {
1714 u64 tmp = old;
1715 int ret = 0;
1716
1717 uaccess_enable_privileged();
1718
1719 asm volatile(__LSE_PREAMBLE
1720 "1: cas %[old], %[new], %[addr]\n"
1721 "2:\n"
1722 _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
1723 : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
1724 : [new] "r" (new)
1725 : "memory");
1726
1727 uaccess_disable_privileged();
1728
1729 if (ret)
1730 return ret;
1731 if (tmp != old)
1732 return -EAGAIN;
1733
1734 return ret;
1735 }
1736
static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
1738 {
1739 int ret = 1;
1740 u64 tmp;
1741
1742 uaccess_enable_privileged();
1743
1744 asm volatile("prfm pstl1strm, %[addr]\n"
1745 "1: ldxr %[tmp], %[addr]\n"
1746 "sub %[tmp], %[tmp], %[old]\n"
1747 "cbnz %[tmp], 3f\n"
1748 "2: stlxr %w[ret], %[new], %[addr]\n"
1749 "3:\n"
1750 _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
1751 _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
1752 : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
1753 : [old] "r" (old), [new] "r" (new)
1754 : "memory");
1755
1756 uaccess_disable_privileged();
1757
1758 /* STLXR didn't update the descriptor, or the compare failed */
1759 if (ret == 1)
1760 return -EAGAIN;
1761
1762 return ret;
1763 }
1764
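/*
 * Swap a guest S1 descriptor through its userspace mapping using a
 * compare-and-swap, so that a racing update of the same descriptor is
 * detected (the helpers above return -EAGAIN on a mismatch).
 */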
int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
1766 {
1767 struct kvm_memory_slot *slot;
1768 unsigned long hva;
1769 u64 __user *ptep;
1770 bool writable;
1771 int offset;
1772 gfn_t gfn;
1773 int r;
1774
1775 lockdep_assert(srcu_read_lock_held(&kvm->srcu));
1776
1777 gfn = ipa >> PAGE_SHIFT;
1778 offset = offset_in_page(ipa);
1779 slot = gfn_to_memslot(kvm, gfn);
1780 hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
1781 if (kvm_is_error_hva(hva))
1782 return -EINVAL;
1783 if (!writable)
1784 return -EPERM;
1785
1786 ptep = (void __user *)hva + offset;
1787 if (cpus_have_final_cap(ARM64_HAS_LSUI))
1788 r = __lsui_swap_desc(ptep, old, new);
1789 else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
1790 r = __lse_swap_desc(ptep, old, new);
1791 else
1792 r = __llsc_swap_desc(ptep, old, new);
1793
1794 if (r < 0)
1795 return r;
1796
1797 mark_page_dirty_in_slot(kvm, slot, gfn);
1798 return 0;
1799 }
1800