// SPDX-License-Identifier: GPL-2.0-only

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/uaccess.h>

#include <kvm/arm_vgic.h>

#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "vgic.h"

#define ICH_LRN(n)	(ICH_LR0_EL2 + (n))
#define ICH_AP0RN(n)	(ICH_AP0R0_EL2 + (n))
#define ICH_AP1RN(n)	(ICH_AP1R0_EL2 + (n))

struct mi_state {
	u16	eisr;
	u16	elrsr;
	bool	pend;
};

/*
 * The shadow registers loaded to the hardware when running a L2 guest
 * with the virtual IMO/FMO bits set.
 */
struct shadow_if {
	struct vgic_v3_cpu_if	cpuif;
	unsigned long		lr_map;
};

static DEFINE_PER_CPU(struct shadow_if, shadow_if);

/* Shadow index of LR @idx: number of occupied shadow LRs below it */
static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
{
	return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
}

/*
 * Nesting GICv3 support
 *
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
 * completely controls the interrupts injected via the list registers.
 * Consequently, most of the state that is modified by the guest (by ACK-ing
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
 * keep a semi-consistent view of the interrupts.
 *
 * This still applies for a NV guest, but only while "InHost" (either
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
 *
 * When running a L2 guest ("not InHost"), things are radically different,
 * as the L1 guest is in charge of provisioning the interrupts via its own
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
 * page. This means that the flow described above doesn't apply (there is
 * no state to rebuild in the L0 hypervisor), and that most things happen
 * on L2 load/put:
 *
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
 *   per-CPU data structure that is used to populate the actual LRs. This is
 *   an extra copy that we could avoid, but life is short. In the process,
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
 *   on the host, should the host consider it a HW one. This allows the HW
 *   deactivation to take its course, such as for the timer.
 *
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
 * - there is nothing to do on L2 entry, as everything will have happened
 *   on load. However, this is the point where we detect an interrupt
 *   targeting L1 and prepare the grand switcheroo.
 *
 * - on L2 exit: emulate the HW bit, and deactivate the corresponding L1
 *   interrupt. The L0 active state will be cleared by the HW if the L1
 *   interrupt was itself backed by a HW interrupt.
 *
 * Maintenance Interrupt (MI) management:
 *
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
 * used as a handover point between L2 and L1.
 *
 * - on delivery of an MI to L0 while L2 is running: make the L1 MI pending,
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
 *   run and process the MI.
 *
 * - the L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
 *   state must be computed at each entry/exit of the guest, much like we do
 *   for the PMU interrupt.
 *
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
 *   quality of emulation is poor: L1 can set up the vgic so that an MI would
 *   immediately fire, and not observe anything until the next exit. Trying
 *   to read ICH_MISR_EL2 would do the trick, for example.
 *
 * System register emulation:
 *
 * We get two classes of registers:
 *
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
 *   them, and L0 doesn't see a thing.
 *
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
 *   that are built on the fly based on the in-memory state.
 *
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
 * and a NV L2 would either access the VNCR page provided by L1 (memory
 * based registers), or see the access redirected to L1 (registers that
 * trap) thanks to NV being set by L1.
 */

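/*
 * Check whether the vcpu is in a L2 context ("not InHost") with the
 * virtual IMO/FMO bits set, i.e. whether the L1-provided vgic state must
 * be used instead of the L0 one. Splitting the virtual IRQ and FIQ
 * routing is not supported.
 */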
bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
	u64 xmo;

	if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
		xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
		WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
			  "Separate virtual IRQ/FIQ settings not supported\n");

		return !!xmo;
	}

	return false;
}

static struct shadow_if *get_shadow_if(void)
{
	return this_cpu_ptr(&shadow_if);
}

static bool lr_triggers_eoi(u64 lr)
{
	return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
}

/* Derive the EISR/ELRSR/pending state from the in-memory LRs */
static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
{
	u16 eisr = 0, elrsr = 0;
	bool pend = false;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (lr_triggers_eoi(lr))
			eisr |= BIT(i);
		if (!(lr & ICH_LR_STATE))
			elrsr |= BIT(i);
		pend |= (lr & ICH_LR_PENDING_BIT);
	}

	mi_state->eisr = eisr;
	mi_state->elrsr = elrsr;
	mi_state->pend = pend;
}

u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.eisr;
}

u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.elrsr;
}

u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;
	u64 reg = 0, hcr, vmcr;

	hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);

	vgic_compute_mi_state(vcpu, &mi_state);

	if (mi_state.eisr)
		reg |= ICH_MISR_EL2_EOI;

	if (hcr & ICH_HCR_EL2_UIE) {
		int used_lrs = kvm_vgic_global_state.nr_lr;

		used_lrs -= hweight16(mi_state.elrsr);
		reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
	}

	if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
		reg |= ICH_MISR_EL2_LRENP;

	if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
		reg |= ICH_MISR_EL2_NP;

	if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0E;

	if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0D;

	if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1E;

	if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1D;

	return reg;
}

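/*
 * Swap the virtual pINTID programmed by the guest hypervisor for the
 * physical interrupt backing it, dropping the HW bit if no valid
 * host-side mapping exists.
 */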
static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
{
	struct vgic_irq *irq;

	if (!(lr & ICH_LR_HW))
		return lr;

	/* We have the HW bit set, check for validity of pINTID */
	irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
	/* If there was no real mapping, nuke the HW bit */
	if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
		lr &= ~ICH_LR_HW;

	/* Translate the virtual mapping to the real one, even if invalid */
	if (irq) {
		lr &= ~ICH_LR_PHYS_ID_MASK;
		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
		vgic_put_irq(vcpu->kvm, irq);
	}

	return lr;
}

/*
 * For LRs which have the HW bit set, such as timer interrupts, we modify
 * them to have the host hardware interrupt number instead of the virtual
 * one programmed by the guest hypervisor.
 */
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
				     struct vgic_v3_cpu_if *s_cpu_if)
{
	struct shadow_if *shadow_if;

	shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
	shadow_if->lr_map = 0;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (!(lr & ICH_LR_STATE))
			continue;

		lr = translate_lr_pintid(vcpu, lr);

		s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
		shadow_if->lr_map |= BIT(i);
	}

	s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}

/*
 * On L2 exit, emulate the HW bit: if a HW LR has been fully consumed by
 * the nested guest, deactivate the L1 interrupt backing it.
 */
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	int i;

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
		struct vgic_irq *irq;

		if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
			continue;

		/*
		 * If we had a HW lr programmed by the guest hypervisor, we
		 * need to emulate the HW effect between the guest hypervisor
		 * and the nested guest.
		 */
		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
		if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
			continue;

		lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
		if (!(lr & ICH_LR_STATE))
			irq->active = false;

		vgic_put_irq(vcpu->kvm, irq);
	}
}

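/*
 * Build the shadow CPU interface from the L1 in-memory state (HCR, VMCR,
 * APRs and LRs), adding whatever trap bits the host itself requires.
 */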
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
					struct vgic_v3_cpu_if *s_cpu_if)
{
	struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
	u64 val = 0;
	int i;

	/*
	 * If we're on a system with a broken vgic that requires
	 * trapping, propagate the trapping requirements.
	 *
	 * Ah, the smell of rotten fruits...
	 */
	if (static_branch_unlikely(&vgic_v3_cpuif_trap))
		val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
					   ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);
	s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
	s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
	s_cpu_if->vgic_sre = host_if->vgic_sre;

	for (i = 0; i < 4; i++) {
		s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
		s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
	}

	vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
}

void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;

	BUG_ON(!vgic_state_is_nested(vcpu));

	vgic_v3_create_shadow_state(vcpu, cpu_if);

	__vgic_v3_restore_vmcr_aprs(cpu_if);
	__vgic_v3_activate_traps(cpu_if);

	__vgic_v3_restore_state(cpu_if);

	/*
	 * Propagate the number of used LRs for the benefit of the HYP
	 * GICv3 emulation code. Yes, this is a pretty sorry hack.
	 */
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
}

void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
	u64 val;
	int i;

	__vgic_v3_save_vmcr_aprs(s_cpu_if);
	__vgic_v3_deactivate_traps(s_cpu_if);
	__vgic_v3_save_state(s_cpu_if);

	/*
	 * Translate the shadow state HW fields back to the virtual ones
	 * before copying the shadow struct back to the nested one.
	 */
	val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	val &= ~ICH_HCR_EL2_EOIcount_MASK;
	val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
	__vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val);
	__vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr);

	for (i = 0; i < 4; i++) {
		__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
		__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
	}

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		val = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		val &= ~ICH_LR_STATE;
		val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;

		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
	}

	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

/*
 * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
 * then we need to forward this to L1 so that it can re-sync the appropriate
 * LRs and sample level triggered interrupts again.
 */
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
{
	bool state = read_sysreg_s(SYS_ICH_MISR_EL2);

	/* This will force a switch back to L1 if the level is high */
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, state, vcpu);

	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
}

void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
	bool level;

	level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En;
	if (level)
		level &= vgic_v3_get_misr(vcpu);
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}