// SPDX-License-Identifier: GPL-2.0-only

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/uaccess.h>

#include <kvm/arm_vgic.h>

#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "vgic.h"

#define ICH_LRN(n)	(ICH_LR0_EL2 + (n))
#define ICH_AP0RN(n)	(ICH_AP0R0_EL2 + (n))
#define ICH_AP1RN(n)	(ICH_AP1R0_EL2 + (n))

struct mi_state {
	u16	eisr;
	u16	elrsr;
	bool	pend;
};

/*
 * The shadow registers loaded to the hardware when running a L2 guest
 * with the virtual IMO/FMO bits set.
 */
struct shadow_if {
	struct vgic_v3_cpu_if	cpuif;
	unsigned long		lr_map;
};

static DEFINE_PER_CPU(struct shadow_if, shadow_if);

/*
 * Nesting GICv3 support
 *
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
 * completely controls the interrupts injected via the list registers.
 * Consequently, most of the state that is modified by the guest (by ACK-ing
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
 * keep a semi-consistent view of the interrupts.
 *
 * This still applies for a NV guest, but only while "InHost" (either
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE} == {1,1}).
 *
 * When running a L2 guest ("not InHost"), things are radically different,
 * as the L1 guest is in charge of provisioning the interrupts via its own
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
 * page.  This means that the flow described above does work (there is no
 * state to rebuild in the L0 hypervisor), and that most things happen on L2
 * load/put:
 *
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
 *   per-CPU data structure that is used to populate the actual LRs. This is
 *   an extra copy that we could avoid, but life is short. In the process,
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
 *   on the host, should the host consider it a HW one. This allows the HW
 *   deactivation to take its course, such as for the timer.
 *
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
 * - there is nothing to do on L2 entry, as everything will have happened
 *   on load. However, this is the point where we detect an interrupt
 *   targeting L1 and prepare the grand switcheroo.
 *
 * - on L2 exit: emulate the HW bit, and deactivate the corresponding L1
 *   interrupt. The L0 active state will be cleared by the HW if the L1
 *   interrupt was itself backed by a HW interrupt.
 *
 * Maintenance Interrupt (MI) management:
 *
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
 * used as a handover point between L2 and L1.
 *
 * - on delivery of an MI to L0 while L2 is running: make the L1 MI pending,
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
 *   run and process the MI.
 *
 * - the L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
 *   state must be computed at each entry/exit of the guest, much like we do
 *   for the PMU interrupt.
 *
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
 *   quality of emulation is poor: L1 can set up the vgic so that an MI would
 *   immediately fire, and not observe anything until the next exit. Trying
 *   to read ICH_MISR_EL2 would do the trick, for example.
 *
 * System register emulation:
 *
 * We get two classes of registers:
 *
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
 *   them, and L0 doesn't see a thing.
 *
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
 *   that are built on the fly based on the in-memory state.
 *
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
 * and a NV L2 would either access the VNCR page provided by L1 (memory
 * based registers), or see the access redirected to L1 (registers that
 * trap) thanks to NV being set by L1.
 */

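/*
 * The nested state is only relevant for a NV vcpu that is not in a hyp
 * context and has the virtual IMO/FMO bits set; we do not support setting
 * only one of the two.
 */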
bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
	u64 xmo;

	if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
		xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
		WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
			  "Separate virtual IRQ/FIQ settings not supported\n");

		return !!xmo;
	}

	return false;
}

static struct shadow_if *get_shadow_if(void)
{
	return this_cpu_ptr(&shadow_if);
}

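/*
 * An LR contributes to EISR (and can trigger an EOI maintenance interrupt)
 * when it holds no state, is not HW-backed, and has the EOI bit set.
 */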
static bool lr_triggers_eoi(u64 lr)
{
	return !(lr & (ICH_LR_STATE | ICH_LR_HW)) && (lr & ICH_LR_EOI);
}

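/*
 * Scan the in-memory (VNCR) view of the LRs to derive the EISR/ELRSR bits
 * and whether any interrupt is still pending, which is all we need to
 * synthesise the trapped status registers and the MI level.
 */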
static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
{
	u16 eisr = 0, elrsr = 0;
	bool pend = false;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		if (lr_triggers_eoi(lr))
			eisr |= BIT(i);
		if (!(lr & ICH_LR_STATE))
			elrsr |= BIT(i);
		pend |= (lr & ICH_LR_PENDING_BIT);
	}

	mi_state->eisr	= eisr;
	mi_state->elrsr	= elrsr;
	mi_state->pend	= pend;
}

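/*
 * These accessors back the trapped reads of ICH_EISR_EL2 and ICH_ELRSR_EL2,
 * which are built on the fly from the in-memory LR state.
 */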
u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.eisr;
}

u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;

	vgic_compute_mi_state(vcpu, &mi_state);
	return mi_state.elrsr;
}

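/*
 * Build a view of ICH_MISR_EL2 from the in-memory ICH_HCR_EL2/ICH_VMCR_EL2
 * values and the LR-derived state computed above.
 */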
u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
{
	struct mi_state mi_state;
	u64 reg = 0, hcr, vmcr;

	hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);

	vgic_compute_mi_state(vcpu, &mi_state);

	if (mi_state.eisr)
		reg |= ICH_MISR_EL2_EOI;

	if (hcr & ICH_HCR_EL2_UIE) {
		int used_lrs = kvm_vgic_global_state.nr_lr;

		used_lrs -= hweight16(mi_state.elrsr);
		reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
	}

	if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
		reg |= ICH_MISR_EL2_LRENP;

	if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
		reg |= ICH_MISR_EL2_NP;

	if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0E;

	if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
		reg |= ICH_MISR_EL2_VGrp0D;

	if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1E;

	if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
		reg |= ICH_MISR_EL2_VGrp1D;

	return reg;
}

/*
 * For LRs which have the HW bit set, such as timer interrupts, we modify
 * them to have the host hardware interrupt number instead of the virtual
 * one programmed by the guest hypervisor.
 */
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
				     struct vgic_v3_cpu_if *s_cpu_if)
{
	unsigned long lr_map = 0;
	int index = 0;

	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
		struct vgic_irq *irq;

		if (!(lr & ICH_LR_STATE))
			lr = 0;

		if (!(lr & ICH_LR_HW))
			goto next;

		/* We have the HW bit set, check for validity of pINTID */
		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
		if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) {
			/* There was no real mapping, so nuke the HW bit */
			lr &= ~ICH_LR_HW;
			if (irq)
				vgic_put_irq(vcpu->kvm, irq);
			goto next;
		}

		/* It is illegal to have the EOI bit set with HW */
		lr &= ~ICH_LR_EOI;

		/* Translate the virtual mapping to the real one */
		lr &= ~ICH_LR_PHYS_ID_MASK;
		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);

		vgic_put_irq(vcpu->kvm, irq);

next:
		s_cpu_if->vgic_lr[index] = lr;
		if (lr) {
			lr_map |= BIT(i);
			index++;
		}
	}

	container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
	s_cpu_if->used_lrs = index;
}

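/*
 * On L2 exit, emulate the HW bit for LRs that L1 programmed with it: if the
 * hardware has dropped the LR state, deactivate the corresponding L1
 * interrupt so that L1 sees the effect of L2's EOI.
 */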
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	int i, index = 0;

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
		struct vgic_irq *irq;

		if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
			goto next;

		/*
		 * If we had a HW lr programmed by the guest hypervisor, we
		 * need to emulate the HW effect between the guest hypervisor
		 * and the nested guest.
		 */
		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
		if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
			goto next;

		lr = __gic_v3_get_lr(index);
		if (!(lr & ICH_LR_STATE))
			irq->active = false;

		vgic_put_irq(vcpu->kvm, irq);
	next:
		index++;
	}
}

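/*
 * Populate the shadow CPU interface from the VNCR-backed L1 registers,
 * merging in any trap bits the host requires for broken GIC implementations,
 * before building the shadow LRs.
 */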
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
					struct vgic_v3_cpu_if *s_cpu_if)
{
	struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
	u64 val = 0;
	int i;

	/*
	 * If we're on a system with a broken vgic that requires
	 * trapping, propagate the trapping requirements.
	 *
	 * Ah, the smell of rotten fruits...
	 */
	if (static_branch_unlikely(&vgic_v3_cpuif_trap))
		val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
					   ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);

	s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
	s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
	s_cpu_if->vgic_sre = host_if->vgic_sre;

	for (i = 0; i < 4; i++) {
		s_cpu_if->vgic_ap0r[i] = __vcpu_sys_reg(vcpu, ICH_AP0RN(i));
		s_cpu_if->vgic_ap1r[i] = __vcpu_sys_reg(vcpu, ICH_AP1RN(i));
	}

	vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
}

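/*
 * On L2 load: build the shadow state from L1's in-memory view and hand it
 * to the hardware, so that L2 runs with the interrupts L1 provisioned.
 */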
void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;

	BUG_ON(!vgic_state_is_nested(vcpu));

	vgic_v3_create_shadow_state(vcpu, cpu_if);

	__vgic_v3_restore_vmcr_aprs(cpu_if);
	__vgic_v3_activate_traps(cpu_if);

	__vgic_v3_restore_state(cpu_if);

	/*
	 * Propagate the number of used LRs for the benefit of the HYP
	 * GICv3 emulation code. Yes, this is a pretty sorry hack.
	 */
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
}

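/*
 * On L2 put: save the hardware state back into the shadow copy and fold it
 * into the VNCR-backed registers, so that the result of running L2 becomes
 * visible to L1.
 */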
void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
	struct shadow_if *shadow_if = get_shadow_if();
	struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
	u64 val;
	int i, index = 0;

	__vgic_v3_save_vmcr_aprs(s_cpu_if);
	__vgic_v3_deactivate_traps(s_cpu_if);
	__vgic_v3_save_state(s_cpu_if);

	/*
	 * Translate the shadow state HW fields back to the virtual ones
	 * before copying the shadow struct back to the nested one.
	 */
	val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
	val &= ~ICH_HCR_EL2_EOIcount_MASK;
	val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
	__vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val;
	__vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr;

	for (i = 0; i < 4; i++) {
		__vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i];
		__vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i];
	}

	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
		val = __vcpu_sys_reg(vcpu, ICH_LRN(i));

		val &= ~ICH_LR_STATE;
		val |= s_cpu_if->vgic_lr[index] & ICH_LR_STATE;

		__vcpu_sys_reg(vcpu, ICH_LRN(i)) = val;
		s_cpu_if->vgic_lr[index++] = 0;
	}

	shadow_if->lr_map = 0;
	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

/*
 * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
 * then we need to forward this to L1 so that it can re-sync the appropriate
 * LRs and sample level triggered interrupts again.
 */
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
{
	bool state = read_sysreg_s(SYS_ICH_MISR_EL2);

	/* This will force a switch back to L1 if the level is high */
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, state, vcpu);

	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
}

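/*
 * Recompute the level of the fully virtual L1 maintenance interrupt: it is
 * high when L1 has enabled its vgic and any MISR condition is met.
 */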
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
	bool level;

	level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En;
	if (level)
		level = vgic_v3_get_misr(vcpu);
	kvm_vgic_inject_irq(vcpu->kvm, vcpu,
			    vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}