xref: /src/sys/arm64/vmm/vmm_arm64.c (revision a8c3933840448eaf04ecfe162c0d05caf11090a4)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/smp.h>
33 #include <sys/kernel.h>
34 #include <sys/malloc.h>
35 #include <sys/mman.h>
36 #include <sys/pcpu.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/vmem.h>
42 
43 #include <vm/vm.h>
44 #include <vm/pmap.h>
45 #include <vm/vm_extern.h>
46 #include <vm/vm_map.h>
47 #include <vm/vm_page.h>
48 #include <vm/vm_param.h>
49 
50 #include <machine/vm.h>
51 #include <machine/cpufunc.h>
52 #include <machine/cpu.h>
53 #include <machine/machdep.h>
54 #include <machine/vmm.h>
55 #include <machine/atomic.h>
56 #include <machine/hypervisor.h>
57 #include <machine/pmap.h>
58 
59 #include <dev/vmm/vmm_mem.h>
60 #include <dev/vmm/vmm_vm.h>
61 
62 #include "mmu.h"
63 #include "arm64.h"
64 #include "hyp.h"
65 #include "reset.h"
66 #include "io/vgic.h"
67 #include "io/vgic_v3.h"
68 #include "io/vtimer.h"
69 #include "vmm_handlers.h"
70 #include "vmm_stat.h"
71 
72 #define	HANDLED		1
73 #define	UNHANDLED	0
74 
75 /* Number of bits in an EL2 virtual address */
76 #define	EL2_VIRT_BITS	48
77 CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS);
78 
79 /* TODO: Move the host hypctx off the stack */
80 #define	VMM_STACK_PAGES	4
81 #define	VMM_STACK_SIZE	(VMM_STACK_PAGES * PAGE_SIZE)
82 
83 static int vmm_pmap_levels, vmm_virt_bits, vmm_max_ipa_bits;
84 
/*
 * Register values passed to arm_setup_vectors to set in the hypervisor.
 * They are computed once by vmmops_modinit() and handed to
 * arm_setup_vectors() through smp_rendezvous().
 */
struct vmm_init_regs {
	/* TCR_EL2 value; only filled in and consumed when not in VHE mode */
	uint64_t tcr_el2;
	/* VTCR_EL2 (stage 2 translation control) value */
	uint64_t vtcr_el2;
};
90 
91 MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP");
92 
93 extern char hyp_init_vectors[];
94 extern char hyp_vectors[];
95 extern char hyp_stub_vectors[];
96 
97 static vm_paddr_t hyp_code_base;
98 static size_t hyp_code_len;
99 
100 static char *stack[MAXCPU];
101 static vm_offset_t stack_hyp_va[MAXCPU];
102 
103 static vmem_t *el2_mem_alloc;
104 
105 static void arm_setup_vectors(void *arg);
106 
107 DPCPU_DEFINE_STATIC(struct hypctx *, vcpu);
108 
109 static inline void
arm64_set_active_vcpu(struct hypctx * hypctx)110 arm64_set_active_vcpu(struct hypctx *hypctx)
111 {
112 	DPCPU_SET(vcpu, hypctx);
113 }
114 
115 struct hypctx *
arm64_get_active_vcpu(void)116 arm64_get_active_vcpu(void)
117 {
118 	return (DPCPU_GET(vcpu));
119 }
120 
121 static void
arm_setup_vectors(void * arg)122 arm_setup_vectors(void *arg)
123 {
124 	struct vmm_init_regs *el2_regs;
125 	uintptr_t stack_top;
126 	uint32_t sctlr_el2;
127 	register_t daif;
128 
129 	el2_regs = arg;
130 	arm64_set_active_vcpu(NULL);
131 
132 	/*
133 	 * Configure the system control register for EL2:
134 	 *
135 	 * SCTLR_EL2_M: MMU on
136 	 * SCTLR_EL2_C: Data cacheability not affected
137 	 * SCTLR_EL2_I: Instruction cacheability not affected
138 	 * SCTLR_EL2_A: Instruction alignment check
139 	 * SCTLR_EL2_SA: Stack pointer alignment check
140 	 * SCTLR_EL2_WXN: Treat writable memory as execute never
141 	 * ~SCTLR_EL2_EE: Data accesses are little-endian
142 	 */
143 	sctlr_el2 = SCTLR_EL2_RES1;
144 	sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I;
145 	sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA;
146 	sctlr_el2 |= SCTLR_EL2_WXN;
147 	sctlr_el2 &= ~SCTLR_EL2_EE;
148 
149 	daif = intr_disable();
150 
151 	if (in_vhe()) {
152 		WRITE_SPECIALREG(vtcr_el2, el2_regs->vtcr_el2);
153 	} else {
154 		/*
155 		 * Install the temporary vectors which will be responsible for
156 		 * initializing the VMM when we next trap into EL2.
157 		 *
158 		 * x0: the exception vector table responsible for hypervisor
159 		 * initialization on the next call.
160 		 */
161 		vmm_call_hyp(vtophys(&vmm_hyp_code));
162 
163 		/* Create and map the hypervisor stack */
164 		stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE;
165 
166 		/* Special call to initialize EL2 */
167 		vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2,
168 		    sctlr_el2, el2_regs->vtcr_el2);
169 	}
170 
171 	intr_restore(daif);
172 }
173 
174 static void
arm_teardown_vectors(void * arg)175 arm_teardown_vectors(void *arg)
176 {
177 	register_t daif;
178 
179 	/*
180 	 * vmm_cleanup() will disable the MMU. For the next few instructions,
181 	 * before the hardware disables the MMU, one of the following is
182 	 * possible:
183 	 *
184 	 * a. The instruction addresses are fetched with the MMU disabled,
185 	 * and they must represent the actual physical addresses. This will work
186 	 * because we call the vmm_cleanup() function by its physical address.
187 	 *
188 	 * b. The instruction addresses are fetched using the old translation
189 	 * tables. This will work because we have an identity mapping in place
190 	 * in the translation tables and vmm_cleanup() is called by its physical
191 	 * address.
192 	 */
193 	daif = intr_disable();
194 	/* TODO: Invalidate the cache */
195 	vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors));
196 	intr_restore(daif);
197 
198 	arm64_set_active_vcpu(NULL);
199 }
200 
201 static uint64_t
vmm_vtcr_el2_sl(u_int levels)202 vmm_vtcr_el2_sl(u_int levels)
203 {
204 #if PAGE_SIZE == PAGE_SIZE_4K
205 	switch (levels) {
206 	case 2:
207 		return (VTCR_EL2_SL0_4K_LVL2);
208 	case 3:
209 		return (VTCR_EL2_SL0_4K_LVL1);
210 	case 4:
211 		return (VTCR_EL2_SL0_4K_LVL0);
212 	default:
213 		panic("%s: Invalid number of page table levels %u", __func__,
214 		    levels);
215 	}
216 #elif PAGE_SIZE == PAGE_SIZE_16K
217 	switch (levels) {
218 	case 2:
219 		return (VTCR_EL2_SL0_16K_LVL2);
220 	case 3:
221 		return (VTCR_EL2_SL0_16K_LVL1);
222 	case 4:
223 		return (VTCR_EL2_SL0_16K_LVL0);
224 	default:
225 		panic("%s: Invalid number of page table levels %u", __func__,
226 		    levels);
227 	}
228 #else
229 #error Unsupported page size
230 #endif
231 }
232 
/*
 * Module-load initialisation for the arm64 VMM backend.
 *
 * Checks that the CPU has hypervisor support and that a vgic is present,
 * chooses the stage 2 page table geometry from the reported physical
 * address range, and installs the stage 2 pmap TLB callbacks. When not
 * running in VHE mode it additionally builds the EL2 address space: an
 * identity mapping of the hypervisor code, one stack per CPU, and a vmem
 * arena for later EL2 allocations. The TCR_EL2/VTCR_EL2 values are then
 * handed to arm_setup_vectors() via smp_rendezvous(), and finally the vgic
 * and vtimer are initialised.
 *
 * The ipinum argument is not used in this function.
 *
 * Returns 0 on success, ENXIO if the CPU lacks hypervisor support or has
 * too few physical address bits, ENODEV if no vgic is found, or ENOMEM if
 * the EL2 MMU could not be initialised.
 */
int
vmmops_modinit(int ipinum)
{
	struct vmm_init_regs el2_regs;
	vm_offset_t next_hyp_va;
	vm_paddr_t vmm_base;
	uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field;
	int cpu, i;
	bool rv __diagused;

	if (!has_hyp()) {
		printf(
		    "vmm: Processor doesn't have support for virtualization\n");
		return (ENXIO);
	}

	if (!vgic_present()) {
		printf("vmm: No vgic found\n");
		return (ENODEV);
	}

	get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1);
	pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1);
	/*
	 * Use 3 levels to give us up to 39 bits with 4k pages, or
	 * 47 bits with 16k pages.
	 */
	/* TODO: Check the number of levels for 64k pages */
	vmm_pmap_levels = 3;
	switch (pa_range_field) {
	case ID_AA64MMFR0_PARange_4G:
		printf("vmm: Not enough physical address bits\n");
		return (ENXIO);
	case ID_AA64MMFR0_PARange_64G:
		vmm_virt_bits = 36;
#if PAGE_SIZE == PAGE_SIZE_16K
		vmm_pmap_levels = 2;
#endif
		break;
	default:
		vmm_virt_bits = 39;
		break;
	}
	/* Raw PARange encoding, reused below for the TCR/VTCR PS fields */
	pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT;

	if (!in_vhe()) {
		/* Initialise the EL2 MMU */
		if (!vmmpmap_init()) {
			printf("vmm: Failed to init the EL2 MMU\n");
			return (ENOMEM);
		}
	}

	/* Set up the stage 2 pmap callbacks */
	MPASS(pmap_clean_stage2_tlbi == NULL);
	pmap_clean_stage2_tlbi = vmm_clean_s2_tlbi;
	pmap_stage2_invalidate_range = vmm_s2_tlbi_range;
	pmap_stage2_invalidate_all = vmm_s2_tlbi_all;

	if (!in_vhe()) {
		/*
		 * Create an allocator for the virtual address space used by
		 * EL2. EL2 code is identity-mapped; the allocator is used to
		 * find space for VM structures.
		 */
		el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0,
		    M_WAITOK);

		/* Create the mappings for the hypervisor translation table. */
		hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code);

		/* We need a physical identity mapping for when we activate the MMU */
		hyp_code_base = vmm_base = vtophys(&vmm_hyp_code);
		rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base,
		    VM_PROT_READ | VM_PROT_EXECUTE);
		MPASS(rv);

		/* Stacks start at the next L2 boundary after the code */
		next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE);

		/* Create a per-CPU hypervisor stack */
		CPU_FOREACH(cpu) {
			stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO);
			stack_hyp_va[cpu] = next_hyp_va;

			/* Map the stack one page at a time */
			for (i = 0; i < VMM_STACK_PAGES; i++) {
				rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i),
				    PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)),
				    VM_PROT_READ | VM_PROT_WRITE);
				MPASS(rv);
			}
			/* Each stack gets its own L2_SIZE slot of EL2 VA */
			next_hyp_va += L2_SIZE;
		}

		/* Build TCR_EL2; it is only needed when not in VHE mode */
		el2_regs.tcr_el2 = TCR_EL2_RES1;
		el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT,
		    TCR_EL2_PS_52BITS);
		el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS);
		el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA;
#if PAGE_SIZE == PAGE_SIZE_4K
		el2_regs.tcr_el2 |= TCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
		el2_regs.tcr_el2 |= TCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
		el2_regs.tcr_el2 |= TCR_EL2_SH0_IS;
#endif
	}

	/*
	 * Record the largest IPA width; handle_el1_sync_excp() uses it to
	 * reject guest physical addresses that are out of range.
	 */
	switch (pa_range_bits << TCR_EL2_PS_SHIFT) {
	case TCR_EL2_PS_32BITS:
		vmm_max_ipa_bits = 32;
		break;
	case TCR_EL2_PS_36BITS:
		vmm_max_ipa_bits = 36;
		break;
	case TCR_EL2_PS_40BITS:
		vmm_max_ipa_bits = 40;
		break;
	case TCR_EL2_PS_42BITS:
		vmm_max_ipa_bits = 42;
		break;
	case TCR_EL2_PS_44BITS:
		vmm_max_ipa_bits = 44;
		break;
	case TCR_EL2_PS_48BITS:
		vmm_max_ipa_bits = 48;
		break;
	case TCR_EL2_PS_52BITS:
	default:
		vmm_max_ipa_bits = 52;
		break;
	}

	/*
	 * Configure the Stage 2 translation control register:
	 *
	 * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable
	 * normal memory
	 * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable
	 * normal memory
	 * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel
	 * VTCR_EL2_SL0_*: Starting level chosen by vmm_vtcr_el2_sl() from
	 * vmm_pmap_levels
	 * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner
	 * shareable
	 */
	el2_regs.vtcr_el2 = VTCR_EL2_RES1;
	el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA;
	el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - vmm_virt_bits);
	el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels);
#if PAGE_SIZE == PAGE_SIZE_4K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
	el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS;
#endif
	/*
	 * If FEAT_LPA2 is enabled in the host then we need to enable it here
	 * so the page tables created by pmap.c are correct. The meaning of
	 * the shareability field changes to become address bits when this
	 * is set.
	 */
	if ((READ_SPECIALREG(tcr_el1) & TCR_DS) != 0) {
		el2_regs.vtcr_el2 |= VTCR_EL2_DS;
		el2_regs.vtcr_el2 |=
		    min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_52BIT);
	} else {
		el2_regs.vtcr_el2 |=
		    min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT);
	}

	/*
	 * Program the hypervisor with arm_setup_vectors(). el2_regs lives on
	 * this stack frame and is passed by pointer.
	 */
	smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs);

	if (!in_vhe()) {
		/* Add memory to the vmem allocator (checking there is space) */
		if (vmm_base > (L2_SIZE + PAGE_SIZE)) {
			/*
			 * Ensure there is an L2 block before the vmm code to check
			 * for buffer overflows on earlier data. Include the PAGE_SIZE
			 * of the minimum we can allocate.
			 */
			vmm_base -= L2_SIZE + PAGE_SIZE;
			vmm_base = rounddown2(vmm_base, L2_SIZE);

			/*
			 * Check there is memory before the vmm code to add.
			 *
			 * Reserve the L2 block at address 0 so NULL dereference will
			 * raise an exception.
			 */
			if (vmm_base > L2_SIZE)
				vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE,
				    M_WAITOK);
		}

		/*
		 * Add the memory after the stacks. There is most of an L2 block
		 * between the last stack and the first allocation so this should
		 * be safe without adding more padding.
		 */
		if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE)
			vmem_add(el2_mem_alloc, next_hyp_va,
			    HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK);
	}

	vgic_init();
	vtimer_init();

	return (0);
}
448 
449 int
vmmops_modcleanup(void)450 vmmops_modcleanup(void)
451 {
452 	int cpu;
453 
454 	if (!in_vhe()) {
455 		smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL);
456 
457 		CPU_FOREACH(cpu) {
458 			vmmpmap_remove(stack_hyp_va[cpu],
459 			    VMM_STACK_PAGES * PAGE_SIZE, false);
460 		}
461 
462 		vmmpmap_remove(hyp_code_base, hyp_code_len, false);
463 	}
464 
465 	vtimer_cleanup();
466 
467 	if (!in_vhe()) {
468 		vmmpmap_fini();
469 
470 		CPU_FOREACH(cpu)
471 			free(stack[cpu], M_HYP);
472 	}
473 
474 	pmap_clean_stage2_tlbi = NULL;
475 	pmap_stage2_invalidate_range = NULL;
476 	pmap_stage2_invalidate_all = NULL;
477 
478 	return (0);
479 }
480 
481 static vm_size_t
el2_hyp_size(struct vm * vm)482 el2_hyp_size(struct vm *vm)
483 {
484 	return (round_page(sizeof(struct hyp) +
485 	    sizeof(struct hypctx *) * vm_get_maxcpus(vm)));
486 }
487 
488 static vm_size_t
el2_hypctx_size(void)489 el2_hypctx_size(void)
490 {
491 	return (round_page(sizeof(struct hypctx)));
492 }
493 
494 static vm_offset_t
el2_map_enter(vm_offset_t data,vm_size_t size,vm_prot_t prot)495 el2_map_enter(vm_offset_t data, vm_size_t size, vm_prot_t prot)
496 {
497 	vmem_addr_t addr;
498 	int err __diagused;
499 	bool rv __diagused;
500 
501 	err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, &addr);
502 	MPASS(err == 0);
503 	rv = vmmpmap_enter(addr, size, vtophys(data), prot);
504 	MPASS(rv);
505 
506 	return (addr);
507 }
508 
509 void *
vmmops_init(struct vm * vm,pmap_t pmap)510 vmmops_init(struct vm *vm, pmap_t pmap)
511 {
512 	struct hyp *hyp;
513 	vm_size_t size;
514 	uint64_t idreg;
515 
516 	size = el2_hyp_size(vm);
517 	hyp = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
518 
519 	hyp->vm = vm;
520 	hyp->vgic_attached = false;
521 
522 	get_kernel_reg(ID_AA64MMFR0_EL1, &idreg);
523 	if (ID_AA64MMFR0_ECV_VAL(idreg) >= ID_AA64MMFR0_ECV_POFF)
524 		hyp->feats |= HYP_FEAT_ECV_POFF;
525 
526 	switch (ID_AA64MMFR0_FGT_VAL(idreg)) {
527 	case ID_AA64MMFR0_FGT_NONE:
528 		break;
529 	default:
530 	case ID_AA64MMFR0_FGT_8_9:
531 		hyp->feats |= HYP_FEAT_FGT2;
532 		/* FALLTHROUGH */
533 	case ID_AA64MMFR0_FGT_8_6:
534 		hyp->feats |= HYP_FEAT_FGT;
535 		break;
536 	}
537 
538 	get_kernel_reg(ID_AA64MMFR1_EL1, &idreg);
539 	if (ID_AA64MMFR1_HCX_VAL(idreg) >= ID_AA64MMFR1_HCX_IMPL)
540 		hyp->feats |= HYP_FEAT_HCX;
541 
542 	vtimer_vminit(hyp);
543 	vgic_vminit(hyp);
544 
545 	if (!in_vhe())
546 		hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size,
547 		    VM_PROT_READ | VM_PROT_WRITE);
548 
549 	return (hyp);
550 }
551 
552 void *
vmmops_vcpu_init(void * vmi,struct vcpu * vcpu1,int vcpuid)553 vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid)
554 {
555 	struct hyp *hyp = vmi;
556 	struct hypctx *hypctx;
557 	vm_size_t size;
558 
559 	size = el2_hypctx_size();
560 	hypctx = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
561 
562 	KASSERT(vcpuid >= 0 && vcpuid < vm_get_maxcpus(hyp->vm),
563 	    ("%s: Invalid vcpuid %d", __func__, vcpuid));
564 	hyp->ctx[vcpuid] = hypctx;
565 
566 	hypctx->hyp = hyp;
567 	hypctx->vcpu = vcpu1;
568 
569 	reset_vm_el01_regs(hypctx);
570 	reset_vm_el2_regs(hypctx);
571 
572 	vtimer_cpuinit(hypctx);
573 	vgic_cpuinit(hypctx);
574 
575 	if (!in_vhe())
576 		hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size,
577 		    VM_PROT_READ | VM_PROT_WRITE);
578 
579 	return (hypctx);
580 }
581 
582 static int
arm_vmm_pinit(pmap_t pmap)583 arm_vmm_pinit(pmap_t pmap)
584 {
585 
586 	pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels);
587 	return (1);
588 }
589 
590 struct vmspace *
vmmops_vmspace_alloc(vm_offset_t min,vm_offset_t max)591 vmmops_vmspace_alloc(vm_offset_t min, vm_offset_t max)
592 {
593 	return (vmspace_alloc(min, max, arm_vmm_pinit));
594 }
595 
596 void
vmmops_vmspace_free(struct vmspace * vmspace)597 vmmops_vmspace_free(struct vmspace *vmspace)
598 {
599 
600 	pmap_remove_pages(vmspace_pmap(vmspace));
601 	vmspace_free(vmspace);
602 }
603 
604 static inline void
arm64_print_hyp_regs(struct vm_exit * vme)605 arm64_print_hyp_regs(struct vm_exit *vme)
606 {
607 	printf("esr_el2:   0x%016lx\n", vme->u.hyp.esr_el2);
608 	printf("far_el2:   0x%016lx\n", vme->u.hyp.far_el2);
609 	printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2);
610 	printf("elr_el2:   0x%016lx\n", vme->pc);
611 }
612 
613 static void
arm64_gen_inst_emul_data(struct hypctx * hypctx,uint32_t esr_iss,struct vm_exit * vme_ret)614 arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss,
615     struct vm_exit *vme_ret)
616 {
617 	struct vm_guest_paging *paging;
618 	struct vie *vie;
619 	uint32_t esr_sas, reg_num;
620 
621 	/*
622 	 * Get the page address from HPFAR_EL2.
623 	 */
624 	vme_ret->u.inst_emul.gpa =
625 	    HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
626 	/* Bits [11:0] are the same as bits [11:0] from the virtual address. */
627 	vme_ret->u.inst_emul.gpa += hypctx->exit_info.far_el2 &
628 	    FAR_EL2_HPFAR_PAGE_MASK;
629 
630 	esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT;
631 	reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT;
632 
633 	vie = &vme_ret->u.inst_emul.vie;
634 	vie->access_size = 1 << esr_sas;
635 	vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0;
636 	vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ;
637 	vie->reg = reg_num;
638 
639 	paging = &vme_ret->u.inst_emul.paging;
640 	paging->ttbr0_addr = hypctx->ttbr0_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
641 	paging->ttbr1_addr = hypctx->ttbr1_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
642 	paging->tcr_el1 = hypctx->tcr_el1;
643 	paging->tcr2_el1 = hypctx->tcr2_el1;
644 	paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
645 	if ((hypctx->sctlr_el1 & SCTLR_M) != 0)
646 		paging->flags |= VM_GP_MMU_ENABLED;
647 }
648 
649 static void
arm64_gen_reg_emul_data(uint32_t esr_iss,struct vm_exit * vme_ret)650 arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret)
651 {
652 	uint32_t reg_num;
653 	struct vre *vre;
654 
655 	/* u.hyp member will be replaced by u.reg_emul */
656 	vre = &vme_ret->u.reg_emul.vre;
657 
658 	vre->inst_syndrome = esr_iss;
659 	/* ARMv8 Architecture Manual, p. D7-2273: 1 means read */
660 	vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE;
661 	reg_num = ISS_MSR_Rt(esr_iss);
662 	vre->reg = reg_num;
663 }
664 
665 void
raise_data_insn_abort(struct hypctx * hypctx,uint64_t far,bool dabort,int fsc)666 raise_data_insn_abort(struct hypctx *hypctx, uint64_t far, bool dabort, int fsc)
667 {
668 	uint64_t esr;
669 
670 	if ((hypctx->tf.tf_spsr & PSR_M_MASK) == PSR_M_EL0t)
671 		esr = EXCP_INSN_ABORT_L << ESR_ELx_EC_SHIFT;
672 	else
673 		esr = EXCP_INSN_ABORT << ESR_ELx_EC_SHIFT;
674 	/* Set the bit that changes from insn -> data abort */
675 	if (dabort)
676 		esr |= EXCP_DATA_ABORT_L << ESR_ELx_EC_SHIFT;
677 	/* Set the IL bit if set by hardware */
678 	esr |= hypctx->tf.tf_esr & ESR_ELx_IL;
679 
680 	vmmops_exception(hypctx, esr | fsc, far);
681 }
682 
683 static int
handle_el1_sync_excp(struct hypctx * hypctx,struct vm_exit * vme_ret,pmap_t pmap)684 handle_el1_sync_excp(struct hypctx *hypctx, struct vm_exit *vme_ret,
685     pmap_t pmap)
686 {
687 	uint64_t gpa;
688 	uint32_t esr_ec, esr_iss;
689 
690 	esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr);
691 	esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK;
692 
693 	switch (esr_ec) {
694 	case EXCP_UNKNOWN:
695 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNKNOWN, 1);
696 		arm64_print_hyp_regs(vme_ret);
697 		vme_ret->exitcode = VM_EXITCODE_HYP;
698 		break;
699 	case EXCP_TRAP_WFI_WFE:
700 		if ((hypctx->tf.tf_esr & 0x3) == 0) { /* WFI */
701 			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFI, 1);
702 			vme_ret->exitcode = VM_EXITCODE_WFI;
703 		} else {
704 			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFE, 1);
705 			vme_ret->exitcode = VM_EXITCODE_HYP;
706 		}
707 		break;
708 	case EXCP_HVC:
709 		vmm_stat_incr(hypctx->vcpu, VMEXIT_HVC, 1);
710 		vme_ret->exitcode = VM_EXITCODE_HVC;
711 		break;
712 	case EXCP_MSR:
713 		vmm_stat_incr(hypctx->vcpu, VMEXIT_MSR, 1);
714 		arm64_gen_reg_emul_data(esr_iss, vme_ret);
715 		vme_ret->exitcode = VM_EXITCODE_REG_EMUL;
716 		break;
717 	case EXCP_BRK:
718 		vmm_stat_incr(hypctx->vcpu, VMEXIT_BRK, 1);
719 		vme_ret->exitcode = VM_EXITCODE_BRK;
720 		break;
721 	case EXCP_SOFTSTP_EL0:
722 		vmm_stat_incr(hypctx->vcpu, VMEXIT_SS, 1);
723 		vme_ret->exitcode = VM_EXITCODE_SS;
724 		break;
725 	case EXCP_INSN_ABORT_L:
726 	case EXCP_DATA_ABORT_L:
727 		vmm_stat_incr(hypctx->vcpu, esr_ec == EXCP_DATA_ABORT_L ?
728 		    VMEXIT_DATA_ABORT : VMEXIT_INSN_ABORT, 1);
729 		switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) {
730 		case ISS_DATA_DFSC_TF_L0:
731 		case ISS_DATA_DFSC_TF_L1:
732 		case ISS_DATA_DFSC_TF_L2:
733 		case ISS_DATA_DFSC_TF_L3:
734 		case ISS_DATA_DFSC_AFF_L1:
735 		case ISS_DATA_DFSC_AFF_L2:
736 		case ISS_DATA_DFSC_AFF_L3:
737 		case ISS_DATA_DFSC_PF_L1:
738 		case ISS_DATA_DFSC_PF_L2:
739 		case ISS_DATA_DFSC_PF_L3:
740 			gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
741 			/* Check the IPA is valid */
742 			if (gpa >= (1ul << vmm_max_ipa_bits)) {
743 				raise_data_insn_abort(hypctx,
744 				    hypctx->exit_info.far_el2,
745 				    esr_ec == EXCP_DATA_ABORT_L,
746 				    ISS_DATA_DFSC_ASF_L0);
747 				vme_ret->inst_length = 0;
748 				return (HANDLED);
749 			}
750 
751 			if (vm_mem_allocated(hypctx->vcpu, gpa)) {
752 				vme_ret->exitcode = VM_EXITCODE_PAGING;
753 				vme_ret->inst_length = 0;
754 				vme_ret->u.paging.esr = hypctx->tf.tf_esr;
755 				vme_ret->u.paging.gpa = gpa;
756 			} else if (esr_ec == EXCP_INSN_ABORT_L) {
757 				/*
758 				 * Raise an external abort. Device memory is
759 				 * not executable
760 				 */
761 				raise_data_insn_abort(hypctx,
762 				    hypctx->exit_info.far_el2, false,
763 				    ISS_DATA_DFSC_EXT);
764 				vme_ret->inst_length = 0;
765 				return (HANDLED);
766 			} else {
767 				arm64_gen_inst_emul_data(hypctx, esr_iss,
768 				    vme_ret);
769 				vme_ret->exitcode = VM_EXITCODE_INST_EMUL;
770 			}
771 			break;
772 		default:
773 			arm64_print_hyp_regs(vme_ret);
774 			vme_ret->exitcode = VM_EXITCODE_HYP;
775 			break;
776 		}
777 
778 		break;
779 
780 	default:
781 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_SYNC, 1);
782 		arm64_print_hyp_regs(vme_ret);
783 		vme_ret->exitcode = VM_EXITCODE_HYP;
784 		break;
785 	}
786 
787 	/* We don't don't do any instruction emulation here */
788 	return (UNHANDLED);
789 }
790 
791 static int
arm64_handle_world_switch(struct hypctx * hypctx,int excp_type,struct vm_exit * vme,pmap_t pmap)792 arm64_handle_world_switch(struct hypctx *hypctx, int excp_type,
793     struct vm_exit *vme, pmap_t pmap)
794 {
795 	int handled;
796 
797 	switch (excp_type) {
798 	case EXCP_TYPE_EL1_SYNC:
799 		/* The exit code will be set by handle_el1_sync_excp(). */
800 		handled = handle_el1_sync_excp(hypctx, vme, pmap);
801 		break;
802 
803 	case EXCP_TYPE_EL1_IRQ:
804 	case EXCP_TYPE_EL1_FIQ:
805 		/* The host kernel will handle IRQs and FIQs. */
806 		vmm_stat_incr(hypctx->vcpu,
807 		    excp_type == EXCP_TYPE_EL1_IRQ ? VMEXIT_IRQ : VMEXIT_FIQ,1);
808 		vme->exitcode = VM_EXITCODE_BOGUS;
809 		handled = UNHANDLED;
810 		break;
811 
812 	case EXCP_TYPE_EL1_ERROR:
813 	case EXCP_TYPE_EL2_SYNC:
814 	case EXCP_TYPE_EL2_IRQ:
815 	case EXCP_TYPE_EL2_FIQ:
816 	case EXCP_TYPE_EL2_ERROR:
817 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_EL2, 1);
818 		vme->exitcode = VM_EXITCODE_BOGUS;
819 		handled = UNHANDLED;
820 		break;
821 
822 	default:
823 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED, 1);
824 		vme->exitcode = VM_EXITCODE_BOGUS;
825 		handled = UNHANDLED;
826 		break;
827 	}
828 
829 	return (handled);
830 }
831 
/*
 * Release the guest page referenced by *cookie, if there is one, and reset
 * *cookie to NULL so that a second call does nothing.
 */
static void
ptp_release(void **cookie)
{
	if (*cookie == NULL)
		return;

	vm_gpa_release(*cookie);
	*cookie = NULL;
}
840 
841 static void *
ptp_hold(struct vcpu * vcpu,vm_paddr_t ptpphys,size_t len,void ** cookie)842 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
843 {
844 	void *ptr;
845 
846 	ptp_release(cookie);
847 	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
848 	return (ptr);
849 }
850 
851 /* log2 of the number of bytes in a page table entry */
852 #define	PTE_SHIFT	3
853 int
vmmops_gla2gpa(void * vcpui,struct vm_guest_paging * paging,uint64_t gla,int prot,uint64_t * gpa,int * is_fault)854 vmmops_gla2gpa(void *vcpui, struct vm_guest_paging *paging, uint64_t gla,
855     int prot, uint64_t *gpa, int *is_fault)
856 {
857 	struct hypctx *hypctx;
858 	void *cookie;
859 	uint64_t mask, *ptep, pte, pte_addr;
860 	int address_bits, granule_shift, ia_bits, levels, pte_shift, tsz;
861 	bool is_el0;
862 
863 	/* Check if the MMU is off */
864 	if ((paging->flags & VM_GP_MMU_ENABLED) == 0) {
865 		*is_fault = 0;
866 		*gpa = gla;
867 		return (0);
868 	}
869 
870 	is_el0 = (paging->flags & PSR_M_MASK) == PSR_M_EL0t;
871 
872 	if (ADDR_IS_KERNEL(gla)) {
873 		/* If address translation is disabled raise an exception */
874 		if ((paging->tcr_el1 & TCR_EPD1) != 0) {
875 			*is_fault = 1;
876 			return (0);
877 		}
878 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD1) != 0) {
879 			*is_fault = 1;
880 			return (0);
881 		}
882 		pte_addr = paging->ttbr1_addr;
883 		tsz = (paging->tcr_el1 & TCR_T1SZ_MASK) >> TCR_T1SZ_SHIFT;
884 		/* Clear the top byte if TBI is on */
885 		if ((paging->tcr_el1 & TCR_TBI1) != 0)
886 			gla |= (0xfful << 56);
887 		switch (paging->tcr_el1 & TCR_TG1_MASK) {
888 		case TCR_TG1_4K:
889 			granule_shift = PAGE_SHIFT_4K;
890 			break;
891 		case TCR_TG1_16K:
892 			granule_shift = PAGE_SHIFT_16K;
893 			break;
894 		case TCR_TG1_64K:
895 			granule_shift = PAGE_SHIFT_64K;
896 			break;
897 		default:
898 			*is_fault = 1;
899 			return (EINVAL);
900 		}
901 	} else {
902 		/* If address translation is disabled raise an exception */
903 		if ((paging->tcr_el1 & TCR_EPD0) != 0) {
904 			*is_fault = 1;
905 			return (0);
906 		}
907 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD0) != 0) {
908 			*is_fault = 1;
909 			return (0);
910 		}
911 		pte_addr = paging->ttbr0_addr;
912 		tsz = (paging->tcr_el1 & TCR_T0SZ_MASK) >> TCR_T0SZ_SHIFT;
913 		/* Clear the top byte if TBI is on */
914 		if ((paging->tcr_el1 & TCR_TBI0) != 0)
915 			gla &= ~(0xfful << 56);
916 		switch (paging->tcr_el1 & TCR_TG0_MASK) {
917 		case TCR_TG0_4K:
918 			granule_shift = PAGE_SHIFT_4K;
919 			break;
920 		case TCR_TG0_16K:
921 			granule_shift = PAGE_SHIFT_16K;
922 			break;
923 		case TCR_TG0_64K:
924 			granule_shift = PAGE_SHIFT_64K;
925 			break;
926 		default:
927 			*is_fault = 1;
928 			return (EINVAL);
929 		}
930 	}
931 
932 	/*
933 	 * TODO: Support FEAT_TTST for smaller tsz values and FEAT_LPA2
934 	 * for larger values.
935 	 */
936 	switch (granule_shift) {
937 	case PAGE_SHIFT_4K:
938 	case PAGE_SHIFT_16K:
939 		/*
940 		 * See "Table D8-11 4KB granule, determining stage 1 initial
941 		 * lookup level" and "Table D8-21 16KB granule, determining
942 		 * stage 1 initial lookup level" from the "Arm Architecture
943 		 * Reference Manual for A-Profile architecture" revision I.a
944 		 * for the minimum and maximum values.
945 		 *
946 		 * TODO: Support less than 16 when FEAT_LPA2 is implemented
947 		 * and TCR_EL1.DS == 1
948 		 * TODO: Support more than 39 when FEAT_TTST is implemented
949 		 */
950 		if (tsz < 16 || tsz > 39) {
951 			*is_fault = 1;
952 			return (EINVAL);
953 		}
954 		break;
955 	case PAGE_SHIFT_64K:
956 	/* TODO: Support 64k granule. It will probably work, but is untested */
957 	default:
958 		*is_fault = 1;
959 		return (EINVAL);
960 	}
961 
962 	/*
963 	 * Calculate the input address bits. These are 64 bit in an address
964 	 * with the top tsz bits being all 0 or all 1.
965 	  */
966 	ia_bits = 64 - tsz;
967 
968 	/*
969 	 * Calculate the number of address bits used in the page table
970 	 * calculation. This is ia_bits minus the bottom granule_shift
971 	 * bits that are passed to the output address.
972 	 */
973 	address_bits = ia_bits - granule_shift;
974 
975 	/*
976 	 * Calculate the number of levels. Each level uses
977 	 * granule_shift - PTE_SHIFT bits of the input address.
978 	 * This is because the table is 1 << granule_shift and each
979 	 * entry is 1 << PTE_SHIFT bytes.
980 	 */
981 	levels = howmany(address_bits, granule_shift - PTE_SHIFT);
982 
983 	/* Mask of the upper unused bits in the virtual address */
984 	gla &= (1ul << ia_bits) - 1;
985 	hypctx = (struct hypctx *)vcpui;
986 	cookie = NULL;
987 	/* TODO: Check if the level supports block descriptors */
988 	for (;levels > 0; levels--) {
989 		int idx;
990 
991 		pte_shift = (levels - 1) * (granule_shift - PTE_SHIFT) +
992 		    granule_shift;
993 		idx = (gla >> pte_shift) &
994 		    ((1ul << (granule_shift - PTE_SHIFT)) - 1);
995 		while (idx > PAGE_SIZE / sizeof(pte)) {
996 			idx -= PAGE_SIZE / sizeof(pte);
997 			pte_addr += PAGE_SIZE;
998 		}
999 
1000 		ptep = ptp_hold(hypctx->vcpu, pte_addr, PAGE_SIZE, &cookie);
1001 		if (ptep == NULL)
1002 			goto error;
1003 		pte = ptep[idx];
1004 
1005 		/* Calculate the level we are looking at */
1006 		switch (levels) {
1007 		default:
1008 			goto fault;
1009 		/* TODO: Level -1 when FEAT_LPA2 is implemented */
1010 		case 4: /* Level 0 */
1011 			if ((pte & ATTR_DESCR_MASK) != L0_TABLE)
1012 				goto fault;
1013 			/* FALLTHROUGH */
1014 		case 3: /* Level 1 */
1015 		case 2: /* Level 2 */
1016 			switch (pte & ATTR_DESCR_MASK) {
1017 			/* Use L1 macro as all levels are the same */
1018 			case L1_TABLE:
1019 				/* Check if EL0 can access this address space */
1020 				if (is_el0 &&
1021 				    (pte & TATTR_AP_TABLE_NO_EL0) != 0)
1022 					goto fault;
1023 				/* Check if the address space is writable */
1024 				if ((prot & PROT_WRITE) != 0 &&
1025 				    (pte & TATTR_AP_TABLE_RO) != 0)
1026 					goto fault;
1027 				if ((prot & PROT_EXEC) != 0) {
1028 					/* Check the table exec attribute */
1029 					if ((is_el0 &&
1030 					    (pte & TATTR_UXN_TABLE) != 0) ||
1031 					    (!is_el0 &&
1032 					     (pte & TATTR_PXN_TABLE) != 0))
1033 						goto fault;
1034 				}
1035 				pte_addr = pte & ~ATTR_MASK;
1036 				break;
1037 			case L1_BLOCK:
1038 				goto done;
1039 			default:
1040 				goto fault;
1041 			}
1042 			break;
1043 		case 1: /* Level 3 */
1044 			if ((pte & ATTR_DESCR_MASK) == L3_PAGE)
1045 				goto done;
1046 			goto fault;
1047 		}
1048 	}
1049 
1050 done:
1051 	/* Check if EL0 has access to the block/page */
1052 	if (is_el0 && (pte & ATTR_S1_AP(ATTR_S1_AP_USER)) == 0)
1053 		goto fault;
1054 	if ((prot & PROT_WRITE) != 0 && (pte & ATTR_S1_AP_RW_BIT) != 0)
1055 		goto fault;
1056 	if ((prot & PROT_EXEC) != 0) {
1057 		if ((is_el0 && (pte & ATTR_S1_UXN) != 0) ||
1058 		    (!is_el0 && (pte & ATTR_S1_PXN) != 0))
1059 			goto fault;
1060 	}
1061 	mask = (1ul << pte_shift) - 1;
1062 	*gpa = (pte & ~ATTR_MASK) | (gla & mask);
1063 	*is_fault = 0;
1064 	ptp_release(&cookie);
1065 	return (0);
1066 
1067 error:
1068 	ptp_release(&cookie);
1069 	return (EFAULT);
1070 fault:
1071 	*is_fault = 1;
1072 	ptp_release(&cookie);
1073 	return (0);
1074 }
1075 
1076 int
vmmops_run(void * vcpui,register_t pc,pmap_t pmap,struct vm_eventinfo * evinfo)1077 vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo)
1078 {
1079 	uint64_t excp_type;
1080 	int handled;
1081 	register_t daif;
1082 	struct hyp *hyp;
1083 	struct hypctx *hypctx;
1084 	struct vcpu *vcpu;
1085 	struct vm_exit *vme;
1086 	int mode;
1087 
1088 	hypctx = (struct hypctx *)vcpui;
1089 	hyp = hypctx->hyp;
1090 	vcpu = hypctx->vcpu;
1091 	vme = vm_exitinfo(vcpu);
1092 
1093 	hypctx->tf.tf_elr = (uint64_t)pc;
1094 
1095 	for (;;) {
1096 		if (hypctx->has_exception) {
1097 			hypctx->has_exception = false;
1098 			hypctx->elr_el1 = hypctx->tf.tf_elr;
1099 
1100 			mode = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
1101 
1102 			if (mode == PSR_M_EL1t) {
1103 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x0;
1104 			} else if (mode == PSR_M_EL1h) {
1105 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x200;
1106 			} else if ((mode & PSR_M_32) == PSR_M_64) {
1107 				/* 64-bit EL0 */
1108 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x400;
1109 			} else {
1110 				/* 32-bit EL0 */
1111 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x600;
1112 			}
1113 
1114 			/* Set the new spsr */
1115 			hypctx->spsr_el1 = hypctx->tf.tf_spsr;
1116 
1117 			/* Set the new cpsr */
1118 			hypctx->tf.tf_spsr = hypctx->spsr_el1 & PSR_FLAGS;
1119 			hypctx->tf.tf_spsr |= PSR_DAIF | PSR_M_EL1h;
1120 
1121 			/*
1122 			 * Update fields that may change on exeption entry
1123 			 * based on how sctlr_el1 is configured.
1124 			 */
1125 			if ((hypctx->sctlr_el1 & SCTLR_SPAN) == 0)
1126 				hypctx->tf.tf_spsr |= PSR_PAN;
1127 			if ((hypctx->sctlr_el1 & SCTLR_DSSBS) == 0)
1128 				hypctx->tf.tf_spsr &= ~PSR_SSBS;
1129 			else
1130 				hypctx->tf.tf_spsr |= PSR_SSBS;
1131 		}
1132 
1133 		daif = intr_disable();
1134 
1135 		/* Check if the vcpu is suspended */
1136 		if (vcpu_suspended(evinfo)) {
1137 			intr_restore(daif);
1138 			vm_exit_suspended(vcpu, pc);
1139 			break;
1140 		}
1141 
1142 		if (vcpu_debugged(vcpu)) {
1143 			intr_restore(daif);
1144 			vm_exit_debug(vcpu, pc);
1145 			break;
1146 		}
1147 
1148 		/* Activate the stage2 pmap so the vmid is valid */
1149 		pmap_activate_vm(pmap);
1150 		hyp->vttbr_el2 = pmap_to_ttbr0(pmap);
1151 
1152 		/*
1153 		 * TODO: What happens if a timer interrupt is asserted exactly
1154 		 * here, but for the previous VM?
1155 		 */
1156 		arm64_set_active_vcpu(hypctx);
1157 		vgic_flush_hwstate(hypctx);
1158 
1159 		/* Call into EL2 to switch to the guest */
1160 		excp_type = vmm_enter_guest(hyp, hypctx);
1161 
1162 		vgic_sync_hwstate(hypctx);
1163 		vtimer_sync_hwstate(hypctx);
1164 
1165 		/*
1166 		 * Deactivate the stage2 pmap.
1167 		 */
1168 		PCPU_SET(curvmpmap, NULL);
1169 		intr_restore(daif);
1170 
1171 		vmm_stat_incr(vcpu, VMEXIT_COUNT, 1);
1172 		if (excp_type == EXCP_TYPE_MAINT_IRQ)
1173 			continue;
1174 
1175 		vme->pc = hypctx->tf.tf_elr;
1176 		vme->inst_length = INSN_SIZE;
1177 		vme->u.hyp.exception_nr = excp_type;
1178 		vme->u.hyp.esr_el2 = hypctx->tf.tf_esr;
1179 		vme->u.hyp.far_el2 = hypctx->exit_info.far_el2;
1180 		vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2;
1181 
1182 		handled = arm64_handle_world_switch(hypctx, excp_type, vme,
1183 		    pmap);
1184 		if (handled == UNHANDLED)
1185 			/* Exit loop to emulate instruction. */
1186 			break;
1187 		else
1188 			/* Resume guest execution from the next instruction. */
1189 			hypctx->tf.tf_elr += vme->inst_length;
1190 	}
1191 
1192 	return (0);
1193 }
1194 
1195 static void
arm_pcpu_vmcleanup(void * arg)1196 arm_pcpu_vmcleanup(void *arg)
1197 {
1198 	struct hyp *hyp;
1199 	int i, maxcpus;
1200 
1201 	hyp = arg;
1202 	maxcpus = vm_get_maxcpus(hyp->vm);
1203 	for (i = 0; i < maxcpus; i++) {
1204 		if (arm64_get_active_vcpu() == hyp->ctx[i]) {
1205 			arm64_set_active_vcpu(NULL);
1206 			break;
1207 		}
1208 	}
1209 }
1210 
1211 void
vmmops_vcpu_cleanup(void * vcpui)1212 vmmops_vcpu_cleanup(void *vcpui)
1213 {
1214 	struct hypctx *hypctx = vcpui;
1215 
1216 	vtimer_cpucleanup(hypctx);
1217 	vgic_cpucleanup(hypctx);
1218 
1219 	if (!in_vhe())
1220 		vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true);
1221 
1222 	free(hypctx, M_HYP);
1223 }
1224 
1225 void
vmmops_cleanup(void * vmi)1226 vmmops_cleanup(void *vmi)
1227 {
1228 	struct hyp *hyp = vmi;
1229 
1230 	vtimer_vmcleanup(hyp);
1231 	vgic_vmcleanup(hyp);
1232 
1233 	smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp);
1234 
1235 	if (!in_vhe())
1236 		vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true);
1237 
1238 	free(hyp, M_HYP);
1239 }
1240 
1241 /*
1242  * Return register value. Registers have different sizes and an explicit cast
1243  * must be made to ensure proper conversion.
1244  */
1245 static uint64_t *
hypctx_regptr(struct hypctx * hypctx,int reg)1246 hypctx_regptr(struct hypctx *hypctx, int reg)
1247 {
1248 	switch (reg) {
1249 	case VM_REG_GUEST_X0 ... VM_REG_GUEST_X29:
1250 		return (&hypctx->tf.tf_x[reg]);
1251 	case VM_REG_GUEST_LR:
1252 		return (&hypctx->tf.tf_lr);
1253 	case VM_REG_GUEST_SP:
1254 		return (&hypctx->tf.tf_sp);
1255 	case VM_REG_GUEST_CPSR:
1256 		return (&hypctx->tf.tf_spsr);
1257 	case VM_REG_GUEST_PC:
1258 		return (&hypctx->tf.tf_elr);
1259 	case VM_REG_GUEST_SCTLR_EL1:
1260 		return (&hypctx->sctlr_el1);
1261 	case VM_REG_GUEST_TTBR0_EL1:
1262 		return (&hypctx->ttbr0_el1);
1263 	case VM_REG_GUEST_TTBR1_EL1:
1264 		return (&hypctx->ttbr1_el1);
1265 	case VM_REG_GUEST_TCR_EL1:
1266 		return (&hypctx->tcr_el1);
1267 	case VM_REG_GUEST_TCR2_EL1:
1268 		return (&hypctx->tcr2_el1);
1269 	case VM_REG_GUEST_MPIDR_EL1:
1270 		return (&hypctx->vmpidr_el2);
1271 	default:
1272 		break;
1273 	}
1274 	return (NULL);
1275 }
1276 
1277 int
vmmops_getreg(void * vcpui,int reg,uint64_t * retval)1278 vmmops_getreg(void *vcpui, int reg, uint64_t *retval)
1279 {
1280 	uint64_t *regp;
1281 	int running, hostcpu;
1282 	struct hypctx *hypctx = vcpui;
1283 
1284 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1285 	if (running && hostcpu != curcpu)
1286 		panic("arm_getreg: %s%d is running", vm_name(hypctx->hyp->vm),
1287 		    vcpu_vcpuid(hypctx->vcpu));
1288 
1289 	regp = hypctx_regptr(hypctx, reg);
1290 	if (regp == NULL)
1291 		return (EINVAL);
1292 
1293 	*retval = *regp;
1294 	return (0);
1295 }
1296 
1297 int
vmmops_setreg(void * vcpui,int reg,uint64_t val)1298 vmmops_setreg(void *vcpui, int reg, uint64_t val)
1299 {
1300 	uint64_t *regp;
1301 	struct hypctx *hypctx = vcpui;
1302 	int running, hostcpu;
1303 
1304 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1305 	if (running && hostcpu != curcpu)
1306 		panic("arm_setreg: %s%d is running", vm_name(hypctx->hyp->vm),
1307 		    vcpu_vcpuid(hypctx->vcpu));
1308 
1309 	regp = hypctx_regptr(hypctx, reg);
1310 	if (regp == NULL)
1311 		return (EINVAL);
1312 
1313 	*regp = val;
1314 	return (0);
1315 }
1316 
1317 int
vmmops_exception(void * vcpui,uint64_t esr,uint64_t far)1318 vmmops_exception(void *vcpui, uint64_t esr, uint64_t far)
1319 {
1320 	struct hypctx *hypctx = vcpui;
1321 	int running, hostcpu;
1322 
1323 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1324 	if (running && hostcpu != curcpu)
1325 		panic("%s: %s%d is running", __func__, vm_name(hypctx->hyp->vm),
1326 		    vcpu_vcpuid(hypctx->vcpu));
1327 
1328 	hypctx->far_el1 = far;
1329 	hypctx->esr_el1 = esr;
1330 	hypctx->has_exception = true;
1331 
1332 	return (0);
1333 }
1334 
1335 int
vmmops_getcap(void * vcpui,int num,int * retval)1336 vmmops_getcap(void *vcpui, int num, int *retval)
1337 {
1338 	struct hypctx *hypctx = vcpui;
1339 	int ret;
1340 
1341 	ret = ENOENT;
1342 
1343 	switch (num) {
1344 	case VM_CAP_UNRESTRICTED_GUEST:
1345 		*retval = 1;
1346 		ret = 0;
1347 		break;
1348 	case VM_CAP_BRK_EXIT:
1349 	case VM_CAP_SS_EXIT:
1350 	case VM_CAP_MASK_HWINTR:
1351 		*retval = (hypctx->setcaps & (1ul << num)) != 0;
1352 		break;
1353 	default:
1354 		break;
1355 	}
1356 
1357 	return (ret);
1358 }
1359 
1360 int
vmmops_setcap(void * vcpui,int num,int val)1361 vmmops_setcap(void *vcpui, int num, int val)
1362 {
1363 	struct hypctx *hypctx = vcpui;
1364 	int ret;
1365 
1366 	ret = 0;
1367 
1368 	switch (num) {
1369 	case VM_CAP_BRK_EXIT:
1370 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1371 			break;
1372 		if (val != 0)
1373 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1374 		else if ((hypctx->setcaps & (1ul << VM_CAP_SS_EXIT)) == 0)
1375 			hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1376 		break;
1377 	case VM_CAP_SS_EXIT:
1378 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1379 			break;
1380 
1381 		if (val != 0) {
1382 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr & PSR_SS);
1383 			hypctx->debug_mdscr |= (hypctx->mdscr_el1 & MDSCR_SS);
1384 
1385 			hypctx->tf.tf_spsr |= PSR_SS;
1386 			hypctx->mdscr_el1 |= MDSCR_SS;
1387 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1388 		} else {
1389 			hypctx->tf.tf_spsr &= ~PSR_SS;
1390 			hypctx->tf.tf_spsr |= hypctx->debug_spsr;
1391 			hypctx->debug_spsr &= ~PSR_SS;
1392 			hypctx->mdscr_el1 &= ~MDSCR_SS;
1393 			hypctx->mdscr_el1 |= hypctx->debug_mdscr;
1394 			hypctx->debug_mdscr &= ~MDSCR_SS;
1395 			if ((hypctx->setcaps & (1ul << VM_CAP_BRK_EXIT)) == 0)
1396 				hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1397 		}
1398 		break;
1399 	case VM_CAP_MASK_HWINTR:
1400 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1401 			break;
1402 
1403 		if (val != 0) {
1404 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr &
1405 			    (PSR_I | PSR_F));
1406 			hypctx->tf.tf_spsr |= PSR_I | PSR_F;
1407 		} else {
1408 			hypctx->tf.tf_spsr &= ~(PSR_I | PSR_F);
1409 			hypctx->tf.tf_spsr |= (hypctx->debug_spsr &
1410 			    (PSR_I | PSR_F));
1411 			hypctx->debug_spsr &= ~(PSR_I | PSR_F);
1412 		}
1413 		break;
1414 	default:
1415 		ret = ENOENT;
1416 		break;
1417 	}
1418 
1419 	if (ret == 0) {
1420 		if (val == 0)
1421 			hypctx->setcaps &= ~(1ul << num);
1422 		else
1423 			hypctx->setcaps |= (1ul << num);
1424 	}
1425 
1426 	return (ret);
1427 }
1428