xref: /kvm-unit-tests/x86/vmx.c (revision 4c8a99ca02252d4a2bee43f4558fe47ce5ab7ec0)
1 /*
2  * x86/vmx.c : Framework for testing nested virtualization
3  *	This is a framework to test nested VMX for KVM, which
4  * 	started as a project of GSoC 2013. All test cases should
5  *	be located in x86/vmx_tests.c and framework related
6  *	functions should be in this file.
7  *
8  * How to write test cases?
9  *	Add callbacks of test suite in variant "vmx_tests". You can
10  *	write:
11  *		1. init function used for initializing test suite
12  *		2. main function for codes running in L2 guest,
13  *		3. exit_handler to handle vmexit of L2 to L1
14  *		4. syscall handler to handle L2 syscall vmexit
15  *		5. vmenter fail handler to handle direct failure of vmenter
16  *		6. guest_regs is loaded when vmenter and saved when
17  *			vmexit, you can read and set it in exit_handler
18  *	If no special function is needed for a test suite, use
19  *	corresponding basic_* functions as callback. More handlers
20  *	can be added to "vmx_tests", see details of "struct vmx_test"
21  *	and function test_run().
22  *
23  * Currently, vmx test framework only set up one VCPU and one
24  * concurrent guest test environment with same paging for L2 and
25  * L1. For usage of EPT, only 1:1 mapped paging is used from VFN
26  * to PFN.
27  *
28  * Author : Arthur Chunqi Li <yzt356@gmail.com>
29  */
30 
31 #include "libcflat.h"
32 #include "processor.h"
33 #include "alloc_page.h"
34 #include "vm.h"
35 #include "vmalloc.h"
36 #include "desc.h"
37 #include "vmx.h"
38 #include "msr.h"
39 #include "smp.h"
40 #include "apic.h"
41 
42 u64 *bsp_vmxon_region;
43 struct vmcs *vmcs_root;
44 u32 vpid_cnt;
45 u64 guest_stack_top, guest_syscall_stack_top;
46 u32 ctrl_pin, ctrl_enter, ctrl_exit, ctrl_cpu[2];
47 struct regs regs;
48 
49 struct vmx_test *current;
50 
51 #define MAX_TEST_TEARDOWN_STEPS 10
52 
53 struct test_teardown_step {
54 	test_teardown_func func;
55 	void *data;
56 };
57 
58 static int teardown_count;
59 static struct test_teardown_step teardown_steps[MAX_TEST_TEARDOWN_STEPS];
60 
61 static test_guest_func v2_guest_main;
62 
63 u64 hypercall_field;
64 bool launched;
65 static int matched;
66 static int guest_finished;
67 static int in_guest;
68 
69 union vmx_basic basic;
70 union vmx_ctrl_msr ctrl_pin_rev;
71 union vmx_ctrl_msr ctrl_cpu_rev[2];
72 union vmx_ctrl_msr ctrl_exit_rev;
73 union vmx_ctrl_msr ctrl_enter_rev;
74 union vmx_ept_vpid  ept_vpid;
75 
76 extern struct descriptor_table_ptr gdt_descr;
77 extern struct descriptor_table_ptr idt_descr;
78 extern void *vmx_return;
79 extern void *entry_sysenter;
80 extern void *guest_entry;
81 
82 static volatile u32 stage;
83 
84 static jmp_buf abort_target;
85 
86 struct vmcs_field {
87 	u64 mask;
88 	u64 encoding;
89 };
90 
91 #define MASK(_bits) GENMASK_ULL((_bits) - 1, 0)
92 #define MASK_NATURAL MASK(sizeof(unsigned long) * 8)
93 
94 static struct vmcs_field vmcs_fields[] = {
95 	{ MASK(16), VPID },
96 	{ MASK(16), PINV },
97 	{ MASK(16), EPTP_IDX },
98 
99 	{ MASK(16), GUEST_SEL_ES },
100 	{ MASK(16), GUEST_SEL_CS },
101 	{ MASK(16), GUEST_SEL_SS },
102 	{ MASK(16), GUEST_SEL_DS },
103 	{ MASK(16), GUEST_SEL_FS },
104 	{ MASK(16), GUEST_SEL_GS },
105 	{ MASK(16), GUEST_SEL_LDTR },
106 	{ MASK(16), GUEST_SEL_TR },
107 	{ MASK(16), GUEST_INT_STATUS },
108 
109 	{ MASK(16), HOST_SEL_ES },
110 	{ MASK(16), HOST_SEL_CS },
111 	{ MASK(16), HOST_SEL_SS },
112 	{ MASK(16), HOST_SEL_DS },
113 	{ MASK(16), HOST_SEL_FS },
114 	{ MASK(16), HOST_SEL_GS },
115 	{ MASK(16), HOST_SEL_TR },
116 
117 	{ MASK(64), IO_BITMAP_A },
118 	{ MASK(64), IO_BITMAP_B },
119 	{ MASK(64), MSR_BITMAP },
120 	{ MASK(64), EXIT_MSR_ST_ADDR },
121 	{ MASK(64), EXIT_MSR_LD_ADDR },
122 	{ MASK(64), ENTER_MSR_LD_ADDR },
123 	{ MASK(64), VMCS_EXEC_PTR },
124 	{ MASK(64), TSC_OFFSET },
125 	{ MASK(64), APIC_VIRT_ADDR },
126 	{ MASK(64), APIC_ACCS_ADDR },
127 	{ MASK(64), EPTP },
128 
129 	{ MASK(64), INFO_PHYS_ADDR },
130 
131 	{ MASK(64), VMCS_LINK_PTR },
132 	{ MASK(64), GUEST_DEBUGCTL },
133 	{ MASK(64), GUEST_EFER },
134 	{ MASK(64), GUEST_PAT },
135 	{ MASK(64), GUEST_PERF_GLOBAL_CTRL },
136 	{ MASK(64), GUEST_PDPTE },
137 
138 	{ MASK(64), HOST_PAT },
139 	{ MASK(64), HOST_EFER },
140 	{ MASK(64), HOST_PERF_GLOBAL_CTRL },
141 
142 	{ MASK(32), PIN_CONTROLS },
143 	{ MASK(32), CPU_EXEC_CTRL0 },
144 	{ MASK(32), EXC_BITMAP },
145 	{ MASK(32), PF_ERROR_MASK },
146 	{ MASK(32), PF_ERROR_MATCH },
147 	{ MASK(32), CR3_TARGET_COUNT },
148 	{ MASK(32), EXI_CONTROLS },
149 	{ MASK(32), EXI_MSR_ST_CNT },
150 	{ MASK(32), EXI_MSR_LD_CNT },
151 	{ MASK(32), ENT_CONTROLS },
152 	{ MASK(32), ENT_MSR_LD_CNT },
153 	{ MASK(32), ENT_INTR_INFO },
154 	{ MASK(32), ENT_INTR_ERROR },
155 	{ MASK(32), ENT_INST_LEN },
156 	{ MASK(32), TPR_THRESHOLD },
157 	{ MASK(32), CPU_EXEC_CTRL1 },
158 
159 	{ MASK(32), VMX_INST_ERROR },
160 	{ MASK(32), EXI_REASON },
161 	{ MASK(32), EXI_INTR_INFO },
162 	{ MASK(32), EXI_INTR_ERROR },
163 	{ MASK(32), IDT_VECT_INFO },
164 	{ MASK(32), IDT_VECT_ERROR },
165 	{ MASK(32), EXI_INST_LEN },
166 	{ MASK(32), EXI_INST_INFO },
167 
168 	{ MASK(32), GUEST_LIMIT_ES },
169 	{ MASK(32), GUEST_LIMIT_CS },
170 	{ MASK(32), GUEST_LIMIT_SS },
171 	{ MASK(32), GUEST_LIMIT_DS },
172 	{ MASK(32), GUEST_LIMIT_FS },
173 	{ MASK(32), GUEST_LIMIT_GS },
174 	{ MASK(32), GUEST_LIMIT_LDTR },
175 	{ MASK(32), GUEST_LIMIT_TR },
176 	{ MASK(32), GUEST_LIMIT_GDTR },
177 	{ MASK(32), GUEST_LIMIT_IDTR },
178 	{ 0x1d0ff, GUEST_AR_ES },
179 	{ 0x1f0ff, GUEST_AR_CS },
180 	{ 0x1d0ff, GUEST_AR_SS },
181 	{ 0x1d0ff, GUEST_AR_DS },
182 	{ 0x1d0ff, GUEST_AR_FS },
183 	{ 0x1d0ff, GUEST_AR_GS },
184 	{ 0x1d0ff, GUEST_AR_LDTR },
185 	{ 0x1d0ff, GUEST_AR_TR },
186 	{ MASK(32), GUEST_INTR_STATE },
187 	{ MASK(32), GUEST_ACTV_STATE },
188 	{ MASK(32), GUEST_SMBASE },
189 	{ MASK(32), GUEST_SYSENTER_CS },
190 	{ MASK(32), PREEMPT_TIMER_VALUE },
191 
192 	{ MASK(32), HOST_SYSENTER_CS },
193 
194 	{ MASK_NATURAL, CR0_MASK },
195 	{ MASK_NATURAL, CR4_MASK },
196 	{ MASK_NATURAL, CR0_READ_SHADOW },
197 	{ MASK_NATURAL, CR4_READ_SHADOW },
198 	{ MASK_NATURAL, CR3_TARGET_0 },
199 	{ MASK_NATURAL, CR3_TARGET_1 },
200 	{ MASK_NATURAL, CR3_TARGET_2 },
201 	{ MASK_NATURAL, CR3_TARGET_3 },
202 
203 	{ MASK_NATURAL, EXI_QUALIFICATION },
204 	{ MASK_NATURAL, IO_RCX },
205 	{ MASK_NATURAL, IO_RSI },
206 	{ MASK_NATURAL, IO_RDI },
207 	{ MASK_NATURAL, IO_RIP },
208 	{ MASK_NATURAL, GUEST_LINEAR_ADDRESS },
209 
210 	{ MASK_NATURAL, GUEST_CR0 },
211 	{ MASK_NATURAL, GUEST_CR3 },
212 	{ MASK_NATURAL, GUEST_CR4 },
213 	{ MASK_NATURAL, GUEST_BASE_ES },
214 	{ MASK_NATURAL, GUEST_BASE_CS },
215 	{ MASK_NATURAL, GUEST_BASE_SS },
216 	{ MASK_NATURAL, GUEST_BASE_DS },
217 	{ MASK_NATURAL, GUEST_BASE_FS },
218 	{ MASK_NATURAL, GUEST_BASE_GS },
219 	{ MASK_NATURAL, GUEST_BASE_LDTR },
220 	{ MASK_NATURAL, GUEST_BASE_TR },
221 	{ MASK_NATURAL, GUEST_BASE_GDTR },
222 	{ MASK_NATURAL, GUEST_BASE_IDTR },
223 	{ MASK_NATURAL, GUEST_DR7 },
224 	{ MASK_NATURAL, GUEST_RSP },
225 	{ MASK_NATURAL, GUEST_RIP },
226 	{ MASK_NATURAL, GUEST_RFLAGS },
227 	{ MASK_NATURAL, GUEST_PENDING_DEBUG },
228 	{ MASK_NATURAL, GUEST_SYSENTER_ESP },
229 	{ MASK_NATURAL, GUEST_SYSENTER_EIP },
230 
231 	{ MASK_NATURAL, HOST_CR0 },
232 	{ MASK_NATURAL, HOST_CR3 },
233 	{ MASK_NATURAL, HOST_CR4 },
234 	{ MASK_NATURAL, HOST_BASE_FS },
235 	{ MASK_NATURAL, HOST_BASE_GS },
236 	{ MASK_NATURAL, HOST_BASE_TR },
237 	{ MASK_NATURAL, HOST_BASE_GDTR },
238 	{ MASK_NATURAL, HOST_BASE_IDTR },
239 	{ MASK_NATURAL, HOST_SYSENTER_ESP },
240 	{ MASK_NATURAL, HOST_SYSENTER_EIP },
241 	{ MASK_NATURAL, HOST_RSP },
242 	{ MASK_NATURAL, HOST_RIP },
243 };
244 
/*
 * Values of the 2-bit type extracted from a field encoding by
 * vmcs_field_type(); VMCS_FIELD_TYPES is the number of types.
 */
enum vmcs_field_type {
	VMCS_FIELD_TYPE_CONTROL		= 0,
	VMCS_FIELD_TYPE_READ_ONLY_DATA	= 1,
	VMCS_FIELD_TYPE_GUEST		= 2,
	VMCS_FIELD_TYPE_HOST		= 3,
	VMCS_FIELD_TYPES,
};
252 
253 static inline int vmcs_field_type(struct vmcs_field *f)
254 {
255 	return (f->encoding >> VMCS_FIELD_TYPE_SHIFT) & 0x3;
256 }
257 
258 static int vmcs_field_readonly(struct vmcs_field *f)
259 {
260 	u64 ia32_vmx_misc;
261 
262 	ia32_vmx_misc = rdmsr(MSR_IA32_VMX_MISC);
263 	return !(ia32_vmx_misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS) &&
264 		(vmcs_field_type(f) == VMCS_FIELD_TYPE_READ_ONLY_DATA);
265 }
266 
267 static inline u64 vmcs_field_value(struct vmcs_field *f, u8 cookie)
268 {
269 	u64 value;
270 
271 	/* Incorporate the cookie and the field encoding into the value. */
272 	value = cookie;
273 	value |= (f->encoding << 8);
274 	value |= 0xdeadbeefull << 32;
275 
276 	return value & f->mask;
277 }
278 
279 static void set_vmcs_field(struct vmcs_field *f, u8 cookie)
280 {
281 	vmcs_write(f->encoding, vmcs_field_value(f, cookie));
282 }
283 
284 static bool check_vmcs_field(struct vmcs_field *f, u8 cookie)
285 {
286 	u64 expected;
287 	u64 actual;
288 	int ret;
289 
290 	if (f->encoding == VMX_INST_ERROR) {
291 		printf("Skipping volatile field %lx\n", f->encoding);
292 		return true;
293 	}
294 
295 	ret = vmcs_read_safe(f->encoding, &actual);
296 	assert(!(ret & X86_EFLAGS_CF));
297 	/* Skip VMCS fields that aren't recognized by the CPU */
298 	if (ret & X86_EFLAGS_ZF)
299 		return true;
300 
301 	if (vmcs_field_readonly(f)) {
302 		printf("Skipping read-only field %lx\n", f->encoding);
303 		return true;
304 	}
305 
306 	expected = vmcs_field_value(f, cookie);
307 	actual &= f->mask;
308 
309 	if (expected == actual)
310 		return true;
311 
312 	printf("FAIL: VMWRITE/VMREAD %lx (expected: %lx, actual: %lx)\n",
313 	       f->encoding, (unsigned long) expected, (unsigned long) actual);
314 
315 	return false;
316 }
317 
318 static void set_all_vmcs_fields(u8 cookie)
319 {
320 	int i;
321 
322 	for (i = 0; i < ARRAY_SIZE(vmcs_fields); i++)
323 		set_vmcs_field(&vmcs_fields[i], cookie);
324 }
325 
326 static bool check_all_vmcs_fields(u8 cookie)
327 {
328 	bool pass = true;
329 	int i;
330 
331 	for (i = 0; i < ARRAY_SIZE(vmcs_fields); i++) {
332 		if (!check_vmcs_field(&vmcs_fields[i], cookie))
333 			pass = false;
334 	}
335 
336 	return pass;
337 }
338 
339 static u32 find_vmcs_max_index(void)
340 {
341 	u32 idx, width, type, enc;
342 	u64 actual;
343 	int ret;
344 
345 	/* scan backwards and stop when found */
346 	for (idx = (1 << 9) - 1; idx >= 0; idx--) {
347 
348 		/* try all combinations of width and type */
349 		for (type = 0; type < (1 << 2); type++) {
350 			for (width = 0; width < (1 << 2) ; width++) {
351 				enc = (idx << VMCS_FIELD_INDEX_SHIFT) |
352 				      (type << VMCS_FIELD_TYPE_SHIFT) |
353 				      (width << VMCS_FIELD_WIDTH_SHIFT);
354 
355 				ret = vmcs_read_safe(enc, &actual);
356 				assert(!(ret & X86_EFLAGS_CF));
357 				if (!(ret & X86_EFLAGS_ZF))
358 					return idx;
359 			}
360 		}
361 	}
362 	/* some VMCS fields should exist */
363 	assert(0);
364 	return 0;
365 }
366 
367 static void test_vmwrite_vmread(void)
368 {
369 	struct vmcs *vmcs = alloc_page();
370 	u32 vmcs_enum_max, max_index = 0;
371 
372 	vmcs->hdr.revision_id = basic.revision;
373 	assert(!vmcs_clear(vmcs));
374 	assert(!make_vmcs_current(vmcs));
375 
376 	set_all_vmcs_fields(0x42);
377 	report(check_all_vmcs_fields(0x42), "VMWRITE/VMREAD");
378 
379 	vmcs_enum_max = (rdmsr(MSR_IA32_VMX_VMCS_ENUM) & VMCS_FIELD_INDEX_MASK)
380 			>> VMCS_FIELD_INDEX_SHIFT;
381 	max_index = find_vmcs_max_index();
382 	report(vmcs_enum_max == max_index,
383 	       "VMX_VMCS_ENUM.MAX_INDEX expected: %x, actual: %x",
384 	       max_index, vmcs_enum_max);
385 
386 	assert(!vmcs_clear(vmcs));
387 	free_page(vmcs);
388 }
389 
390 ulong finish_fault;
391 u8 sentinel;
392 bool handler_called;
393 
394 static void pf_handler(struct ex_regs *regs)
395 {
396 	/*
397 	 * check that RIP was not improperly advanced and that the
398 	 * flags value was preserved.
399 	 */
400 	report(regs->rip < finish_fault, "RIP has not been advanced!");
401 	report(((u8)regs->rflags == ((sentinel | 2) & 0xd7)),
402 	       "The low byte of RFLAGS was preserved!");
403 	regs->rip = finish_fault;
404 	handler_called = true;
405 
406 }
407 
408 static void prep_flags_test_env(void **vpage, struct vmcs **vmcs, handler *old)
409 {
410 	/*
411 	 * get an unbacked address that will cause a #PF
412 	 */
413 	*vpage = alloc_vpage();
414 
415 	/*
416 	 * set up VMCS so we have something to read from
417 	 */
418 	*vmcs = alloc_page();
419 
420 	memset(*vmcs, 0, PAGE_SIZE);
421 	(*vmcs)->hdr.revision_id = basic.revision;
422 	assert(!vmcs_clear(*vmcs));
423 	assert(!make_vmcs_current(*vmcs));
424 
425 	*old = handle_exception(PF_VECTOR, &pf_handler);
426 }
427 
428 static noinline void test_read_sentinel(void)
429 {
430 	void *vpage;
431 	struct vmcs *vmcs;
432 	handler old;
433 
434 	prep_flags_test_env(&vpage, &vmcs, &old);
435 
436 	/*
437 	 * set the proper label
438 	 */
439 	extern char finish_read_fault;
440 
441 	finish_fault = (ulong)&finish_read_fault;
442 
443 	/*
444 	 * execute the vmread instruction that will cause a #PF
445 	 */
446 	handler_called = false;
447 	asm volatile ("movb %[byte], %%ah\n\t"
448 		      "sahf\n\t"
449 		      "vmread %[enc], %[val]; finish_read_fault:"
450 		      : [val] "=m" (*(u64 *)vpage)
451 		      : [byte] "Krm" (sentinel),
452 		      [enc] "r" ((u64)GUEST_SEL_SS)
453 		      : "cc", "ah");
454 	report(handler_called, "The #PF handler was invoked");
455 
456 	/*
457 	 * restore the old #PF handler
458 	 */
459 	handle_exception(PF_VECTOR, old);
460 }
461 
462 static void test_vmread_flags_touch(void)
463 {
464 	/*
465 	 * set up the sentinel value in the flags register. we
466 	 * choose these two values because they candy-stripe
467 	 * the 5 flags that sahf sets.
468 	 */
469 	sentinel = 0x91;
470 	test_read_sentinel();
471 
472 	sentinel = 0x45;
473 	test_read_sentinel();
474 }
475 
476 static noinline void test_write_sentinel(void)
477 {
478 	void *vpage;
479 	struct vmcs *vmcs;
480 	handler old;
481 
482 	prep_flags_test_env(&vpage, &vmcs, &old);
483 
484 	/*
485 	 * set the proper label
486 	 */
487 	extern char finish_write_fault;
488 
489 	finish_fault = (ulong)&finish_write_fault;
490 
491 	/*
492 	 * execute the vmwrite instruction that will cause a #PF
493 	 */
494 	handler_called = false;
495 	asm volatile ("movb %[byte], %%ah\n\t"
496 		      "sahf\n\t"
497 		      "vmwrite %[val], %[enc]; finish_write_fault:"
498 		      : [val] "=m" (*(u64 *)vpage)
499 		      : [byte] "Krm" (sentinel),
500 		      [enc] "r" ((u64)GUEST_SEL_SS)
501 		      : "cc", "ah");
502 	report(handler_called, "The #PF handler was invoked");
503 
504 	/*
505 	 * restore the old #PF handler
506 	 */
507 	handle_exception(PF_VECTOR, old);
508 }
509 
510 static void test_vmwrite_flags_touch(void)
511 {
512 	/*
513 	 * set up the sentinel value in the flags register. we
514 	 * choose these two values because they candy-stripe
515 	 * the 5 flags that sahf sets.
516 	 */
517 	sentinel = 0x91;
518 	test_write_sentinel();
519 
520 	sentinel = 0x45;
521 	test_write_sentinel();
522 }
523 
524 
525 static void test_vmcs_high(void)
526 {
527 	struct vmcs *vmcs = alloc_page();
528 
529 	vmcs->hdr.revision_id = basic.revision;
530 	assert(!vmcs_clear(vmcs));
531 	assert(!make_vmcs_current(vmcs));
532 
533 	vmcs_write(TSC_OFFSET, 0x0123456789ABCDEFull);
534 	report(vmcs_read(TSC_OFFSET) == 0x0123456789ABCDEFull,
535 	       "VMREAD TSC_OFFSET after VMWRITE TSC_OFFSET");
536 	report(vmcs_read(TSC_OFFSET_HI) == 0x01234567ull,
537 	       "VMREAD TSC_OFFSET_HI after VMWRITE TSC_OFFSET");
538 	vmcs_write(TSC_OFFSET_HI, 0x76543210ul);
539 	report(vmcs_read(TSC_OFFSET_HI) == 0x76543210ul,
540 	       "VMREAD TSC_OFFSET_HI after VMWRITE TSC_OFFSET_HI");
541 	report(vmcs_read(TSC_OFFSET) == 0x7654321089ABCDEFull,
542 	       "VMREAD TSC_OFFSET after VMWRITE TSC_OFFSET_HI");
543 
544 	assert(!vmcs_clear(vmcs));
545 	free_page(vmcs);
546 }
547 
548 static void test_vmcs_lifecycle(void)
549 {
550 	struct vmcs *vmcs[2] = {};
551 	int i;
552 
553 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
554 		vmcs[i] = alloc_page();
555 		vmcs[i]->hdr.revision_id = basic.revision;
556 	}
557 
558 #define VMPTRLD(_i) do { \
559 	assert(_i < ARRAY_SIZE(vmcs)); \
560 	assert(!make_vmcs_current(vmcs[_i])); \
561 	printf("VMPTRLD VMCS%d\n", (_i)); \
562 } while (0)
563 
564 #define VMCLEAR(_i) do { \
565 	assert(_i < ARRAY_SIZE(vmcs)); \
566 	assert(!vmcs_clear(vmcs[_i])); \
567 	printf("VMCLEAR VMCS%d\n", (_i)); \
568 } while (0)
569 
570 	VMCLEAR(0);
571 	VMPTRLD(0);
572 	set_all_vmcs_fields(0);
573 	report(check_all_vmcs_fields(0), "current:VMCS0 active:[VMCS0]");
574 
575 	VMCLEAR(0);
576 	VMPTRLD(0);
577 	report(check_all_vmcs_fields(0), "current:VMCS0 active:[VMCS0]");
578 
579 	VMCLEAR(1);
580 	report(check_all_vmcs_fields(0), "current:VMCS0 active:[VMCS0]");
581 
582 	VMPTRLD(1);
583 	set_all_vmcs_fields(1);
584 	report(check_all_vmcs_fields(1), "current:VMCS1 active:[VMCS0,VCMS1]");
585 
586 	VMPTRLD(0);
587 	report(check_all_vmcs_fields(0), "current:VMCS0 active:[VMCS0,VCMS1]");
588 	VMPTRLD(1);
589 	report(check_all_vmcs_fields(1), "current:VMCS1 active:[VMCS0,VCMS1]");
590 	VMPTRLD(1);
591 	report(check_all_vmcs_fields(1), "current:VMCS1 active:[VMCS0,VCMS1]");
592 
593 	VMCLEAR(0);
594 	report(check_all_vmcs_fields(1), "current:VMCS1 active:[VCMS1]");
595 
596 	/* VMPTRLD should not erase VMWRITEs to the current VMCS */
597 	set_all_vmcs_fields(2);
598 	VMPTRLD(1);
599 	report(check_all_vmcs_fields(2), "current:VMCS1 active:[VCMS1]");
600 
601 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
602 		VMCLEAR(i);
603 		free_page(vmcs[i]);
604 	}
605 
606 #undef VMPTRLD
607 #undef VMCLEAR
608 }
609 
610 void vmx_set_test_stage(u32 s)
611 {
612 	barrier();
613 	stage = s;
614 	barrier();
615 }
616 
617 u32 vmx_get_test_stage(void)
618 {
619 	u32 s;
620 
621 	barrier();
622 	s = stage;
623 	barrier();
624 	return s;
625 }
626 
627 void vmx_inc_test_stage(void)
628 {
629 	barrier();
630 	stage++;
631 	barrier();
632 }
633 
634 /* entry_sysenter */
635 asm(
636 	".align	4, 0x90\n\t"
637 	".globl	entry_sysenter\n\t"
638 	"entry_sysenter:\n\t"
639 	SAVE_GPR
640 	"	and	$0xf, %rax\n\t"
641 	"	mov	%rax, %rdi\n\t"
642 	"	call	syscall_handler\n\t"
643 	LOAD_GPR
644 	"	vmresume\n\t"
645 );
646 
647 static void __attribute__((__used__)) syscall_handler(u64 syscall_no)
648 {
649 	if (current->syscall_handler)
650 		current->syscall_handler(syscall_no);
651 }
652 
653 static const char * const exit_reason_descriptions[] = {
654 	[VMX_EXC_NMI]		= "VMX_EXC_NMI",
655 	[VMX_EXTINT]		= "VMX_EXTINT",
656 	[VMX_TRIPLE_FAULT]	= "VMX_TRIPLE_FAULT",
657 	[VMX_INIT]		= "VMX_INIT",
658 	[VMX_SIPI]		= "VMX_SIPI",
659 	[VMX_SMI_IO]		= "VMX_SMI_IO",
660 	[VMX_SMI_OTHER]		= "VMX_SMI_OTHER",
661 	[VMX_INTR_WINDOW]	= "VMX_INTR_WINDOW",
662 	[VMX_NMI_WINDOW]	= "VMX_NMI_WINDOW",
663 	[VMX_TASK_SWITCH]	= "VMX_TASK_SWITCH",
664 	[VMX_CPUID]		= "VMX_CPUID",
665 	[VMX_GETSEC]		= "VMX_GETSEC",
666 	[VMX_HLT]		= "VMX_HLT",
667 	[VMX_INVD]		= "VMX_INVD",
668 	[VMX_INVLPG]		= "VMX_INVLPG",
669 	[VMX_RDPMC]		= "VMX_RDPMC",
670 	[VMX_RDTSC]		= "VMX_RDTSC",
671 	[VMX_RSM]		= "VMX_RSM",
672 	[VMX_VMCALL]		= "VMX_VMCALL",
673 	[VMX_VMCLEAR]		= "VMX_VMCLEAR",
674 	[VMX_VMLAUNCH]		= "VMX_VMLAUNCH",
675 	[VMX_VMPTRLD]		= "VMX_VMPTRLD",
676 	[VMX_VMPTRST]		= "VMX_VMPTRST",
677 	[VMX_VMREAD]		= "VMX_VMREAD",
678 	[VMX_VMRESUME]		= "VMX_VMRESUME",
679 	[VMX_VMWRITE]		= "VMX_VMWRITE",
680 	[VMX_VMXOFF]		= "VMX_VMXOFF",
681 	[VMX_VMXON]		= "VMX_VMXON",
682 	[VMX_CR]		= "VMX_CR",
683 	[VMX_DR]		= "VMX_DR",
684 	[VMX_IO]		= "VMX_IO",
685 	[VMX_RDMSR]		= "VMX_RDMSR",
686 	[VMX_WRMSR]		= "VMX_WRMSR",
687 	[VMX_FAIL_STATE]	= "VMX_FAIL_STATE",
688 	[VMX_FAIL_MSR]		= "VMX_FAIL_MSR",
689 	[VMX_MWAIT]		= "VMX_MWAIT",
690 	[VMX_MTF]		= "VMX_MTF",
691 	[VMX_MONITOR]		= "VMX_MONITOR",
692 	[VMX_PAUSE]		= "VMX_PAUSE",
693 	[VMX_FAIL_MCHECK]	= "VMX_FAIL_MCHECK",
694 	[VMX_TPR_THRESHOLD]	= "VMX_TPR_THRESHOLD",
695 	[VMX_APIC_ACCESS]	= "VMX_APIC_ACCESS",
696 	[VMX_EOI_INDUCED]	= "VMX_EOI_INDUCED",
697 	[VMX_GDTR_IDTR]		= "VMX_GDTR_IDTR",
698 	[VMX_LDTR_TR]		= "VMX_LDTR_TR",
699 	[VMX_EPT_VIOLATION]	= "VMX_EPT_VIOLATION",
700 	[VMX_EPT_MISCONFIG]	= "VMX_EPT_MISCONFIG",
701 	[VMX_INVEPT]		= "VMX_INVEPT",
702 	[VMX_PREEMPT]		= "VMX_PREEMPT",
703 	[VMX_INVVPID]		= "VMX_INVVPID",
704 	[VMX_WBINVD]		= "VMX_WBINVD",
705 	[VMX_XSETBV]		= "VMX_XSETBV",
706 	[VMX_APIC_WRITE]	= "VMX_APIC_WRITE",
707 	[VMX_RDRAND]		= "VMX_RDRAND",
708 	[VMX_INVPCID]		= "VMX_INVPCID",
709 	[VMX_VMFUNC]		= "VMX_VMFUNC",
710 	[VMX_RDSEED]		= "VMX_RDSEED",
711 	[VMX_PML_FULL]		= "VMX_PML_FULL",
712 	[VMX_XSAVES]		= "VMX_XSAVES",
713 	[VMX_XRSTORS]		= "VMX_XRSTORS",
714 };
715 
716 const char *exit_reason_description(u64 reason)
717 {
718 	if (reason >= ARRAY_SIZE(exit_reason_descriptions))
719 		return "(unknown)";
720 	return exit_reason_descriptions[reason] ? : "(unused)";
721 }
722 
723 void print_vmexit_info(union exit_reason exit_reason)
724 {
725 	u64 guest_rip, guest_rsp;
726 	ulong exit_qual = vmcs_read(EXI_QUALIFICATION);
727 	guest_rip = vmcs_read(GUEST_RIP);
728 	guest_rsp = vmcs_read(GUEST_RSP);
729 	printf("VMEXIT info:\n");
730 	printf("\tvmexit reason = %u\n", exit_reason.basic);
731 	printf("\tfailed vmentry = %u\n", !!exit_reason.failed_vmentry);
732 	printf("\texit qualification = %#lx\n", exit_qual);
733 	printf("\tguest_rip = %#lx\n", guest_rip);
734 	printf("\tRAX=%#lx    RBX=%#lx    RCX=%#lx    RDX=%#lx\n",
735 		regs.rax, regs.rbx, regs.rcx, regs.rdx);
736 	printf("\tRSP=%#lx    RBP=%#lx    RSI=%#lx    RDI=%#lx\n",
737 		guest_rsp, regs.rbp, regs.rsi, regs.rdi);
738 	printf("\tR8 =%#lx    R9 =%#lx    R10=%#lx    R11=%#lx\n",
739 		regs.r8, regs.r9, regs.r10, regs.r11);
740 	printf("\tR12=%#lx    R13=%#lx    R14=%#lx    R15=%#lx\n",
741 		regs.r12, regs.r13, regs.r14, regs.r15);
742 }
743 
744 void print_vmentry_failure_info(struct vmentry_result *result)
745 {
746 	if (result->entered)
747 		return;
748 
749 	if (result->vm_fail) {
750 		printf("VM-Fail on %s: ", result->instr);
751 		switch (result->flags & VMX_ENTRY_FLAGS) {
752 		case X86_EFLAGS_CF:
753 			printf("current-VMCS pointer is not valid.\n");
754 			break;
755 		case X86_EFLAGS_ZF:
756 			printf("error number is %ld. See Intel 30.4.\n",
757 			       vmcs_read(VMX_INST_ERROR));
758 			break;
759 		default:
760 			printf("unexpected flags %lx!\n", result->flags);
761 		}
762 	} else {
763 		u64 qual = vmcs_read(EXI_QUALIFICATION);
764 
765 		printf("VM-Exit failure on %s (reason=%#x, qual=%#lx): ",
766 			result->instr, result->exit_reason.full, qual);
767 
768 		switch (result->exit_reason.basic) {
769 		case VMX_FAIL_STATE:
770 			printf("invalid guest state\n");
771 			break;
772 		case VMX_FAIL_MSR:
773 			printf("MSR loading\n");
774 			break;
775 		case VMX_FAIL_MCHECK:
776 			printf("machine-check event\n");
777 			break;
778 		default:
779 			printf("unexpected basic exit reason %u\n",
780 			  result->exit_reason.basic);
781 		}
782 
783 		if (!result->exit_reason.failed_vmentry)
784 			printf("\tVMX_ENTRY_FAILURE BIT NOT SET!\n");
785 
786 		if (result->exit_reason.full & 0x7fff0000)
787 			printf("\tRESERVED BITS SET!\n");
788 	}
789 }
790 
791 /*
792  * VMCLEAR should ensures all VMCS state is flushed to the VMCS
793  * region in memory.
794  */
795 static void test_vmclear_flushing(void)
796 {
797 	struct vmcs *vmcs[3] = {};
798 	int i;
799 
800 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
801 		vmcs[i] = alloc_page();
802 	}
803 
804 	vmcs[0]->hdr.revision_id = basic.revision;
805 	assert(!vmcs_clear(vmcs[0]));
806 	assert(!make_vmcs_current(vmcs[0]));
807 	set_all_vmcs_fields(0x86);
808 
809 	assert(!vmcs_clear(vmcs[0]));
810 	memcpy(vmcs[1], vmcs[0], basic.size);
811 	assert(!make_vmcs_current(vmcs[1]));
812 	report(check_all_vmcs_fields(0x86),
813 	       "test vmclear flush (current VMCS)");
814 
815 	set_all_vmcs_fields(0x87);
816 	assert(!make_vmcs_current(vmcs[0]));
817 	assert(!vmcs_clear(vmcs[1]));
818 	memcpy(vmcs[2], vmcs[1], basic.size);
819 	assert(!make_vmcs_current(vmcs[2]));
820 	report(check_all_vmcs_fields(0x87),
821 	       "test vmclear flush (!current VMCS)");
822 
823 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
824 		assert(!vmcs_clear(vmcs[i]));
825 		free_page(vmcs[i]);
826 	}
827 }
828 
829 static void test_vmclear(void)
830 {
831 	struct vmcs *tmp_root;
832 	int width = cpuid_maxphyaddr();
833 
834 	/*
835 	 * Note- The tests below do not necessarily have a
836 	 * valid VMCS, but that's ok since the invalid vmcs
837 	 * is only used for a specific test and is discarded
838 	 * without touching its contents
839 	 */
840 
841 	/* Unaligned page access */
842 	tmp_root = (struct vmcs *)((intptr_t)vmcs_root + 1);
843 	report(vmcs_clear(tmp_root) == 1, "test vmclear with unaligned vmcs");
844 
845 	/* gpa bits beyond physical address width are set*/
846 	tmp_root = (struct vmcs *)((intptr_t)vmcs_root |
847 				   ((u64)1 << (width+1)));
848 	report(vmcs_clear(tmp_root) == 1,
849 	       "test vmclear with vmcs address bits set beyond physical address width");
850 
851 	/* Pass VMXON region */
852 	tmp_root = (struct vmcs *)bsp_vmxon_region;
853 	report(vmcs_clear(tmp_root) == 1, "test vmclear with vmxon region");
854 
855 	/* Valid VMCS */
856 	report(vmcs_clear(vmcs_root) == 0,
857 	       "test vmclear with valid vmcs region");
858 
859 	test_vmclear_flushing();
860 }
861 
862 static void __attribute__((__used__)) guest_main(void)
863 {
864 	if (current->v2)
865 		v2_guest_main();
866 	else
867 		current->guest_main();
868 }
869 
870 /* guest_entry */
871 asm(
872 	".align	4, 0x90\n\t"
873 	".globl	entry_guest\n\t"
874 	"guest_entry:\n\t"
875 	"	call guest_main\n\t"
876 	"	mov $1, %edi\n\t"
877 	"	call hypercall\n\t"
878 );
879 
880 /* EPT paging structure related functions */
881 /* split_large_ept_entry: Split a 2M/1G large page into 512 smaller PTEs.
882 		@ptep : large page table entry to split
883 		@level : level of ptep (2 or 3)
884  */
885 static void split_large_ept_entry(unsigned long *ptep, int level)
886 {
887 	unsigned long *new_pt;
888 	unsigned long gpa;
889 	unsigned long pte;
890 	unsigned long prototype;
891 	int i;
892 
893 	pte = *ptep;
894 	assert(pte & EPT_PRESENT);
895 	assert(pte & EPT_LARGE_PAGE);
896 	assert(level == 2 || level == 3);
897 
898 	new_pt = alloc_page();
899 	assert(new_pt);
900 
901 	prototype = pte & ~EPT_ADDR_MASK;
902 	if (level == 2)
903 		prototype &= ~EPT_LARGE_PAGE;
904 
905 	gpa = pte & EPT_ADDR_MASK;
906 	for (i = 0; i < EPT_PGDIR_ENTRIES; i++) {
907 		new_pt[i] = prototype | gpa;
908 		gpa += 1ul << EPT_LEVEL_SHIFT(level - 1);
909 	}
910 
911 	pte &= ~EPT_LARGE_PAGE;
912 	pte &= ~EPT_ADDR_MASK;
913 	pte |= virt_to_phys(new_pt);
914 
915 	*ptep = pte;
916 }
917 
918 /* install_ept_entry : Install a page to a given level in EPT
919 		@pml4 : addr of pml4 table
920 		@pte_level : level of PTE to set
921 		@guest_addr : physical address of guest
922 		@pte : pte value to set
923 		@pt_page : address of page table, NULL for a new page
924  */
925 void install_ept_entry(unsigned long *pml4,
926 		int pte_level,
927 		unsigned long guest_addr,
928 		unsigned long pte,
929 		unsigned long *pt_page)
930 {
931 	int level;
932 	unsigned long *pt = pml4;
933 	unsigned offset;
934 
935 	/* EPT only uses 48 bits of GPA. */
936 	assert(guest_addr < (1ul << 48));
937 
938 	for (level = EPT_PAGE_LEVEL; level > pte_level; --level) {
939 		offset = (guest_addr >> EPT_LEVEL_SHIFT(level))
940 				& EPT_PGDIR_MASK;
941 		if (!(pt[offset] & (EPT_PRESENT))) {
942 			unsigned long *new_pt = pt_page;
943 			if (!new_pt)
944 				new_pt = alloc_page();
945 			else
946 				pt_page = 0;
947 			memset(new_pt, 0, PAGE_SIZE);
948 			pt[offset] = virt_to_phys(new_pt)
949 					| EPT_RA | EPT_WA | EPT_EA;
950 		} else if (pt[offset] & EPT_LARGE_PAGE)
951 			split_large_ept_entry(&pt[offset], level);
952 		pt = phys_to_virt(pt[offset] & EPT_ADDR_MASK);
953 	}
954 	offset = (guest_addr >> EPT_LEVEL_SHIFT(level)) & EPT_PGDIR_MASK;
955 	pt[offset] = pte;
956 }
957 
958 /* Map a page, @perm is the permission of the page */
959 void install_ept(unsigned long *pml4,
960 		unsigned long phys,
961 		unsigned long guest_addr,
962 		u64 perm)
963 {
964 	install_ept_entry(pml4, 1, guest_addr, (phys & PAGE_MASK) | perm, 0);
965 }
966 
967 /* Map a 1G-size page */
968 void install_1g_ept(unsigned long *pml4,
969 		unsigned long phys,
970 		unsigned long guest_addr,
971 		u64 perm)
972 {
973 	install_ept_entry(pml4, 3, guest_addr,
974 			(phys & PAGE_MASK) | perm | EPT_LARGE_PAGE, 0);
975 }
976 
977 /* Map a 2M-size page */
978 void install_2m_ept(unsigned long *pml4,
979 		unsigned long phys,
980 		unsigned long guest_addr,
981 		u64 perm)
982 {
983 	install_ept_entry(pml4, 2, guest_addr,
984 			(phys & PAGE_MASK) | perm | EPT_LARGE_PAGE, 0);
985 }
986 
987 /* setup_ept_range : Setup a range of 1:1 mapped page to EPT paging structure.
988 		@start : start address of guest page
989 		@len : length of address to be mapped
990 		@map_1g : whether 1G page map is used
991 		@map_2m : whether 2M page map is used
992 		@perm : permission for every page
993  */
994 void setup_ept_range(unsigned long *pml4, unsigned long start,
995 		     unsigned long len, int map_1g, int map_2m, u64 perm)
996 {
997 	u64 phys = start;
998 	u64 max = (u64)len + (u64)start;
999 
1000 	if (map_1g) {
1001 		while (phys + PAGE_SIZE_1G <= max) {
1002 			install_1g_ept(pml4, phys, phys, perm);
1003 			phys += PAGE_SIZE_1G;
1004 		}
1005 	}
1006 	if (map_2m) {
1007 		while (phys + PAGE_SIZE_2M <= max) {
1008 			install_2m_ept(pml4, phys, phys, perm);
1009 			phys += PAGE_SIZE_2M;
1010 		}
1011 	}
1012 	while (phys + PAGE_SIZE <= max) {
1013 		install_ept(pml4, phys, phys, perm);
1014 		phys += PAGE_SIZE;
1015 	}
1016 }
1017 
1018 /* get_ept_pte : Get the PTE of a given level in EPT,
1019     @level == 1 means get the latest level*/
1020 bool get_ept_pte(unsigned long *pml4, unsigned long guest_addr, int level,
1021 		unsigned long *pte)
1022 {
1023 	int l;
1024 	unsigned long *pt = pml4, iter_pte;
1025 	unsigned offset;
1026 
1027 	assert(level >= 1 && level <= 4);
1028 
1029 	for (l = EPT_PAGE_LEVEL; ; --l) {
1030 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1031 		iter_pte = pt[offset];
1032 		if (l == level)
1033 			break;
1034 		if (l < 4 && (iter_pte & EPT_LARGE_PAGE))
1035 			return false;
1036 		if (!(iter_pte & (EPT_PRESENT)))
1037 			return false;
1038 		pt = (unsigned long *)(iter_pte & EPT_ADDR_MASK);
1039 	}
1040 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1041 	if (pte)
1042 		*pte = pt[offset];
1043 	return true;
1044 }
1045 
1046 static void clear_ept_ad_pte(unsigned long *pml4, unsigned long guest_addr)
1047 {
1048 	int l;
1049 	unsigned long *pt = pml4;
1050 	u64 pte;
1051 	unsigned offset;
1052 
1053 	for (l = EPT_PAGE_LEVEL; ; --l) {
1054 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1055 		pt[offset] &= ~(EPT_ACCESS_FLAG|EPT_DIRTY_FLAG);
1056 		pte = pt[offset];
1057 		if (l == 1 || (l < 4 && (pte & EPT_LARGE_PAGE)))
1058 			break;
1059 		pt = (unsigned long *)(pte & EPT_ADDR_MASK);
1060 	}
1061 }
1062 
1063 /* clear_ept_ad : Clear EPT A/D bits for the page table walk and the
1064    final GPA of a guest address.  */
1065 void clear_ept_ad(unsigned long *pml4, u64 guest_cr3,
1066 		  unsigned long guest_addr)
1067 {
1068 	int l;
1069 	unsigned long *pt = (unsigned long *)guest_cr3, gpa;
1070 	u64 pte, offset_in_page;
1071 	unsigned offset;
1072 
1073 	for (l = EPT_PAGE_LEVEL; ; --l) {
1074 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1075 
1076 		clear_ept_ad_pte(pml4, (u64) &pt[offset]);
1077 		pte = pt[offset];
1078 		if (l == 1 || (l < 4 && (pte & PT_PAGE_SIZE_MASK)))
1079 			break;
1080 		if (!(pte & PT_PRESENT_MASK))
1081 			return;
1082 		pt = (unsigned long *)(pte & PT_ADDR_MASK);
1083 	}
1084 
1085 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1086 	offset_in_page = guest_addr & ((1 << EPT_LEVEL_SHIFT(l)) - 1);
1087 	gpa = (pt[offset] & PT_ADDR_MASK) | (guest_addr & offset_in_page);
1088 	clear_ept_ad_pte(pml4, gpa);
1089 }
1090 
1091 /* check_ept_ad : Check the content of EPT A/D bits for the page table
1092    walk and the final GPA of a guest address.  */
1093 void check_ept_ad(unsigned long *pml4, u64 guest_cr3,
1094 		  unsigned long guest_addr, int expected_gpa_ad,
1095 		  int expected_pt_ad)
1096 {
1097 	int l;
1098 	unsigned long *pt = (unsigned long *)guest_cr3, gpa;
1099 	u64 ept_pte, pte, offset_in_page;
1100 	unsigned offset;
1101 	bool bad_pt_ad = false;
1102 
1103 	for (l = EPT_PAGE_LEVEL; ; --l) {
1104 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1105 
1106 		if (!get_ept_pte(pml4, (u64) &pt[offset], 1, &ept_pte)) {
1107 			printf("EPT - guest level %d page table is not mapped.\n", l);
1108 			return;
1109 		}
1110 
1111 		if (!bad_pt_ad) {
1112 			bad_pt_ad |= (ept_pte & (EPT_ACCESS_FLAG|EPT_DIRTY_FLAG)) != expected_pt_ad;
1113 			if (bad_pt_ad)
1114 				report_fail("EPT - guest level %d page table A=%d/D=%d",
1115 					    l,
1116 					    !!(expected_pt_ad & EPT_ACCESS_FLAG),
1117 					    !!(expected_pt_ad & EPT_DIRTY_FLAG));
1118 		}
1119 
1120 		pte = pt[offset];
1121 		if (l == 1 || (l < 4 && (pte & PT_PAGE_SIZE_MASK)))
1122 			break;
1123 		if (!(pte & PT_PRESENT_MASK))
1124 			return;
1125 		pt = (unsigned long *)(pte & PT_ADDR_MASK);
1126 	}
1127 
1128 	if (!bad_pt_ad)
1129 		report_pass("EPT - guest page table structures A=%d/D=%d",
1130 			    !!(expected_pt_ad & EPT_ACCESS_FLAG),
1131 			    !!(expected_pt_ad & EPT_DIRTY_FLAG));
1132 
1133 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1134 	offset_in_page = guest_addr & ((1 << EPT_LEVEL_SHIFT(l)) - 1);
1135 	gpa = (pt[offset] & PT_ADDR_MASK) | (guest_addr & offset_in_page);
1136 
1137 	if (!get_ept_pte(pml4, gpa, 1, &ept_pte)) {
1138 		report_fail("EPT - guest physical address is not mapped");
1139 		return;
1140 	}
1141 	report((ept_pte & (EPT_ACCESS_FLAG | EPT_DIRTY_FLAG)) == expected_gpa_ad,
1142 	       "EPT - guest physical address A=%d/D=%d",
1143 	       !!(expected_gpa_ad & EPT_ACCESS_FLAG),
1144 	       !!(expected_gpa_ad & EPT_DIRTY_FLAG));
1145 }
1146 
1147 void set_ept_pte(unsigned long *pml4, unsigned long guest_addr,
1148 		 int level, u64 pte_val)
1149 {
1150 	int l;
1151 	unsigned long *pt = pml4;
1152 	unsigned offset;
1153 
1154 	assert(level >= 1 && level <= 4);
1155 
1156 	for (l = EPT_PAGE_LEVEL; ; --l) {
1157 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1158 		if (l == level)
1159 			break;
1160 		assert(pt[offset] & EPT_PRESENT);
1161 		pt = (unsigned long *)(pt[offset] & EPT_ADDR_MASK);
1162 	}
1163 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1164 	pt[offset] = pte_val;
1165 }
1166 
1167 static void init_vmcs_ctrl(void)
1168 {
1169 	/* 26.2 CHECKS ON VMX CONTROLS AND HOST-STATE AREA */
1170 	/* 26.2.1.1 */
1171 	vmcs_write(PIN_CONTROLS, ctrl_pin);
1172 	/* Disable VMEXIT of IO instruction */
1173 	vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu[0]);
1174 	if (ctrl_cpu_rev[0].set & CPU_SECONDARY) {
1175 		ctrl_cpu[1] = (ctrl_cpu[1] | ctrl_cpu_rev[1].set) &
1176 			ctrl_cpu_rev[1].clr;
1177 		vmcs_write(CPU_EXEC_CTRL1, ctrl_cpu[1]);
1178 	}
1179 	vmcs_write(CR3_TARGET_COUNT, 0);
1180 	vmcs_write(VPID, ++vpid_cnt);
1181 }
1182 
1183 static void init_vmcs_host(void)
1184 {
1185 	/* 26.2 CHECKS ON VMX CONTROLS AND HOST-STATE AREA */
1186 	/* 26.2.1.2 */
1187 	vmcs_write(HOST_EFER, rdmsr(MSR_EFER));
1188 
1189 	/* 26.2.1.3 */
1190 	vmcs_write(ENT_CONTROLS, ctrl_enter);
1191 	vmcs_write(EXI_CONTROLS, ctrl_exit);
1192 
1193 	/* 26.2.2 */
1194 	vmcs_write(HOST_CR0, read_cr0());
1195 	vmcs_write(HOST_CR3, read_cr3());
1196 	vmcs_write(HOST_CR4, read_cr4());
1197 	vmcs_write(HOST_SYSENTER_EIP, (u64)(&entry_sysenter));
1198 	vmcs_write(HOST_SYSENTER_CS,  KERNEL_CS);
1199 
1200 	/* 26.2.3 */
1201 	vmcs_write(HOST_SEL_CS, KERNEL_CS);
1202 	vmcs_write(HOST_SEL_SS, KERNEL_DS);
1203 	vmcs_write(HOST_SEL_DS, KERNEL_DS);
1204 	vmcs_write(HOST_SEL_ES, KERNEL_DS);
1205 	vmcs_write(HOST_SEL_FS, KERNEL_DS);
1206 	vmcs_write(HOST_SEL_GS, KERNEL_DS);
1207 	vmcs_write(HOST_SEL_TR, TSS_MAIN);
1208 	vmcs_write(HOST_BASE_TR, get_gdt_entry_base(get_tss_descr()));
1209 	vmcs_write(HOST_BASE_GDTR, gdt_descr.base);
1210 	vmcs_write(HOST_BASE_IDTR, idt_descr.base);
1211 	vmcs_write(HOST_BASE_FS, 0);
1212 	vmcs_write(HOST_BASE_GS, rdmsr(MSR_GS_BASE));
1213 
1214 	/* Set other vmcs area */
1215 	vmcs_write(PF_ERROR_MASK, 0);
1216 	vmcs_write(PF_ERROR_MATCH, 0);
1217 	vmcs_write(VMCS_LINK_PTR, ~0ul);
1218 	vmcs_write(VMCS_LINK_PTR_HI, ~0ul);
1219 	vmcs_write(HOST_RIP, (u64)(&vmx_return));
1220 }
1221 
1222 static void init_vmcs_guest(void)
1223 {
1224 	gdt_entry_t *tss_descr = get_tss_descr();
1225 
1226 	/* 26.3 CHECKING AND LOADING GUEST STATE */
1227 	ulong guest_cr0, guest_cr4, guest_cr3;
1228 	/* 26.3.1.1 */
1229 	guest_cr0 = read_cr0();
1230 	guest_cr4 = read_cr4();
1231 	guest_cr3 = read_cr3();
1232 	if (ctrl_enter & ENT_GUEST_64) {
1233 		guest_cr0 |= X86_CR0_PG;
1234 		guest_cr4 |= X86_CR4_PAE;
1235 	}
1236 	if ((ctrl_enter & ENT_GUEST_64) == 0)
1237 		guest_cr4 &= (~X86_CR4_PCIDE);
1238 	if (guest_cr0 & X86_CR0_PG)
1239 		guest_cr0 |= X86_CR0_PE;
1240 	vmcs_write(GUEST_CR0, guest_cr0);
1241 	vmcs_write(GUEST_CR3, guest_cr3);
1242 	vmcs_write(GUEST_CR4, guest_cr4);
1243 	vmcs_write(GUEST_SYSENTER_CS,  KERNEL_CS);
1244 	vmcs_write(GUEST_SYSENTER_ESP, guest_syscall_stack_top);
1245 	vmcs_write(GUEST_SYSENTER_EIP, (u64)(&entry_sysenter));
1246 	vmcs_write(GUEST_DR7, 0);
1247 	vmcs_write(GUEST_EFER, rdmsr(MSR_EFER));
1248 
1249 	/* 26.3.1.2 */
1250 	vmcs_write(GUEST_SEL_CS, KERNEL_CS);
1251 	vmcs_write(GUEST_SEL_SS, KERNEL_DS);
1252 	vmcs_write(GUEST_SEL_DS, KERNEL_DS);
1253 	vmcs_write(GUEST_SEL_ES, KERNEL_DS);
1254 	vmcs_write(GUEST_SEL_FS, KERNEL_DS);
1255 	vmcs_write(GUEST_SEL_GS, KERNEL_DS);
1256 	vmcs_write(GUEST_SEL_TR, TSS_MAIN);
1257 	vmcs_write(GUEST_SEL_LDTR, 0);
1258 
1259 	vmcs_write(GUEST_BASE_CS, 0);
1260 	vmcs_write(GUEST_BASE_ES, 0);
1261 	vmcs_write(GUEST_BASE_SS, 0);
1262 	vmcs_write(GUEST_BASE_DS, 0);
1263 	vmcs_write(GUEST_BASE_FS, 0);
1264 	vmcs_write(GUEST_BASE_GS, rdmsr(MSR_GS_BASE));
1265 	vmcs_write(GUEST_BASE_TR, get_gdt_entry_base(tss_descr));
1266 	vmcs_write(GUEST_BASE_LDTR, 0);
1267 
1268 	vmcs_write(GUEST_LIMIT_CS, 0xFFFFFFFF);
1269 	vmcs_write(GUEST_LIMIT_DS, 0xFFFFFFFF);
1270 	vmcs_write(GUEST_LIMIT_ES, 0xFFFFFFFF);
1271 	vmcs_write(GUEST_LIMIT_SS, 0xFFFFFFFF);
1272 	vmcs_write(GUEST_LIMIT_FS, 0xFFFFFFFF);
1273 	vmcs_write(GUEST_LIMIT_GS, 0xFFFFFFFF);
1274 	vmcs_write(GUEST_LIMIT_LDTR, 0xffff);
1275 	vmcs_write(GUEST_LIMIT_TR, get_gdt_entry_limit(tss_descr));
1276 
1277 	vmcs_write(GUEST_AR_CS, 0xa09b);
1278 	vmcs_write(GUEST_AR_DS, 0xc093);
1279 	vmcs_write(GUEST_AR_ES, 0xc093);
1280 	vmcs_write(GUEST_AR_FS, 0xc093);
1281 	vmcs_write(GUEST_AR_GS, 0xc093);
1282 	vmcs_write(GUEST_AR_SS, 0xc093);
1283 	vmcs_write(GUEST_AR_LDTR, 0x82);
1284 	vmcs_write(GUEST_AR_TR, 0x8b);
1285 
1286 	/* 26.3.1.3 */
1287 	vmcs_write(GUEST_BASE_GDTR, gdt_descr.base);
1288 	vmcs_write(GUEST_BASE_IDTR, idt_descr.base);
1289 	vmcs_write(GUEST_LIMIT_GDTR, gdt_descr.limit);
1290 	vmcs_write(GUEST_LIMIT_IDTR, idt_descr.limit);
1291 
1292 	/* 26.3.1.4 */
1293 	vmcs_write(GUEST_RIP, (u64)(&guest_entry));
1294 	vmcs_write(GUEST_RSP, guest_stack_top);
1295 	vmcs_write(GUEST_RFLAGS, X86_EFLAGS_FIXED);
1296 
1297 	/* 26.3.1.5 */
1298 	vmcs_write(GUEST_ACTV_STATE, ACTV_ACTIVE);
1299 	vmcs_write(GUEST_INTR_STATE, 0);
1300 }
1301 
1302 int init_vmcs(struct vmcs **vmcs)
1303 {
1304 	*vmcs = alloc_page();
1305 	(*vmcs)->hdr.revision_id = basic.revision;
1306 	/* vmclear first to init vmcs */
1307 	if (vmcs_clear(*vmcs)) {
1308 		printf("%s : vmcs_clear error\n", __func__);
1309 		return 1;
1310 	}
1311 
1312 	if (make_vmcs_current(*vmcs)) {
1313 		printf("%s : make_vmcs_current error\n", __func__);
1314 		return 1;
1315 	}
1316 
1317 	/* All settings to pin/exit/enter/cpu
1318 	   control fields should be placed here */
1319 	ctrl_pin |= PIN_EXTINT | PIN_NMI | PIN_VIRT_NMI;
1320 	ctrl_exit = EXI_LOAD_EFER | EXI_HOST_64;
1321 	ctrl_enter = (ENT_LOAD_EFER | ENT_GUEST_64);
1322 	/* DIsable IO instruction VMEXIT now */
1323 	ctrl_cpu[0] &= (~(CPU_IO | CPU_IO_BITMAP));
1324 	ctrl_cpu[1] = 0;
1325 
1326 	ctrl_pin = (ctrl_pin | ctrl_pin_rev.set) & ctrl_pin_rev.clr;
1327 	ctrl_enter = (ctrl_enter | ctrl_enter_rev.set) & ctrl_enter_rev.clr;
1328 	ctrl_exit = (ctrl_exit | ctrl_exit_rev.set) & ctrl_exit_rev.clr;
1329 	ctrl_cpu[0] = (ctrl_cpu[0] | ctrl_cpu_rev[0].set) & ctrl_cpu_rev[0].clr;
1330 
1331 	init_vmcs_ctrl();
1332 	init_vmcs_host();
1333 	init_vmcs_guest();
1334 	return 0;
1335 }
1336 
1337 void enable_vmx(void)
1338 {
1339 	bool vmx_enabled =
1340 		rdmsr(MSR_IA32_FEATURE_CONTROL) &
1341 		FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1342 
1343 	if (!vmx_enabled) {
1344 		wrmsr(MSR_IA32_FEATURE_CONTROL,
1345 				FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX |
1346 				FEATURE_CONTROL_LOCKED);
1347 	}
1348 }
1349 
1350 static void init_vmx_caps(void)
1351 {
1352 	basic.val = rdmsr(MSR_IA32_VMX_BASIC);
1353 	ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PIN
1354 			: MSR_IA32_VMX_PINBASED_CTLS);
1355 	ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT
1356 			: MSR_IA32_VMX_EXIT_CTLS);
1357 	ctrl_enter_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_ENTRY
1358 			: MSR_IA32_VMX_ENTRY_CTLS);
1359 	ctrl_cpu_rev[0].val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PROC
1360 			: MSR_IA32_VMX_PROCBASED_CTLS);
1361 	if ((ctrl_cpu_rev[0].clr & CPU_SECONDARY) != 0)
1362 		ctrl_cpu_rev[1].val = rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2);
1363 	else
1364 		ctrl_cpu_rev[1].val = 0;
1365 	if ((ctrl_cpu_rev[1].clr & (CPU_EPT | CPU_VPID)) != 0)
1366 		ept_vpid.val = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
1367 	else
1368 		ept_vpid.val = 0;
1369 }
1370 
1371 void init_vmx(u64 *vmxon_region)
1372 {
1373 	ulong fix_cr0_set, fix_cr0_clr;
1374 	ulong fix_cr4_set, fix_cr4_clr;
1375 
1376 	fix_cr0_set =  rdmsr(MSR_IA32_VMX_CR0_FIXED0);
1377 	fix_cr0_clr =  rdmsr(MSR_IA32_VMX_CR0_FIXED1);
1378 	fix_cr4_set =  rdmsr(MSR_IA32_VMX_CR4_FIXED0);
1379 	fix_cr4_clr = rdmsr(MSR_IA32_VMX_CR4_FIXED1);
1380 
1381 	write_cr0((read_cr0() & fix_cr0_clr) | fix_cr0_set);
1382 	write_cr4((read_cr4() & fix_cr4_clr) | fix_cr4_set | X86_CR4_VMXE);
1383 
1384 	*vmxon_region = basic.revision;
1385 }
1386 
1387 static void alloc_bsp_vmx_pages(void)
1388 {
1389 	bsp_vmxon_region = alloc_page();
1390 	guest_stack_top = (uintptr_t)alloc_page() + PAGE_SIZE;
1391 	guest_syscall_stack_top = (uintptr_t)alloc_page() + PAGE_SIZE;
1392 	vmcs_root = alloc_page();
1393 }
1394 
1395 static void init_bsp_vmx(void)
1396 {
1397 	init_vmx_caps();
1398 	alloc_bsp_vmx_pages();
1399 	init_vmx(bsp_vmxon_region);
1400 }
1401 
/*
 * Callback for test_for_exception(): run VMXON followed by VMXOFF and
 * assert that both succeed.  The data argument is unused.
 */
static void do_vmxon_off(void *data)
{
	(void)data;

	TEST_ASSERT(!vmx_on());
	TEST_ASSERT(!vmx_off());
}
1407 
1408 static void do_write_feature_control(void *data)
1409 {
1410 	wrmsr(MSR_IA32_FEATURE_CONTROL, 0);
1411 }
1412 
1413 static int test_vmx_feature_control(void)
1414 {
1415 	u64 ia32_feature_control;
1416 	bool vmx_enabled;
1417 	bool feature_control_locked;
1418 
1419 	ia32_feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
1420 	vmx_enabled =
1421 		ia32_feature_control & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1422 	feature_control_locked =
1423 		ia32_feature_control & FEATURE_CONTROL_LOCKED;
1424 
1425 	if (vmx_enabled && feature_control_locked) {
1426 		printf("VMX enabled and locked by BIOS\n");
1427 		return 0;
1428 	} else if (feature_control_locked) {
1429 		printf("ERROR: VMX locked out by BIOS!?\n");
1430 		return 1;
1431 	}
1432 
1433 	wrmsr(MSR_IA32_FEATURE_CONTROL, 0);
1434 	report(test_for_exception(GP_VECTOR, &do_vmxon_off, NULL),
1435 	       "test vmxon with FEATURE_CONTROL cleared");
1436 
1437 	wrmsr(MSR_IA32_FEATURE_CONTROL, FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
1438 	report(test_for_exception(GP_VECTOR, &do_vmxon_off, NULL),
1439 	       "test vmxon without FEATURE_CONTROL lock");
1440 
1441 	wrmsr(MSR_IA32_FEATURE_CONTROL,
1442 		  FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX |
1443 		  FEATURE_CONTROL_LOCKED);
1444 
1445 	ia32_feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
1446 	vmx_enabled =
1447 		ia32_feature_control & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1448 	report(vmx_enabled, "test enable VMX in FEATURE_CONTROL");
1449 
1450 	report(test_for_exception(GP_VECTOR, &do_write_feature_control, NULL),
1451 	       "test FEATURE_CONTROL lock bit");
1452 
1453 	return !vmx_enabled;
1454 }
1455 
1456 
/* Write val to CR0 when cr_number is 0, to CR4 for any other cr_number. */
static void write_cr(int cr_number, unsigned long val)
{
	if (cr_number == 0) {
		write_cr0(val);
		return;
	}

	write_cr4(val);
}
1464 
/*
 * Like write_cr(), but goes through the *_safe accessors and hands back
 * their return value (CR0 when cr_number is 0, CR4 otherwise).
 */
static int write_cr_safe(int cr_number, unsigned long val)
{
	return cr_number ? write_cr4_safe(val) : write_cr0_safe(val);
}
1472 
1473 static int test_vmxon_bad_cr(int cr_number, unsigned long orig_cr,
1474 			     unsigned long *flexible_bits)
1475 {
1476 	unsigned long required1, disallowed1, val, bit;
1477 	int ret, i;
1478 
1479 	if (!cr_number) {
1480 		required1 =  rdmsr(MSR_IA32_VMX_CR0_FIXED0);
1481 		disallowed1 = ~rdmsr(MSR_IA32_VMX_CR0_FIXED1);
1482 	} else {
1483 		required1 =  rdmsr(MSR_IA32_VMX_CR4_FIXED0);
1484 		disallowed1 = ~rdmsr(MSR_IA32_VMX_CR4_FIXED1);
1485 	}
1486 
1487 	*flexible_bits = 0;
1488 
1489 	for (i = 0; i < BITS_PER_LONG; i++) {
1490 		bit = BIT(i);
1491 
1492 		/*
1493 		 * Don't touch bits that will affect the current paging mode,
1494 		 * toggling them will send the test into the weeds before it
1495 		 * gets to VMXON.  nVMX tests are 64-bit only, so CR4.PAE is
1496 		 * guaranteed to be '1', i.e. PSE is fair game.  PKU/PKS are
1497 		 * also fair game as KVM doesn't configure any keys.  SMAP and
1498 		 * SMEP are off limits because the page tables have the USER
1499 		 * bit set at all levels.
1500 		 */
1501 		if ((cr_number == 0 && (bit == X86_CR0_PE || bit == X86_CR0_PG)) ||
1502 		    (cr_number == 4 && (bit == X86_CR4_PAE || bit == X86_CR4_SMAP ||
1503 					bit == X86_CR4_SMEP)))
1504 			continue;
1505 
1506 		if (!(bit & required1) && !(bit & disallowed1)) {
1507 			if (!write_cr_safe(cr_number, orig_cr ^ bit)) {
1508 				*flexible_bits |= bit;
1509 				write_cr(cr_number, orig_cr);
1510 			}
1511 			continue;
1512 		}
1513 
1514 		assert(!(required1 & disallowed1));
1515 
1516 		if (required1 & bit)
1517 			val = orig_cr & ~bit;
1518 		else
1519 			val = orig_cr | bit;
1520 
1521 		if (write_cr_safe(cr_number, val))
1522 			continue;
1523 
1524 		ret = vmx_on();
1525 		report(ret == UD_VECTOR,
1526 		       "VMXON with CR%d bit %d %s should #UD, got '%d'",
1527 		       cr_number, i, (required1 & bit) ? "cleared" : "set", ret);
1528 
1529 		write_cr(cr_number, orig_cr);
1530 
1531 		if (ret <= 0)
1532 			return 1;
1533 	}
1534 	return 0;
1535 }
1536 
1537 static int test_vmxon(void)
1538 {
1539 	unsigned long orig_cr0, flexible_cr0, orig_cr4, flexible_cr4;
1540 	int width = cpuid_maxphyaddr();
1541 	u64 *vmxon_region;
1542 	int ret;
1543 
1544 	orig_cr0 = read_cr0();
1545 	if (test_vmxon_bad_cr(0, orig_cr0, &flexible_cr0))
1546 		return 1;
1547 
1548 	orig_cr4 = read_cr4();
1549 	if (test_vmxon_bad_cr(4, orig_cr4, &flexible_cr4))
1550 		return 1;
1551 
1552 	/* Unaligned page access */
1553 	vmxon_region = (u64 *)((intptr_t)bsp_vmxon_region + 1);
1554 	ret = __vmxon_safe(vmxon_region);
1555 	report(ret < 0, "test vmxon with unaligned vmxon region");
1556 	if (ret >= 0)
1557 		return 1;
1558 
1559 	/* gpa bits beyond physical address width are set*/
1560 	vmxon_region = (u64 *)((intptr_t)bsp_vmxon_region | ((u64)1 << (width+1)));
1561 	ret = __vmxon_safe(vmxon_region);
1562 	report(ret < 0, "test vmxon with bits set beyond physical address width");
1563 	if (ret >= 0)
1564 		return 1;
1565 
1566 	/* invalid revision identifier */
1567 	*bsp_vmxon_region = 0xba9da9;
1568 	ret = vmxon_safe();
1569 	report(ret < 0, "test vmxon with invalid revision identifier");
1570 	if (ret >= 0)
1571 		return 1;
1572 
1573 	/* and finally a valid region, with valid-but-tweaked cr0/cr4 */
1574 	write_cr0(orig_cr0 ^ flexible_cr0);
1575 	write_cr4(orig_cr4 ^ flexible_cr4);
1576 	*bsp_vmxon_region = basic.revision;
1577 	ret = vmxon_safe();
1578 	report(!ret, "test vmxon with valid vmxon region");
1579 	write_cr0(orig_cr0);
1580 	write_cr4(orig_cr4);
1581 	return ret;
1582 }
1583 
1584 static void test_vmptrld(void)
1585 {
1586 	struct vmcs *vmcs, *tmp_root;
1587 	int width = cpuid_maxphyaddr();
1588 
1589 	vmcs = alloc_page();
1590 	vmcs->hdr.revision_id = basic.revision;
1591 
1592 	/* Unaligned page access */
1593 	tmp_root = (struct vmcs *)((intptr_t)vmcs + 1);
1594 	report(make_vmcs_current(tmp_root) == 1,
1595 	       "test vmptrld with unaligned vmcs");
1596 
1597 	/* gpa bits beyond physical address width are set*/
1598 	tmp_root = (struct vmcs *)((intptr_t)vmcs |
1599 				   ((u64)1 << (width+1)));
1600 	report(make_vmcs_current(tmp_root) == 1,
1601 	       "test vmptrld with vmcs address bits set beyond physical address width");
1602 
1603 	/* Pass VMXON region */
1604 	assert(!vmcs_clear(vmcs));
1605 	assert(!make_vmcs_current(vmcs));
1606 	tmp_root = (struct vmcs *)bsp_vmxon_region;
1607 	report(make_vmcs_current(tmp_root) == 1,
1608 	       "test vmptrld with vmxon region");
1609 	report(vmcs_read(VMX_INST_ERROR) == VMXERR_VMPTRLD_VMXON_POINTER,
1610 	       "test vmptrld with vmxon region vm-instruction error");
1611 
1612 	report(make_vmcs_current(vmcs) == 0,
1613 	       "test vmptrld with valid vmcs region");
1614 }
1615 
1616 static void test_vmptrst(void)
1617 {
1618 	int ret;
1619 	struct vmcs *vmcs1, *vmcs2;
1620 
1621 	vmcs1 = alloc_page();
1622 	init_vmcs(&vmcs1);
1623 	ret = vmcs_save(&vmcs2);
1624 	report((!ret) && (vmcs1 == vmcs2), "test vmptrst");
1625 }
1626 
1627 struct vmx_ctl_msr {
1628 	const char *name;
1629 	u32 index, true_index;
1630 	u32 default1;
1631 } vmx_ctl_msr[] = {
1632 	{ "MSR_IA32_VMX_PINBASED_CTLS", MSR_IA32_VMX_PINBASED_CTLS,
1633 	  MSR_IA32_VMX_TRUE_PIN, 0x16 },
1634 	{ "MSR_IA32_VMX_PROCBASED_CTLS", MSR_IA32_VMX_PROCBASED_CTLS,
1635 	  MSR_IA32_VMX_TRUE_PROC, 0x401e172 },
1636 	{ "MSR_IA32_VMX_PROCBASED_CTLS2", MSR_IA32_VMX_PROCBASED_CTLS2,
1637 	  MSR_IA32_VMX_PROCBASED_CTLS2, 0 },
1638 	{ "MSR_IA32_VMX_EXIT_CTLS", MSR_IA32_VMX_EXIT_CTLS,
1639 	  MSR_IA32_VMX_TRUE_EXIT, 0x36dff },
1640 	{ "MSR_IA32_VMX_ENTRY_CTLS", MSR_IA32_VMX_ENTRY_CTLS,
1641 	  MSR_IA32_VMX_TRUE_ENTRY, 0x11ff },
1642 };
1643 
1644 static void test_vmx_caps(void)
1645 {
1646 	u64 val, default1, fixed0, fixed1;
1647 	union vmx_ctrl_msr ctrl, true_ctrl;
1648 	unsigned int n;
1649 	bool ok;
1650 
1651 	printf("\nTest suite: VMX capability reporting\n");
1652 
1653 	report((basic.revision & (1ul << 31)) == 0 &&
1654 	       basic.size > 0 && basic.size <= 4096 &&
1655 	       (basic.type == 0 || basic.type == 6) &&
1656 	       basic.reserved1 == 0 && basic.reserved2 == 0,
1657 	       "MSR_IA32_VMX_BASIC");
1658 
1659 	val = rdmsr(MSR_IA32_VMX_MISC);
1660 	report((!(ctrl_cpu_rev[1].clr & CPU_URG) || val & (1ul << 5)) &&
1661 	       ((val >> 16) & 0x1ff) <= 256 &&
1662 	       (val & 0x80007e00) == 0,
1663 	       "MSR_IA32_VMX_MISC");
1664 
1665 	for (n = 0; n < ARRAY_SIZE(vmx_ctl_msr); n++) {
1666 		ctrl.val = rdmsr(vmx_ctl_msr[n].index);
1667 		default1 = vmx_ctl_msr[n].default1;
1668 		ok = (ctrl.set & default1) == default1;
1669 		ok = ok && (ctrl.set & ~ctrl.clr) == 0;
1670 		if (ok && basic.ctrl) {
1671 			true_ctrl.val = rdmsr(vmx_ctl_msr[n].true_index);
1672 			ok = ctrl.clr == true_ctrl.clr;
1673 			ok = ok && ctrl.set == (true_ctrl.set | default1);
1674 		}
1675 		report(ok, "%s", vmx_ctl_msr[n].name);
1676 	}
1677 
1678 	fixed0 = rdmsr(MSR_IA32_VMX_CR0_FIXED0);
1679 	fixed1 = rdmsr(MSR_IA32_VMX_CR0_FIXED1);
1680 	report(((fixed0 ^ fixed1) & ~fixed1) == 0,
1681 	       "MSR_IA32_VMX_IA32_VMX_CR0_FIXED0/1");
1682 
1683 	fixed0 = rdmsr(MSR_IA32_VMX_CR4_FIXED0);
1684 	fixed1 = rdmsr(MSR_IA32_VMX_CR4_FIXED1);
1685 	report(((fixed0 ^ fixed1) & ~fixed1) == 0,
1686 	       "MSR_IA32_VMX_IA32_VMX_CR4_FIXED0/1");
1687 
1688 	val = rdmsr(MSR_IA32_VMX_VMCS_ENUM);
1689 	report((val & VMCS_FIELD_INDEX_MASK) >= 0x2a &&
1690 	       (val & 0xfffffffffffffc01Ull) == 0,
1691 	       "MSR_IA32_VMX_VMCS_ENUM");
1692 
1693 	fixed0 = -1ull;
1694 	fixed0 &= ~(EPT_CAP_EXEC_ONLY |
1695 		    EPT_CAP_PWL4 |
1696 		    EPT_CAP_PWL5 |
1697 		    EPT_CAP_UC |
1698 		    EPT_CAP_WB |
1699 		    EPT_CAP_2M_PAGE |
1700 		    EPT_CAP_1G_PAGE |
1701 		    EPT_CAP_INVEPT |
1702 		    EPT_CAP_AD_FLAG |
1703 		    EPT_CAP_ADV_EPT_INFO |
1704 		    EPT_CAP_INVEPT_SINGLE |
1705 		    EPT_CAP_INVEPT_ALL |
1706 		    VPID_CAP_INVVPID |
1707 		    VPID_CAP_INVVPID_ADDR |
1708 		    VPID_CAP_INVVPID_CXTGLB |
1709 		    VPID_CAP_INVVPID_ALL |
1710 		    VPID_CAP_INVVPID_CXTLOC);
1711 
1712 	val = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
1713 	report((val & fixed0) == 0,
1714 	       "MSR_IA32_VMX_EPT_VPID_CAP");
1715 }
1716 
1717 /* This function can only be called in guest */
1718 void __attribute__((__used__)) hypercall(u32 hypercall_no)
1719 {
1720 	u64 val = 0;
1721 	val = (hypercall_no & HYPERCALL_MASK) | HYPERCALL_BIT;
1722 	hypercall_field = val;
1723 	asm volatile("vmcall\n\t");
1724 }
1725 
1726 static bool is_hypercall(union exit_reason exit_reason)
1727 {
1728 	return exit_reason.basic == VMX_VMCALL &&
1729 	       (hypercall_field & HYPERCALL_BIT);
1730 }
1731 
1732 static int handle_hypercall(void)
1733 {
1734 	ulong hypercall_no;
1735 
1736 	hypercall_no = hypercall_field & HYPERCALL_MASK;
1737 	hypercall_field = 0;
1738 	switch (hypercall_no) {
1739 	case HYPERCALL_VMEXIT:
1740 		return VMX_TEST_VMEXIT;
1741 	case HYPERCALL_VMABORT:
1742 		return VMX_TEST_VMABORT;
1743 	case HYPERCALL_VMSKIP:
1744 		return VMX_TEST_VMSKIP;
1745 	default:
1746 		printf("ERROR : Invalid hypercall number : %ld\n", hypercall_no);
1747 	}
1748 	return VMX_TEST_EXIT;
1749 }
1750 
1751 static void continue_abort(void)
1752 {
1753 	assert(!in_guest);
1754 	printf("Host was here when guest aborted:\n");
1755 	dump_stack();
1756 	longjmp(abort_target, 1);
1757 	abort();
1758 }
1759 
1760 void __abort_test(void)
1761 {
1762 	if (in_guest)
1763 		hypercall(HYPERCALL_VMABORT);
1764 	else
1765 		longjmp(abort_target, 1);
1766 	abort();
1767 }
1768 
1769 static void continue_skip(void)
1770 {
1771 	assert(!in_guest);
1772 	longjmp(abort_target, 1);
1773 	abort();
1774 }
1775 
1776 void test_skip(const char *msg)
1777 {
1778 	printf("%s skipping test: %s\n", in_guest ? "Guest" : "Host", msg);
1779 	if (in_guest)
1780 		hypercall(HYPERCALL_VMABORT);
1781 	else
1782 		longjmp(abort_target, 1);
1783 	abort();
1784 }
1785 
1786 static int exit_handler(union exit_reason exit_reason)
1787 {
1788 	int ret;
1789 
1790 	current->exits++;
1791 	regs.rflags = vmcs_read(GUEST_RFLAGS);
1792 	if (is_hypercall(exit_reason))
1793 		ret = handle_hypercall();
1794 	else
1795 		ret = current->exit_handler(exit_reason);
1796 	vmcs_write(GUEST_RFLAGS, regs.rflags);
1797 
1798 	return ret;
1799 }
1800 
1801 /*
1802  * Tries to enter the guest, populates @result with VM-Fail, VM-Exit, entered,
1803  * etc...
1804  */
1805 static noinline void vmx_enter_guest(struct vmentry_result *result)
1806 {
1807 	memset(result, 0, sizeof(*result));
1808 
1809 	in_guest = 1;
1810 	asm volatile (
1811 		"mov %[HOST_RSP], %%rdi\n\t"
1812 		"vmwrite %%rsp, %%rdi\n\t"
1813 		LOAD_GPR_C
1814 		"cmpb $0, %[launched]\n\t"
1815 		"jne 1f\n\t"
1816 		"vmlaunch\n\t"
1817 		"jmp 2f\n\t"
1818 		"1: "
1819 		"vmresume\n\t"
1820 		"2: "
1821 		SAVE_GPR_C
1822 		"pushf\n\t"
1823 		"pop %%rdi\n\t"
1824 		"mov %%rdi, %[vm_fail_flags]\n\t"
1825 		"movl $1, %[vm_fail]\n\t"
1826 		"jmp 3f\n\t"
1827 		"vmx_return:\n\t"
1828 		SAVE_GPR_C
1829 		"3: \n\t"
1830 		: [vm_fail]"+m"(result->vm_fail),
1831 		  [vm_fail_flags]"=m"(result->flags)
1832 		: [launched]"m"(launched), [HOST_RSP]"i"(HOST_RSP)
1833 		: "rdi", "memory", "cc"
1834 	);
1835 	in_guest = 0;
1836 
1837 	result->vmlaunch = !launched;
1838 	result->instr = launched ? "vmresume" : "vmlaunch";
1839 	result->exit_reason.full = result->vm_fail ? 0xdead :
1840 						     vmcs_read(EXI_REASON);
1841 	result->entered = !result->vm_fail &&
1842 			  !result->exit_reason.failed_vmentry;
1843 }
1844 
1845 static int vmx_run(void)
1846 {
1847 	struct vmentry_result result;
1848 	u32 ret;
1849 
1850 	while (1) {
1851 		vmx_enter_guest(&result);
1852 		if (result.entered) {
1853 			/*
1854 			 * VMCS isn't in "launched" state if there's been any
1855 			 * entry failure (early or otherwise).
1856 			 */
1857 			launched = 1;
1858 			ret = exit_handler(result.exit_reason);
1859 		} else if (current->entry_failure_handler) {
1860 			ret = current->entry_failure_handler(&result);
1861 		} else {
1862 			ret = VMX_TEST_EXIT;
1863 		}
1864 
1865 		switch (ret) {
1866 		case VMX_TEST_RESUME:
1867 			continue;
1868 		case VMX_TEST_VMEXIT:
1869 			guest_finished = 1;
1870 			return 0;
1871 		case VMX_TEST_EXIT:
1872 			break;
1873 		default:
1874 			printf("ERROR : Invalid %s_handler return val %d.\n",
1875 			       result.entered ? "exit" : "entry_failure",
1876 			       ret);
1877 			break;
1878 		}
1879 
1880 		if (result.entered)
1881 			print_vmexit_info(result.exit_reason);
1882 		else
1883 			print_vmentry_failure_info(&result);
1884 		abort();
1885 	}
1886 }
1887 
1888 static void run_teardown_step(struct test_teardown_step *step)
1889 {
1890 	step->func(step->data);
1891 }
1892 
1893 static int test_run(struct vmx_test *test)
1894 {
1895 	int r;
1896 
1897 	/* Validate V2 interface. */
1898 	if (test->v2) {
1899 		int ret = 0;
1900 		if (test->init || test->guest_main || test->exit_handler ||
1901 		    test->syscall_handler) {
1902 			report_fail("V2 test cannot specify V1 callbacks.");
1903 			ret = 1;
1904 		}
1905 		if (ret)
1906 			return ret;
1907 	}
1908 
1909 	if (test->name == NULL)
1910 		test->name = "(no name)";
1911 	if (vmx_on()) {
1912 		printf("%s : vmxon failed.\n", __func__);
1913 		return 1;
1914 	}
1915 
1916 	init_vmcs(&(test->vmcs));
1917 	/* Directly call test->init is ok here, init_vmcs has done
1918 	   vmcs init, vmclear and vmptrld*/
1919 	if (test->init && test->init(test->vmcs) != VMX_TEST_START)
1920 		goto out;
1921 	teardown_count = 0;
1922 	v2_guest_main = NULL;
1923 	test->exits = 0;
1924 	current = test;
1925 	regs = test->guest_regs;
1926 	vmcs_write(GUEST_RFLAGS, regs.rflags | X86_EFLAGS_FIXED);
1927 	launched = 0;
1928 	guest_finished = 0;
1929 	printf("\nTest suite: %s\n", test->name);
1930 
1931 	r = setjmp(abort_target);
1932 	if (r) {
1933 		assert(!in_guest);
1934 		goto out;
1935 	}
1936 
1937 
1938 	if (test->v2)
1939 		test->v2();
1940 	else
1941 		vmx_run();
1942 
1943 	while (teardown_count > 0)
1944 		run_teardown_step(&teardown_steps[--teardown_count]);
1945 
1946 	if (launched && !guest_finished)
1947 		report_fail("Guest didn't run to completion.");
1948 
1949 out:
1950 	if (vmx_off()) {
1951 		printf("%s : vmxoff failed.\n", __func__);
1952 		return 1;
1953 	}
1954 	return 0;
1955 }
1956 
1957 /*
1958  * Add a teardown step. Executed after the test's main function returns.
1959  * Teardown steps executed in reverse order.
1960  */
1961 void test_add_teardown(test_teardown_func func, void *data)
1962 {
1963 	struct test_teardown_step *step;
1964 
1965 	TEST_ASSERT_MSG(teardown_count < MAX_TEST_TEARDOWN_STEPS,
1966 			"There are already %d teardown steps.",
1967 			teardown_count);
1968 	step = &teardown_steps[teardown_count++];
1969 	step->func = func;
1970 	step->data = data;
1971 }
1972 
1973 static void __test_set_guest(test_guest_func func)
1974 {
1975 	assert(current->v2);
1976 	v2_guest_main = func;
1977 }
1978 
1979 /*
1980  * Set the target of the first enter_guest call. Can only be called once per
1981  * test. Must be called before first enter_guest call.
1982  */
1983 void test_set_guest(test_guest_func func)
1984 {
1985 	TEST_ASSERT_MSG(!v2_guest_main, "Already set guest func.");
1986 	__test_set_guest(func);
1987 }
1988 
1989 /*
1990  * Set the target of the enter_guest call and reset the RIP so 'func' will
1991  * start from the beginning.  This can be called multiple times per test.
1992  */
1993 void test_override_guest(test_guest_func func)
1994 {
1995 	__test_set_guest(func);
1996 	init_vmcs_guest();
1997 }
1998 
1999 void test_set_guest_finished(void)
2000 {
2001 	guest_finished = 1;
2002 }
2003 
2004 static void check_for_guest_termination(union exit_reason exit_reason)
2005 {
2006 	if (is_hypercall(exit_reason)) {
2007 		int ret;
2008 
2009 		ret = handle_hypercall();
2010 		switch (ret) {
2011 		case VMX_TEST_VMEXIT:
2012 			guest_finished = 1;
2013 			break;
2014 		case VMX_TEST_VMABORT:
2015 			continue_abort();
2016 			break;
2017 		case VMX_TEST_VMSKIP:
2018 			continue_skip();
2019 			break;
2020 		default:
2021 			printf("ERROR : Invalid handle_hypercall return %d.\n",
2022 			       ret);
2023 			abort();
2024 		}
2025 	}
2026 }
2027 
2028 /*
2029  * Enters the guest (or launches it for the first time). Error to call once the
2030  * guest has returned (i.e., run past the end of its guest() function).
2031  */
2032 void __enter_guest(u8 abort_flag, struct vmentry_result *result)
2033 {
2034 	TEST_ASSERT_MSG(v2_guest_main,
2035 			"Never called test_set_guest_func!");
2036 
2037 	TEST_ASSERT_MSG(!guest_finished,
2038 			"Called enter_guest() after guest returned.");
2039 
2040 	vmx_enter_guest(result);
2041 
2042 	if (result->vm_fail) {
2043 		if (abort_flag & ABORT_ON_EARLY_VMENTRY_FAIL)
2044 			goto do_abort;
2045 		return;
2046 	}
2047 	if (result->exit_reason.failed_vmentry) {
2048 		if ((abort_flag & ABORT_ON_INVALID_GUEST_STATE) ||
2049 		    result->exit_reason.basic != VMX_FAIL_STATE)
2050 			goto do_abort;
2051 		return;
2052 	}
2053 
2054 	launched = 1;
2055 	check_for_guest_termination(result->exit_reason);
2056 	return;
2057 
2058 do_abort:
2059 	print_vmentry_failure_info(result);
2060 	abort();
2061 }
2062 
2063 void enter_guest_with_bad_controls(void)
2064 {
2065 	struct vmentry_result result;
2066 
2067 	TEST_ASSERT_MSG(v2_guest_main,
2068 			"Never called test_set_guest_func!");
2069 
2070 	TEST_ASSERT_MSG(!guest_finished,
2071 			"Called enter_guest() after guest returned.");
2072 
2073 	__enter_guest(ABORT_ON_INVALID_GUEST_STATE, &result);
2074 	report(result.vm_fail, "VM-Fail occurred as expected");
2075 	report((result.flags & VMX_ENTRY_FLAGS) == X86_EFLAGS_ZF,
2076                "FLAGS set correctly on VM-Fail");
2077 	report(vmcs_read(VMX_INST_ERROR) == VMXERR_ENTRY_INVALID_CONTROL_FIELD,
2078 	       "VM-Inst Error # is %d (VM entry with invalid control field(s))",
2079 	       VMXERR_ENTRY_INVALID_CONTROL_FIELD);
2080 }
2081 
2082 void enter_guest(void)
2083 {
2084 	struct vmentry_result result;
2085 
2086 	__enter_guest(ABORT_ON_EARLY_VMENTRY_FAIL |
2087 		      ABORT_ON_INVALID_GUEST_STATE, &result);
2088 }
2089 
2090 extern struct vmx_test vmx_tests[];
2091 
2092 static bool
2093 test_wanted(const char *name, const char *filters[], int filter_count)
2094 {
2095 	int i;
2096 	bool positive = false;
2097 	bool match = false;
2098 	char clean_name[strlen(name) + 1];
2099 	char *c;
2100 	const char *n;
2101 
2102 	printf("filter = %s, test = %s\n", filters[0], name);
2103 
2104 	/* Replace spaces with underscores. */
2105 	n = name;
2106 	c = &clean_name[0];
2107 	do *c++ = (*n == ' ') ? '_' : *n;
2108 	while (*n++);
2109 
2110 	for (i = 0; i < filter_count; i++) {
2111 		const char *filter = filters[i];
2112 
2113 		if (filter[0] == '-') {
2114 			if (simple_glob(clean_name, filter + 1))
2115 				return false;
2116 		} else {
2117 			positive = true;
2118 			match |= simple_glob(clean_name, filter);
2119 		}
2120 	}
2121 
2122 	if (!positive || match) {
2123 		matched++;
2124 		return true;
2125 	} else {
2126 		return false;
2127 	}
2128 }
2129 
2130 int main(int argc, const char *argv[])
2131 {
2132 	int i = 0;
2133 
2134 	setup_vm();
2135 	hypercall_field = 0;
2136 
2137 	/* We want xAPIC mode to test MMIO passthrough from L1 (us) to L2.  */
2138 	smp_reset_apic();
2139 
2140 	argv++;
2141 	argc--;
2142 
2143 	if (!this_cpu_has(X86_FEATURE_VMX)) {
2144 		printf("WARNING: vmx not supported, add '-cpu host'\n");
2145 		goto exit;
2146 	}
2147 	init_bsp_vmx();
2148 	if (test_wanted("test_vmx_feature_control", argv, argc)) {
2149 		/* Sets MSR_IA32_FEATURE_CONTROL to 0x5 */
2150 		if (test_vmx_feature_control() != 0)
2151 			goto exit;
2152 	} else {
2153 		enable_vmx();
2154 	}
2155 
2156 	if (test_wanted("test_vmxon", argv, argc)) {
2157 		/* Enables VMX */
2158 		if (test_vmxon() != 0)
2159 			goto exit;
2160 	} else {
2161 		if (vmx_on()) {
2162 			report_fail("vmxon");
2163 			goto exit;
2164 		}
2165 	}
2166 
2167 	if (test_wanted("test_vmptrld", argv, argc))
2168 		test_vmptrld();
2169 	if (test_wanted("test_vmclear", argv, argc))
2170 		test_vmclear();
2171 	if (test_wanted("test_vmptrst", argv, argc))
2172 		test_vmptrst();
2173 	if (test_wanted("test_vmwrite_vmread", argv, argc))
2174 		test_vmwrite_vmread();
2175 	if (test_wanted("test_vmcs_high", argv, argc))
2176 		test_vmcs_high();
2177 	if (test_wanted("test_vmcs_lifecycle", argv, argc))
2178 		test_vmcs_lifecycle();
2179 	if (test_wanted("test_vmx_caps", argv, argc))
2180 		test_vmx_caps();
2181 	if (test_wanted("test_vmread_flags_touch", argv, argc))
2182 		test_vmread_flags_touch();
2183 	if (test_wanted("test_vmwrite_flags_touch", argv, argc))
2184 		test_vmwrite_flags_touch();
2185 
2186 	/* Balance vmxon from test_vmxon. */
2187 	vmx_off();
2188 
2189 	for (; vmx_tests[i].name != NULL; i++) {
2190 		if (!test_wanted(vmx_tests[i].name, argv, argc))
2191 			continue;
2192 		if (test_run(&vmx_tests[i]))
2193 			goto exit;
2194 	}
2195 
2196 	if (!matched)
2197 		report(matched, "command line didn't match any tests!");
2198 
2199 exit:
2200 	return report_summary();
2201 }
2202