xref: /kvm-unit-tests/x86/vmx.c (revision 6163f75d09a0a96a5c3db82dd768b13f79629c00)
1 /*
2  * x86/vmx.c : Framework for testing nested virtualization
3  *	This is a framework to test nested VMX for KVM, which
4  * 	started as a project of GSoC 2013. All test cases should
5  *	be located in x86/vmx_tests.c and framework related
6  *	functions should be in this file.
7  *
8  * How to write test cases?
9  *	Add callbacks of test suite in variant "vmx_tests". You can
10  *	write:
11  *		1. init function used for initializing test suite
12  *		2. main function for codes running in L2 guest,
13  *		3. exit_handler to handle vmexit of L2 to L1
14  *		4. syscall handler to handle L2 syscall vmexit
15  *		5. vmenter fail handler to handle direct failure of vmenter
16  *		6. guest_regs is loaded when vmenter and saved when
17  *			vmexit, you can read and set it in exit_handler
18  *	If no special function is needed for a test suite, use
 *	corresponding basic_* functions as callback. More handlers
20  *	can be added to "vmx_tests", see details of "struct vmx_test"
21  *	and function test_run().
22  *
23  * Currently, vmx test framework only set up one VCPU and one
24  * concurrent guest test environment with same paging for L2 and
25  * L1. For usage of EPT, only 1:1 mapped paging is used from VFN
26  * to PFN.
27  *
28  * Author : Arthur Chunqi Li <yzt356@gmail.com>
29  */
30 
31 #include "libcflat.h"
32 #include "processor.h"
33 #include "alloc_page.h"
34 #include "vm.h"
35 #include "desc.h"
36 #include "vmx.h"
37 #include "msr.h"
38 #include "smp.h"
39 #include "apic.h"
40 
/* VMXON region; test_vmclear() hands it to VMCLEAR as an invalid VMCS. */
u64 *bsp_vmxon_region;
/* VMCS that test_vmclear() uses as the valid VMCS region. */
struct vmcs *vmcs_root;
/* VPID counter; init_vmcs_ctrl() pre-increments it before writing VPID. */
u32 vpid_cnt;
void *guest_stack, *guest_syscall_stack;
/* Control values that init_vmcs_ctrl()/init_vmcs_host() write to the VMCS. */
u32 ctrl_pin, ctrl_enter, ctrl_exit, ctrl_cpu[2];
/* General-purpose register values printed by print_vmexit_info(). */
struct regs regs;

/* Test whose callbacks (guest_main, syscall_handler) are dispatched. */
struct vmx_test *current;

#define MAX_TEST_TEARDOWN_STEPS 10

/* A teardown callback paired with the opaque argument stored alongside it. */
struct test_teardown_step {
	test_teardown_func func;
	void *data;
};

static int teardown_count;
static struct test_teardown_step teardown_steps[MAX_TEST_TEARDOWN_STEPS];

/* Guest entry used by guest_main() when current->v2 is set. */
static test_guest_func v2_guest_main;

u64 hypercall_field;
bool launched;
static int matched;
static int guest_finished;
static int in_guest;

/* basic.revision and basic.size are consumed by the VMCS tests in this file. */
union vmx_basic basic;
union vmx_ctrl_msr ctrl_pin_rev;
union vmx_ctrl_msr ctrl_cpu_rev[2];
union vmx_ctrl_msr ctrl_exit_rev;
union vmx_ctrl_msr ctrl_enter_rev;
/* Capability bits tested by ept_sync(), vpid_sync() and the ept_*_supported() helpers. */
union vmx_ept_vpid  ept_vpid;

extern struct descriptor_table_ptr gdt64_desc;
extern struct descriptor_table_ptr idt_descr;
extern struct descriptor_table_ptr tss_descr;
extern void *vmx_return;
/* Labels defined by the file-scope asm() blocks in this file. */
extern void *entry_sysenter;
extern void *guest_entry;

/* Test stage; accessed through vmx_{set,get,inc}_test_stage(). */
static volatile u32 stage;

static jmp_buf abort_target;
85 
/* Describes one VMCS field covered by the VMWRITE/VMREAD round-trip tests. */
struct vmcs_field {
	u64 mask;	/* bits of the value that are written and compared */
	u64 encoding;	/* field encoding handed to vmcs_write()/vmcs_read_checking() */
};
90 
/* Mask with the low @_bits bits set (bits _bits-1 .. 0). */
#define MASK(_bits) GENMASK_ULL((_bits) - 1, 0)
/* Mask covering every bit of an unsigned long. */
#define MASK_NATURAL MASK(sizeof(unsigned long) * 8)
93 
/*
 * Table of VMCS fields walked by set_all_vmcs_fields() and
 * __check_all_vmcs_fields().  Each entry pairs the field encoding with the
 * mask of bits that the tests write and then expect to read back.
 */
static struct vmcs_field vmcs_fields[] = {
	/* Fields checked with a 16-bit mask. */
	{ MASK(16), VPID },
	{ MASK(16), PINV },
	{ MASK(16), EPTP_IDX },

	{ MASK(16), GUEST_SEL_ES },
	{ MASK(16), GUEST_SEL_CS },
	{ MASK(16), GUEST_SEL_SS },
	{ MASK(16), GUEST_SEL_DS },
	{ MASK(16), GUEST_SEL_FS },
	{ MASK(16), GUEST_SEL_GS },
	{ MASK(16), GUEST_SEL_LDTR },
	{ MASK(16), GUEST_SEL_TR },
	{ MASK(16), GUEST_INT_STATUS },

	{ MASK(16), HOST_SEL_ES },
	{ MASK(16), HOST_SEL_CS },
	{ MASK(16), HOST_SEL_SS },
	{ MASK(16), HOST_SEL_DS },
	{ MASK(16), HOST_SEL_FS },
	{ MASK(16), HOST_SEL_GS },
	{ MASK(16), HOST_SEL_TR },

	/* Fields checked with a 64-bit mask. */
	{ MASK(64), IO_BITMAP_A },
	{ MASK(64), IO_BITMAP_B },
	{ MASK(64), MSR_BITMAP },
	{ MASK(64), EXIT_MSR_ST_ADDR },
	{ MASK(64), EXIT_MSR_LD_ADDR },
	{ MASK(64), ENTER_MSR_LD_ADDR },
	{ MASK(64), VMCS_EXEC_PTR },
	{ MASK(64), TSC_OFFSET },
	{ MASK(64), APIC_VIRT_ADDR },
	{ MASK(64), APIC_ACCS_ADDR },
	{ MASK(64), EPTP },

	{ MASK(64), INFO_PHYS_ADDR },

	{ MASK(64), VMCS_LINK_PTR },
	{ MASK(64), GUEST_DEBUGCTL },
	{ MASK(64), GUEST_EFER },
	{ MASK(64), GUEST_PAT },
	{ MASK(64), GUEST_PERF_GLOBAL_CTRL },
	{ MASK(64), GUEST_PDPTE },

	{ MASK(64), HOST_PAT },
	{ MASK(64), HOST_EFER },
	{ MASK(64), HOST_PERF_GLOBAL_CTRL },

	/* Fields checked with a 32-bit mask. */
	{ MASK(32), PIN_CONTROLS },
	{ MASK(32), CPU_EXEC_CTRL0 },
	{ MASK(32), EXC_BITMAP },
	{ MASK(32), PF_ERROR_MASK },
	{ MASK(32), PF_ERROR_MATCH },
	{ MASK(32), CR3_TARGET_COUNT },
	{ MASK(32), EXI_CONTROLS },
	{ MASK(32), EXI_MSR_ST_CNT },
	{ MASK(32), EXI_MSR_LD_CNT },
	{ MASK(32), ENT_CONTROLS },
	{ MASK(32), ENT_MSR_LD_CNT },
	{ MASK(32), ENT_INTR_INFO },
	{ MASK(32), ENT_INTR_ERROR },
	{ MASK(32), ENT_INST_LEN },
	{ MASK(32), TPR_THRESHOLD },
	{ MASK(32), CPU_EXEC_CTRL1 },

	/* VMX_INST_ERROR is skipped as volatile by check_vmcs_field(). */
	{ MASK(32), VMX_INST_ERROR },
	{ MASK(32), EXI_REASON },
	{ MASK(32), EXI_INTR_INFO },
	{ MASK(32), EXI_INTR_ERROR },
	{ MASK(32), IDT_VECT_INFO },
	{ MASK(32), IDT_VECT_ERROR },
	{ MASK(32), EXI_INST_LEN },
	{ MASK(32), EXI_INST_INFO },

	{ MASK(32), GUEST_LIMIT_ES },
	{ MASK(32), GUEST_LIMIT_CS },
	{ MASK(32), GUEST_LIMIT_SS },
	{ MASK(32), GUEST_LIMIT_DS },
	{ MASK(32), GUEST_LIMIT_FS },
	{ MASK(32), GUEST_LIMIT_GS },
	{ MASK(32), GUEST_LIMIT_LDTR },
	{ MASK(32), GUEST_LIMIT_TR },
	{ MASK(32), GUEST_LIMIT_GDTR },
	{ MASK(32), GUEST_LIMIT_IDTR },
	/* Access-rights fields use explicit masks instead of MASK(32). */
	{ 0x1d0ff, GUEST_AR_ES },
	{ 0x1f0ff, GUEST_AR_CS },
	{ 0x1d0ff, GUEST_AR_SS },
	{ 0x1d0ff, GUEST_AR_DS },
	{ 0x1d0ff, GUEST_AR_FS },
	{ 0x1d0ff, GUEST_AR_GS },
	{ 0x1d0ff, GUEST_AR_LDTR },
	{ 0x1d0ff, GUEST_AR_TR },
	{ MASK(32), GUEST_INTR_STATE },
	{ MASK(32), GUEST_ACTV_STATE },
	{ MASK(32), GUEST_SMBASE },
	{ MASK(32), GUEST_SYSENTER_CS },
	{ MASK(32), PREEMPT_TIMER_VALUE },

	{ MASK(32), HOST_SYSENTER_CS },

	/* Fields checked with an unsigned-long-wide mask. */
	{ MASK_NATURAL, CR0_MASK },
	{ MASK_NATURAL, CR4_MASK },
	{ MASK_NATURAL, CR0_READ_SHADOW },
	{ MASK_NATURAL, CR4_READ_SHADOW },
	{ MASK_NATURAL, CR3_TARGET_0 },
	{ MASK_NATURAL, CR3_TARGET_1 },
	{ MASK_NATURAL, CR3_TARGET_2 },
	{ MASK_NATURAL, CR3_TARGET_3 },

	{ MASK_NATURAL, EXI_QUALIFICATION },
	{ MASK_NATURAL, IO_RCX },
	{ MASK_NATURAL, IO_RSI },
	{ MASK_NATURAL, IO_RDI },
	{ MASK_NATURAL, IO_RIP },
	{ MASK_NATURAL, GUEST_LINEAR_ADDRESS },

	{ MASK_NATURAL, GUEST_CR0 },
	{ MASK_NATURAL, GUEST_CR3 },
	{ MASK_NATURAL, GUEST_CR4 },
	{ MASK_NATURAL, GUEST_BASE_ES },
	{ MASK_NATURAL, GUEST_BASE_CS },
	{ MASK_NATURAL, GUEST_BASE_SS },
	{ MASK_NATURAL, GUEST_BASE_DS },
	{ MASK_NATURAL, GUEST_BASE_FS },
	{ MASK_NATURAL, GUEST_BASE_GS },
	{ MASK_NATURAL, GUEST_BASE_LDTR },
	{ MASK_NATURAL, GUEST_BASE_TR },
	{ MASK_NATURAL, GUEST_BASE_GDTR },
	{ MASK_NATURAL, GUEST_BASE_IDTR },
	{ MASK_NATURAL, GUEST_DR7 },
	{ MASK_NATURAL, GUEST_RSP },
	{ MASK_NATURAL, GUEST_RIP },
	{ MASK_NATURAL, GUEST_RFLAGS },
	{ MASK_NATURAL, GUEST_PENDING_DEBUG },
	{ MASK_NATURAL, GUEST_SYSENTER_ESP },
	{ MASK_NATURAL, GUEST_SYSENTER_EIP },

	{ MASK_NATURAL, HOST_CR0 },
	{ MASK_NATURAL, HOST_CR3 },
	{ MASK_NATURAL, HOST_CR4 },
	{ MASK_NATURAL, HOST_BASE_FS },
	{ MASK_NATURAL, HOST_BASE_GS },
	{ MASK_NATURAL, HOST_BASE_TR },
	{ MASK_NATURAL, HOST_BASE_GDTR },
	{ MASK_NATURAL, HOST_BASE_IDTR },
	{ MASK_NATURAL, HOST_SYSENTER_ESP },
	{ MASK_NATURAL, HOST_SYSENTER_EIP },
	{ MASK_NATURAL, HOST_RSP },
	{ MASK_NATURAL, HOST_RIP },
};
244 
/*
 * Values of the 2-bit type sub-field that vmcs_field_type() extracts from a
 * VMCS field encoding.
 */
enum vmcs_field_type {
	VMCS_FIELD_TYPE_CONTROL = 0,
	VMCS_FIELD_TYPE_READ_ONLY_DATA = 1,
	VMCS_FIELD_TYPE_GUEST = 2,
	VMCS_FIELD_TYPE_HOST = 3,
	VMCS_FIELD_TYPES,	/* number of types above */
};
252 
253 static inline int vmcs_field_type(struct vmcs_field *f)
254 {
255 	return (f->encoding >> VMCS_FIELD_TYPE_SHIFT) & 0x3;
256 }
257 
258 static int vmcs_field_readonly(struct vmcs_field *f)
259 {
260 	u64 ia32_vmx_misc;
261 
262 	ia32_vmx_misc = rdmsr(MSR_IA32_VMX_MISC);
263 	return !(ia32_vmx_misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS) &&
264 		(vmcs_field_type(f) == VMCS_FIELD_TYPE_READ_ONLY_DATA);
265 }
266 
267 static inline u64 vmcs_field_value(struct vmcs_field *f, u8 cookie)
268 {
269 	u64 value;
270 
271 	/* Incorporate the cookie and the field encoding into the value. */
272 	value = cookie;
273 	value |= (f->encoding << 8);
274 	value |= 0xdeadbeefull << 32;
275 
276 	return value & f->mask;
277 }
278 
279 static void set_vmcs_field(struct vmcs_field *f, u8 cookie)
280 {
281 	vmcs_write(f->encoding, vmcs_field_value(f, cookie));
282 }
283 
284 static bool check_vmcs_field(struct vmcs_field *f, u8 cookie, u32 *max_index)
285 {
286 	u64 expected;
287 	u64 actual;
288 	u32 index;
289 	int ret;
290 
291 	if (f->encoding == VMX_INST_ERROR) {
292 		printf("Skipping volatile field %lx\n", f->encoding);
293 		return true;
294 	}
295 
296 	ret = vmcs_read_checking(f->encoding, &actual);
297 	assert(!(ret & X86_EFLAGS_CF));
298 	/* Skip VMCS fields that aren't recognized by the CPU */
299 	if (ret & X86_EFLAGS_ZF)
300 		return true;
301 
302 	if (max_index) {
303 		index = f->encoding & VMCS_FIELD_INDEX_MASK;
304 		if (index > *max_index)
305 			*max_index = index;
306 	}
307 
308 	if (vmcs_field_readonly(f)) {
309 		printf("Skipping read-only field %lx\n", f->encoding);
310 		return true;
311 	}
312 
313 	expected = vmcs_field_value(f, cookie);
314 	actual &= f->mask;
315 
316 	if (expected == actual)
317 		return true;
318 
319 	printf("FAIL: VMWRITE/VMREAD %lx (expected: %lx, actual: %lx)\n",
320 	       f->encoding, (unsigned long) expected, (unsigned long) actual);
321 
322 	return false;
323 }
324 
325 static void set_all_vmcs_fields(u8 cookie)
326 {
327 	int i;
328 
329 	for (i = 0; i < ARRAY_SIZE(vmcs_fields); i++)
330 		set_vmcs_field(&vmcs_fields[i], cookie);
331 }
332 
333 static bool __check_all_vmcs_fields(u8 cookie, u32 *max_index)
334 {
335 	bool pass = true;
336 	int i;
337 
338 	for (i = 0; i < ARRAY_SIZE(vmcs_fields); i++) {
339 		if (!check_vmcs_field(&vmcs_fields[i], cookie, max_index))
340 			pass = false;
341 	}
342 
343 	return pass;
344 }
345 
346 static bool check_all_vmcs_fields(u8 cookie)
347 {
348 	return __check_all_vmcs_fields(cookie, NULL);
349 }
350 
351 static void test_vmwrite_vmread(void)
352 {
353 	struct vmcs *vmcs = alloc_page();
354 	u32 vmcs_enum_max, max_index = 0;
355 
356 	vmcs->hdr.revision_id = basic.revision;
357 	assert(!vmcs_clear(vmcs));
358 	assert(!make_vmcs_current(vmcs));
359 
360 	set_all_vmcs_fields(0x42);
361 	report("VMWRITE/VMREAD", __check_all_vmcs_fields(0x42, &max_index));
362 
363 	vmcs_enum_max = rdmsr(MSR_IA32_VMX_VMCS_ENUM) & VMCS_FIELD_INDEX_MASK;
364 	report("VMX_VMCS_ENUM.MAX_INDEX expected at least: %x, actual: %x",
365 		vmcs_enum_max >= max_index, max_index, vmcs_enum_max);
366 
367 	assert(!vmcs_clear(vmcs));
368 	free_page(vmcs);
369 }
370 
371 static void test_vmcs_high(void)
372 {
373 	struct vmcs *vmcs = alloc_page();
374 
375 	vmcs->hdr.revision_id = basic.revision;
376 	assert(!vmcs_clear(vmcs));
377 	assert(!make_vmcs_current(vmcs));
378 
379 	vmcs_write(TSC_OFFSET, 0x0123456789ABCDEFull);
380 	report("VMREAD TSC_OFFSET after VMWRITE TSC_OFFSET",
381 	       vmcs_read(TSC_OFFSET) == 0x0123456789ABCDEFull);
382 	report("VMREAD TSC_OFFSET_HI after VMWRITE TSC_OFFSET",
383 	       vmcs_read(TSC_OFFSET_HI) == 0x01234567ull);
384 	vmcs_write(TSC_OFFSET_HI, 0x76543210ul);
385 	report("VMREAD TSC_OFFSET_HI after VMWRITE TSC_OFFSET_HI",
386 	       vmcs_read(TSC_OFFSET_HI) == 0x76543210ul);
387 	report("VMREAD TSC_OFFSET after VMWRITE TSC_OFFSET_HI",
388 	       vmcs_read(TSC_OFFSET) == 0x7654321089ABCDEFull);
389 
390 	assert(!vmcs_clear(vmcs));
391 	free_page(vmcs);
392 }
393 
394 static void test_vmcs_lifecycle(void)
395 {
396 	struct vmcs *vmcs[2] = {};
397 	int i;
398 
399 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
400 		vmcs[i] = alloc_page();
401 		vmcs[i]->hdr.revision_id = basic.revision;
402 	}
403 
404 #define VMPTRLD(_i) do { \
405 	assert(_i < ARRAY_SIZE(vmcs)); \
406 	assert(!make_vmcs_current(vmcs[_i])); \
407 	printf("VMPTRLD VMCS%d\n", (_i)); \
408 } while (0)
409 
410 #define VMCLEAR(_i) do { \
411 	assert(_i < ARRAY_SIZE(vmcs)); \
412 	assert(!vmcs_clear(vmcs[_i])); \
413 	printf("VMCLEAR VMCS%d\n", (_i)); \
414 } while (0)
415 
416 	VMCLEAR(0);
417 	VMPTRLD(0);
418 	set_all_vmcs_fields(0);
419 	report("current:VMCS0 active:[VMCS0]", check_all_vmcs_fields(0));
420 
421 	VMCLEAR(0);
422 	VMPTRLD(0);
423 	report("current:VMCS0 active:[VMCS0]", check_all_vmcs_fields(0));
424 
425 	VMCLEAR(1);
426 	report("current:VMCS0 active:[VMCS0]", check_all_vmcs_fields(0));
427 
428 	VMPTRLD(1);
429 	set_all_vmcs_fields(1);
430 	report("current:VMCS1 active:[VMCS0,VCMS1]", check_all_vmcs_fields(1));
431 
432 	VMPTRLD(0);
433 	report("current:VMCS0 active:[VMCS0,VCMS1]", check_all_vmcs_fields(0));
434 	VMPTRLD(1);
435 	report("current:VMCS1 active:[VMCS0,VCMS1]", check_all_vmcs_fields(1));
436 	VMPTRLD(1);
437 	report("current:VMCS1 active:[VMCS0,VCMS1]", check_all_vmcs_fields(1));
438 
439 	VMCLEAR(0);
440 	report("current:VMCS1 active:[VCMS1]", check_all_vmcs_fields(1));
441 
442 	/* VMPTRLD should not erase VMWRITEs to the current VMCS */
443 	set_all_vmcs_fields(2);
444 	VMPTRLD(1);
445 	report("current:VMCS1 active:[VCMS1]", check_all_vmcs_fields(2));
446 
447 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
448 		VMCLEAR(i);
449 		free_page(vmcs[i]);
450 	}
451 
452 #undef VMPTRLD
453 #undef VMCLEAR
454 }
455 
/*
 * Store @s into the file-local volatile 'stage' variable.  The store is
 * bracketed by barrier() calls (presumably compiler barriers - confirm in
 * the project headers).
 */
void vmx_set_test_stage(u32 s)
{
	barrier();
	stage = s;
	barrier();
}
462 
/*
 * Return the current value of the file-local volatile 'stage' variable.
 * The load is bracketed by barrier() calls.
 */
u32 vmx_get_test_stage(void)
{
	u32 s;

	barrier();
	s = stage;
	barrier();
	return s;
}
472 
/*
 * Increment the file-local volatile 'stage' variable by one.  The
 * read-modify-write is bracketed by barrier() calls; it is a plain
 * increment, not an atomic operation.
 */
void vmx_inc_test_stage(void)
{
	barrier();
	stage++;
	barrier();
}
479 
/*
 * entry_sysenter: global label whose address init_vmcs_host() stores in
 * HOST_SYSENTER_EIP.  It masks RAX down to its low four bits, passes the
 * result in RDI to syscall_handler(), and then executes VMRESUME.
 * SAVE_GPR/LOAD_GPR are macros defined outside this file (presumably they
 * spill and reload the guest general-purpose registers - confirm).
 */
asm(
	".align	4, 0x90\n\t"
	".globl	entry_sysenter\n\t"
	"entry_sysenter:\n\t"
	SAVE_GPR
	"	and	$0xf, %rax\n\t"
	"	mov	%rax, %rdi\n\t"
	"	call	syscall_handler\n\t"
	LOAD_GPR
	"	vmresume\n\t"
);
492 
493 static void __attribute__((__used__)) syscall_handler(u64 syscall_no)
494 {
495 	if (current->syscall_handler)
496 		current->syscall_handler(syscall_no);
497 }
498 
/*
 * Printable names for VM-exit reasons, indexed by the exit reason constant.
 * Indices without an initializer are NULL; exit_reason_description() maps
 * those to "(unused)".
 */
static const char * const exit_reason_descriptions[] = {
	[VMX_EXC_NMI]		= "VMX_EXC_NMI",
	[VMX_EXTINT]		= "VMX_EXTINT",
	[VMX_TRIPLE_FAULT]	= "VMX_TRIPLE_FAULT",
	[VMX_INIT]		= "VMX_INIT",
	[VMX_SIPI]		= "VMX_SIPI",
	[VMX_SMI_IO]		= "VMX_SMI_IO",
	[VMX_SMI_OTHER]		= "VMX_SMI_OTHER",
	[VMX_INTR_WINDOW]	= "VMX_INTR_WINDOW",
	[VMX_NMI_WINDOW]	= "VMX_NMI_WINDOW",
	[VMX_TASK_SWITCH]	= "VMX_TASK_SWITCH",
	[VMX_CPUID]		= "VMX_CPUID",
	[VMX_GETSEC]		= "VMX_GETSEC",
	[VMX_HLT]		= "VMX_HLT",
	[VMX_INVD]		= "VMX_INVD",
	[VMX_INVLPG]		= "VMX_INVLPG",
	[VMX_RDPMC]		= "VMX_RDPMC",
	[VMX_RDTSC]		= "VMX_RDTSC",
	[VMX_RSM]		= "VMX_RSM",
	[VMX_VMCALL]		= "VMX_VMCALL",
	[VMX_VMCLEAR]		= "VMX_VMCLEAR",
	[VMX_VMLAUNCH]		= "VMX_VMLAUNCH",
	[VMX_VMPTRLD]		= "VMX_VMPTRLD",
	[VMX_VMPTRST]		= "VMX_VMPTRST",
	[VMX_VMREAD]		= "VMX_VMREAD",
	[VMX_VMRESUME]		= "VMX_VMRESUME",
	[VMX_VMWRITE]		= "VMX_VMWRITE",
	[VMX_VMXOFF]		= "VMX_VMXOFF",
	[VMX_VMXON]		= "VMX_VMXON",
	[VMX_CR]		= "VMX_CR",
	[VMX_DR]		= "VMX_DR",
	[VMX_IO]		= "VMX_IO",
	[VMX_RDMSR]		= "VMX_RDMSR",
	[VMX_WRMSR]		= "VMX_WRMSR",
	[VMX_FAIL_STATE]	= "VMX_FAIL_STATE",
	[VMX_FAIL_MSR]		= "VMX_FAIL_MSR",
	[VMX_MWAIT]		= "VMX_MWAIT",
	[VMX_MTF]		= "VMX_MTF",
	[VMX_MONITOR]		= "VMX_MONITOR",
	[VMX_PAUSE]		= "VMX_PAUSE",
	[VMX_FAIL_MCHECK]	= "VMX_FAIL_MCHECK",
	[VMX_TPR_THRESHOLD]	= "VMX_TPR_THRESHOLD",
	[VMX_APIC_ACCESS]	= "VMX_APIC_ACCESS",
	[VMX_EOI_INDUCED]	= "VMX_EOI_INDUCED",
	[VMX_GDTR_IDTR]		= "VMX_GDTR_IDTR",
	[VMX_LDTR_TR]		= "VMX_LDTR_TR",
	[VMX_EPT_VIOLATION]	= "VMX_EPT_VIOLATION",
	[VMX_EPT_MISCONFIG]	= "VMX_EPT_MISCONFIG",
	[VMX_INVEPT]		= "VMX_INVEPT",
	[VMX_PREEMPT]		= "VMX_PREEMPT",
	[VMX_INVVPID]		= "VMX_INVVPID",
	[VMX_WBINVD]		= "VMX_WBINVD",
	[VMX_XSETBV]		= "VMX_XSETBV",
	[VMX_APIC_WRITE]	= "VMX_APIC_WRITE",
	[VMX_RDRAND]		= "VMX_RDRAND",
	[VMX_INVPCID]		= "VMX_INVPCID",
	[VMX_VMFUNC]		= "VMX_VMFUNC",
	[VMX_RDSEED]		= "VMX_RDSEED",
	[VMX_PML_FULL]		= "VMX_PML_FULL",
	[VMX_XSAVES]		= "VMX_XSAVES",
	[VMX_XRSTORS]		= "VMX_XRSTORS",
};
561 
562 const char *exit_reason_description(u64 reason)
563 {
564 	if (reason >= ARRAY_SIZE(exit_reason_descriptions))
565 		return "(unknown)";
566 	return exit_reason_descriptions[reason] ? : "(unused)";
567 }
568 
569 void print_vmexit_info()
570 {
571 	u64 guest_rip, guest_rsp;
572 	ulong reason = vmcs_read(EXI_REASON) & 0xff;
573 	ulong exit_qual = vmcs_read(EXI_QUALIFICATION);
574 	guest_rip = vmcs_read(GUEST_RIP);
575 	guest_rsp = vmcs_read(GUEST_RSP);
576 	printf("VMEXIT info:\n");
577 	printf("\tvmexit reason = %ld\n", reason);
578 	printf("\texit qualification = %#lx\n", exit_qual);
579 	printf("\tBit 31 of reason = %lx\n", (vmcs_read(EXI_REASON) >> 31) & 1);
580 	printf("\tguest_rip = %#lx\n", guest_rip);
581 	printf("\tRAX=%#lx    RBX=%#lx    RCX=%#lx    RDX=%#lx\n",
582 		regs.rax, regs.rbx, regs.rcx, regs.rdx);
583 	printf("\tRSP=%#lx    RBP=%#lx    RSI=%#lx    RDI=%#lx\n",
584 		guest_rsp, regs.rbp, regs.rsi, regs.rdi);
585 	printf("\tR8 =%#lx    R9 =%#lx    R10=%#lx    R11=%#lx\n",
586 		regs.r8, regs.r9, regs.r10, regs.r11);
587 	printf("\tR12=%#lx    R13=%#lx    R14=%#lx    R15=%#lx\n",
588 		regs.r12, regs.r13, regs.r14, regs.r15);
589 }
590 
591 void
592 print_vmentry_failure_info(struct vmentry_failure *failure) {
593 	if (failure->early) {
594 		printf("Early %s failure: ", failure->instr);
595 		switch (failure->flags & VMX_ENTRY_FLAGS) {
596 		case X86_EFLAGS_CF:
597 			printf("current-VMCS pointer is not valid.\n");
598 			break;
599 		case X86_EFLAGS_ZF:
600 			printf("error number is %ld. See Intel 30.4.\n",
601 			       vmcs_read(VMX_INST_ERROR));
602 			break;
603 		default:
604 			printf("unexpected flags %lx!\n", failure->flags);
605 		}
606 	} else {
607 		u64 reason = vmcs_read(EXI_REASON);
608 		u64 qual = vmcs_read(EXI_QUALIFICATION);
609 
610 		printf("Non-early %s failure (reason=%#lx, qual=%#lx): ",
611 			failure->instr, reason, qual);
612 
613 		switch (reason & 0xff) {
614 		case VMX_FAIL_STATE:
615 			printf("invalid guest state\n");
616 			break;
617 		case VMX_FAIL_MSR:
618 			printf("MSR loading\n");
619 			break;
620 		case VMX_FAIL_MCHECK:
621 			printf("machine-check event\n");
622 			break;
623 		default:
624 			printf("unexpected basic exit reason %ld\n",
625 			       reason & 0xff);
626 		}
627 
628 		if (!(reason & VMX_ENTRY_FAILURE))
629 			printf("\tVMX_ENTRY_FAILURE BIT NOT SET!\n");
630 
631 		if (reason & 0x7fff0000)
632 			printf("\tRESERVED BITS SET!\n");
633 	}
634 }
635 
636 /*
637  * VMCLEAR should ensures all VMCS state is flushed to the VMCS
638  * region in memory.
639  */
640 static void test_vmclear_flushing(void)
641 {
642 	struct vmcs *vmcs[3] = {};
643 	int i;
644 
645 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
646 		vmcs[i] = alloc_page();
647 	}
648 
649 	vmcs[0]->hdr.revision_id = basic.revision;
650 	assert(!vmcs_clear(vmcs[0]));
651 	assert(!make_vmcs_current(vmcs[0]));
652 	set_all_vmcs_fields(0x86);
653 
654 	assert(!vmcs_clear(vmcs[0]));
655 	memcpy(vmcs[1], vmcs[0], basic.size);
656 	assert(!make_vmcs_current(vmcs[1]));
657 	report("test vmclear flush (current VMCS)", check_all_vmcs_fields(0x86));
658 
659 	set_all_vmcs_fields(0x87);
660 	assert(!make_vmcs_current(vmcs[0]));
661 	assert(!vmcs_clear(vmcs[1]));
662 	memcpy(vmcs[2], vmcs[1], basic.size);
663 	assert(!make_vmcs_current(vmcs[2]));
664 	report("test vmclear flush (!current VMCS)", check_all_vmcs_fields(0x87));
665 
666 	for (i = 0; i < ARRAY_SIZE(vmcs); i++) {
667 		assert(!vmcs_clear(vmcs[i]));
668 		free_page(vmcs[i]);
669 	}
670 }
671 
672 static void test_vmclear(void)
673 {
674 	struct vmcs *tmp_root;
675 	int width = cpuid_maxphyaddr();
676 
677 	/*
678 	 * Note- The tests below do not necessarily have a
679 	 * valid VMCS, but that's ok since the invalid vmcs
680 	 * is only used for a specific test and is discarded
681 	 * without touching its contents
682 	 */
683 
684 	/* Unaligned page access */
685 	tmp_root = (struct vmcs *)((intptr_t)vmcs_root + 1);
686 	report("test vmclear with unaligned vmcs",
687 	       vmcs_clear(tmp_root) == 1);
688 
689 	/* gpa bits beyond physical address width are set*/
690 	tmp_root = (struct vmcs *)((intptr_t)vmcs_root |
691 				   ((u64)1 << (width+1)));
692 	report("test vmclear with vmcs address bits set beyond physical address width",
693 	       vmcs_clear(tmp_root) == 1);
694 
695 	/* Pass VMXON region */
696 	tmp_root = (struct vmcs *)bsp_vmxon_region;
697 	report("test vmclear with vmxon region",
698 	       vmcs_clear(tmp_root) == 1);
699 
700 	/* Valid VMCS */
701 	report("test vmclear with valid vmcs region", vmcs_clear(vmcs_root) == 0);
702 
703 	test_vmclear_flushing();
704 }
705 
706 static void __attribute__((__used__)) guest_main(void)
707 {
708 	if (current->v2)
709 		v2_guest_main();
710 	else
711 		current->guest_main();
712 }
713 
714 /* guest_entry */
715 asm(
716 	".align	4, 0x90\n\t"
717 	".globl	entry_guest\n\t"
718 	"guest_entry:\n\t"
719 	"	call guest_main\n\t"
720 	"	mov $1, %edi\n\t"
721 	"	call hypercall\n\t"
722 );
723 
724 /* EPT paging structure related functions */
725 /* split_large_ept_entry: Split a 2M/1G large page into 512 smaller PTEs.
726 		@ptep : large page table entry to split
727 		@level : level of ptep (2 or 3)
728  */
729 static void split_large_ept_entry(unsigned long *ptep, int level)
730 {
731 	unsigned long *new_pt;
732 	unsigned long gpa;
733 	unsigned long pte;
734 	unsigned long prototype;
735 	int i;
736 
737 	pte = *ptep;
738 	assert(pte & EPT_PRESENT);
739 	assert(pte & EPT_LARGE_PAGE);
740 	assert(level == 2 || level == 3);
741 
742 	new_pt = alloc_page();
743 	assert(new_pt);
744 
745 	prototype = pte & ~EPT_ADDR_MASK;
746 	if (level == 2)
747 		prototype &= ~EPT_LARGE_PAGE;
748 
749 	gpa = pte & EPT_ADDR_MASK;
750 	for (i = 0; i < EPT_PGDIR_ENTRIES; i++) {
751 		new_pt[i] = prototype | gpa;
752 		gpa += 1ul << EPT_LEVEL_SHIFT(level - 1);
753 	}
754 
755 	pte &= ~EPT_LARGE_PAGE;
756 	pte &= ~EPT_ADDR_MASK;
757 	pte |= virt_to_phys(new_pt);
758 
759 	*ptep = pte;
760 }
761 
762 /* install_ept_entry : Install a page to a given level in EPT
763 		@pml4 : addr of pml4 table
764 		@pte_level : level of PTE to set
765 		@guest_addr : physical address of guest
766 		@pte : pte value to set
767 		@pt_page : address of page table, NULL for a new page
768  */
769 void install_ept_entry(unsigned long *pml4,
770 		int pte_level,
771 		unsigned long guest_addr,
772 		unsigned long pte,
773 		unsigned long *pt_page)
774 {
775 	int level;
776 	unsigned long *pt = pml4;
777 	unsigned offset;
778 
779 	/* EPT only uses 48 bits of GPA. */
780 	assert(guest_addr < (1ul << 48));
781 
782 	for (level = EPT_PAGE_LEVEL; level > pte_level; --level) {
783 		offset = (guest_addr >> EPT_LEVEL_SHIFT(level))
784 				& EPT_PGDIR_MASK;
785 		if (!(pt[offset] & (EPT_PRESENT))) {
786 			unsigned long *new_pt = pt_page;
787 			if (!new_pt)
788 				new_pt = alloc_page();
789 			else
790 				pt_page = 0;
791 			memset(new_pt, 0, PAGE_SIZE);
792 			pt[offset] = virt_to_phys(new_pt)
793 					| EPT_RA | EPT_WA | EPT_EA;
794 		} else if (pt[offset] & EPT_LARGE_PAGE)
795 			split_large_ept_entry(&pt[offset], level);
796 		pt = phys_to_virt(pt[offset] & EPT_ADDR_MASK);
797 	}
798 	offset = (guest_addr >> EPT_LEVEL_SHIFT(level)) & EPT_PGDIR_MASK;
799 	pt[offset] = pte;
800 }
801 
802 /* Map a page, @perm is the permission of the page */
803 void install_ept(unsigned long *pml4,
804 		unsigned long phys,
805 		unsigned long guest_addr,
806 		u64 perm)
807 {
808 	install_ept_entry(pml4, 1, guest_addr, (phys & PAGE_MASK) | perm, 0);
809 }
810 
811 /* Map a 1G-size page */
812 void install_1g_ept(unsigned long *pml4,
813 		unsigned long phys,
814 		unsigned long guest_addr,
815 		u64 perm)
816 {
817 	install_ept_entry(pml4, 3, guest_addr,
818 			(phys & PAGE_MASK) | perm | EPT_LARGE_PAGE, 0);
819 }
820 
821 /* Map a 2M-size page */
822 void install_2m_ept(unsigned long *pml4,
823 		unsigned long phys,
824 		unsigned long guest_addr,
825 		u64 perm)
826 {
827 	install_ept_entry(pml4, 2, guest_addr,
828 			(phys & PAGE_MASK) | perm | EPT_LARGE_PAGE, 0);
829 }
830 
831 /* setup_ept_range : Setup a range of 1:1 mapped page to EPT paging structure.
832 		@start : start address of guest page
833 		@len : length of address to be mapped
834 		@map_1g : whether 1G page map is used
835 		@map_2m : whether 2M page map is used
836 		@perm : permission for every page
837  */
838 void setup_ept_range(unsigned long *pml4, unsigned long start,
839 		     unsigned long len, int map_1g, int map_2m, u64 perm)
840 {
841 	u64 phys = start;
842 	u64 max = (u64)len + (u64)start;
843 
844 	if (map_1g) {
845 		while (phys + PAGE_SIZE_1G <= max) {
846 			install_1g_ept(pml4, phys, phys, perm);
847 			phys += PAGE_SIZE_1G;
848 		}
849 	}
850 	if (map_2m) {
851 		while (phys + PAGE_SIZE_2M <= max) {
852 			install_2m_ept(pml4, phys, phys, perm);
853 			phys += PAGE_SIZE_2M;
854 		}
855 	}
856 	while (phys + PAGE_SIZE <= max) {
857 		install_ept(pml4, phys, phys, perm);
858 		phys += PAGE_SIZE;
859 	}
860 }
861 
862 /* get_ept_pte : Get the PTE of a given level in EPT,
863     @level == 1 means get the latest level*/
864 bool get_ept_pte(unsigned long *pml4, unsigned long guest_addr, int level,
865 		unsigned long *pte)
866 {
867 	int l;
868 	unsigned long *pt = pml4, iter_pte;
869 	unsigned offset;
870 
871 	assert(level >= 1 && level <= 4);
872 
873 	for (l = EPT_PAGE_LEVEL; ; --l) {
874 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
875 		iter_pte = pt[offset];
876 		if (l == level)
877 			break;
878 		if (l < 4 && (iter_pte & EPT_LARGE_PAGE))
879 			return false;
880 		if (!(iter_pte & (EPT_PRESENT)))
881 			return false;
882 		pt = (unsigned long *)(iter_pte & EPT_ADDR_MASK);
883 	}
884 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
885 	if (pte)
886 		*pte = pt[offset];
887 	return true;
888 }
889 
890 static void clear_ept_ad_pte(unsigned long *pml4, unsigned long guest_addr)
891 {
892 	int l;
893 	unsigned long *pt = pml4;
894 	u64 pte;
895 	unsigned offset;
896 
897 	for (l = EPT_PAGE_LEVEL; ; --l) {
898 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
899 		pt[offset] &= ~(EPT_ACCESS_FLAG|EPT_DIRTY_FLAG);
900 		pte = pt[offset];
901 		if (l == 1 || (l < 4 && (pte & EPT_LARGE_PAGE)))
902 			break;
903 		pt = (unsigned long *)(pte & EPT_ADDR_MASK);
904 	}
905 }
906 
907 /* clear_ept_ad : Clear EPT A/D bits for the page table walk and the
908    final GPA of a guest address.  */
909 void clear_ept_ad(unsigned long *pml4, u64 guest_cr3,
910 		  unsigned long guest_addr)
911 {
912 	int l;
913 	unsigned long *pt = (unsigned long *)guest_cr3, gpa;
914 	u64 pte, offset_in_page;
915 	unsigned offset;
916 
917 	for (l = EPT_PAGE_LEVEL; ; --l) {
918 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
919 
920 		clear_ept_ad_pte(pml4, (u64) &pt[offset]);
921 		pte = pt[offset];
922 		if (l == 1 || (l < 4 && (pte & PT_PAGE_SIZE_MASK)))
923 			break;
924 		if (!(pte & PT_PRESENT_MASK))
925 			return;
926 		pt = (unsigned long *)(pte & PT_ADDR_MASK);
927 	}
928 
929 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
930 	offset_in_page = guest_addr & ((1 << EPT_LEVEL_SHIFT(l)) - 1);
931 	gpa = (pt[offset] & PT_ADDR_MASK) | (guest_addr & offset_in_page);
932 	clear_ept_ad_pte(pml4, gpa);
933 }
934 
935 /* check_ept_ad : Check the content of EPT A/D bits for the page table
936    walk and the final GPA of a guest address.  */
937 void check_ept_ad(unsigned long *pml4, u64 guest_cr3,
938 		  unsigned long guest_addr, int expected_gpa_ad,
939 		  int expected_pt_ad)
940 {
941 	int l;
942 	unsigned long *pt = (unsigned long *)guest_cr3, gpa;
943 	u64 ept_pte, pte, offset_in_page;
944 	unsigned offset;
945 	bool bad_pt_ad = false;
946 
947 	for (l = EPT_PAGE_LEVEL; ; --l) {
948 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
949 
950 		if (!get_ept_pte(pml4, (u64) &pt[offset], 1, &ept_pte)) {
951 			printf("EPT - guest level %d page table is not mapped.\n", l);
952 			return;
953 		}
954 
955 		if (!bad_pt_ad) {
956 			bad_pt_ad |= (ept_pte & (EPT_ACCESS_FLAG|EPT_DIRTY_FLAG)) != expected_pt_ad;
957 			if (bad_pt_ad)
958 				report("EPT - guest level %d page table A=%d/D=%d",
959 				       false, l,
960 				       !!(expected_pt_ad & EPT_ACCESS_FLAG),
961 				       !!(expected_pt_ad & EPT_DIRTY_FLAG));
962 		}
963 
964 		pte = pt[offset];
965 		if (l == 1 || (l < 4 && (pte & PT_PAGE_SIZE_MASK)))
966 			break;
967 		if (!(pte & PT_PRESENT_MASK))
968 			return;
969 		pt = (unsigned long *)(pte & PT_ADDR_MASK);
970 	}
971 
972 	if (!bad_pt_ad)
973 		report("EPT - guest page table structures A=%d/D=%d",
974 		       true,
975 		       !!(expected_pt_ad & EPT_ACCESS_FLAG),
976 		       !!(expected_pt_ad & EPT_DIRTY_FLAG));
977 
978 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
979 	offset_in_page = guest_addr & ((1 << EPT_LEVEL_SHIFT(l)) - 1);
980 	gpa = (pt[offset] & PT_ADDR_MASK) | (guest_addr & offset_in_page);
981 
982 	if (!get_ept_pte(pml4, gpa, 1, &ept_pte)) {
983 		report("EPT - guest physical address is not mapped", false);
984 		return;
985 	}
986 	report("EPT - guest physical address A=%d/D=%d",
987 	       (ept_pte & (EPT_ACCESS_FLAG|EPT_DIRTY_FLAG)) == expected_gpa_ad,
988 	       !!(expected_gpa_ad & EPT_ACCESS_FLAG),
989 	       !!(expected_gpa_ad & EPT_DIRTY_FLAG));
990 }
991 
992 
993 void ept_sync(int type, u64 eptp)
994 {
995 	switch (type) {
996 	case INVEPT_SINGLE:
997 		if (ept_vpid.val & EPT_CAP_INVEPT_SINGLE) {
998 			invept(INVEPT_SINGLE, eptp);
999 			break;
1000 		}
1001 		/* else fall through */
1002 	case INVEPT_GLOBAL:
1003 		if (ept_vpid.val & EPT_CAP_INVEPT_ALL) {
1004 			invept(INVEPT_GLOBAL, eptp);
1005 			break;
1006 		}
1007 		/* else fall through */
1008 	default:
1009 		printf("WARNING: invept is not supported!\n");
1010 	}
1011 }
1012 
1013 void set_ept_pte(unsigned long *pml4, unsigned long guest_addr,
1014 		 int level, u64 pte_val)
1015 {
1016 	int l;
1017 	unsigned long *pt = pml4;
1018 	unsigned offset;
1019 
1020 	assert(level >= 1 && level <= 4);
1021 
1022 	for (l = EPT_PAGE_LEVEL; ; --l) {
1023 		offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1024 		if (l == level)
1025 			break;
1026 		assert(pt[offset] & EPT_PRESENT);
1027 		pt = (unsigned long *)(pt[offset] & EPT_ADDR_MASK);
1028 	}
1029 	offset = (guest_addr >> EPT_LEVEL_SHIFT(l)) & EPT_PGDIR_MASK;
1030 	pt[offset] = pte_val;
1031 }
1032 
1033 bool ept_2m_supported(void)
1034 {
1035 	return ept_vpid.val & EPT_CAP_2M_PAGE;
1036 }
1037 
1038 bool ept_1g_supported(void)
1039 {
1040 	return ept_vpid.val & EPT_CAP_1G_PAGE;
1041 }
1042 
1043 bool ept_huge_pages_supported(int level)
1044 {
1045 	if (level == 2)
1046 		return ept_2m_supported();
1047 	else if (level == 3)
1048 		return ept_1g_supported();
1049 	else
1050 		return false;
1051 }
1052 
1053 bool ept_execute_only_supported(void)
1054 {
1055 	return ept_vpid.val & EPT_CAP_WT;
1056 }
1057 
1058 bool ept_ad_bits_supported(void)
1059 {
1060 	return ept_vpid.val & EPT_CAP_AD_FLAG;
1061 }
1062 
1063 void vpid_sync(int type, u16 vpid)
1064 {
1065 	switch(type) {
1066 	case INVVPID_CONTEXT_GLOBAL:
1067 		if (ept_vpid.val & VPID_CAP_INVVPID_CXTGLB) {
1068 			invvpid(INVVPID_CONTEXT_GLOBAL, vpid, 0);
1069 			break;
1070 		}
1071 	case INVVPID_ALL:
1072 		if (ept_vpid.val & VPID_CAP_INVVPID_ALL) {
1073 			invvpid(INVVPID_ALL, vpid, 0);
1074 			break;
1075 		}
1076 	default:
1077 		printf("WARNING: invvpid is not supported\n");
1078 	}
1079 }
1080 
1081 static void init_vmcs_ctrl(void)
1082 {
1083 	/* 26.2 CHECKS ON VMX CONTROLS AND HOST-STATE AREA */
1084 	/* 26.2.1.1 */
1085 	vmcs_write(PIN_CONTROLS, ctrl_pin);
1086 	/* Disable VMEXIT of IO instruction */
1087 	vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu[0]);
1088 	if (ctrl_cpu_rev[0].set & CPU_SECONDARY) {
1089 		ctrl_cpu[1] = (ctrl_cpu[1] | ctrl_cpu_rev[1].set) &
1090 			ctrl_cpu_rev[1].clr;
1091 		vmcs_write(CPU_EXEC_CTRL1, ctrl_cpu[1]);
1092 	}
1093 	vmcs_write(CR3_TARGET_COUNT, 0);
1094 	vmcs_write(VPID, ++vpid_cnt);
1095 }
1096 
1097 static void init_vmcs_host(void)
1098 {
1099 	/* 26.2 CHECKS ON VMX CONTROLS AND HOST-STATE AREA */
1100 	/* 26.2.1.2 */
1101 	vmcs_write(HOST_EFER, rdmsr(MSR_EFER));
1102 
1103 	/* 26.2.1.3 */
1104 	vmcs_write(ENT_CONTROLS, ctrl_enter);
1105 	vmcs_write(EXI_CONTROLS, ctrl_exit);
1106 
1107 	/* 26.2.2 */
1108 	vmcs_write(HOST_CR0, read_cr0());
1109 	vmcs_write(HOST_CR3, read_cr3());
1110 	vmcs_write(HOST_CR4, read_cr4());
1111 	vmcs_write(HOST_SYSENTER_EIP, (u64)(&entry_sysenter));
1112 	vmcs_write(HOST_SYSENTER_CS,  KERNEL_CS);
1113 
1114 	/* 26.2.3 */
1115 	vmcs_write(HOST_SEL_CS, KERNEL_CS);
1116 	vmcs_write(HOST_SEL_SS, KERNEL_DS);
1117 	vmcs_write(HOST_SEL_DS, KERNEL_DS);
1118 	vmcs_write(HOST_SEL_ES, KERNEL_DS);
1119 	vmcs_write(HOST_SEL_FS, KERNEL_DS);
1120 	vmcs_write(HOST_SEL_GS, KERNEL_DS);
1121 	vmcs_write(HOST_SEL_TR, TSS_MAIN);
1122 	vmcs_write(HOST_BASE_TR, tss_descr.base);
1123 	vmcs_write(HOST_BASE_GDTR, gdt64_desc.base);
1124 	vmcs_write(HOST_BASE_IDTR, idt_descr.base);
1125 	vmcs_write(HOST_BASE_FS, 0);
1126 	vmcs_write(HOST_BASE_GS, 0);
1127 
1128 	/* Set other vmcs area */
1129 	vmcs_write(PF_ERROR_MASK, 0);
1130 	vmcs_write(PF_ERROR_MATCH, 0);
1131 	vmcs_write(VMCS_LINK_PTR, ~0ul);
1132 	vmcs_write(VMCS_LINK_PTR_HI, ~0ul);
1133 	vmcs_write(HOST_RIP, (u64)(&vmx_return));
1134 }
1135 
1136 static void init_vmcs_guest(void)
1137 {
1138 	/* 26.3 CHECKING AND LOADING GUEST STATE */
1139 	ulong guest_cr0, guest_cr4, guest_cr3;
1140 	/* 26.3.1.1 */
1141 	guest_cr0 = read_cr0();
1142 	guest_cr4 = read_cr4();
1143 	guest_cr3 = read_cr3();
1144 	if (ctrl_enter & ENT_GUEST_64) {
1145 		guest_cr0 |= X86_CR0_PG;
1146 		guest_cr4 |= X86_CR4_PAE;
1147 	}
1148 	if ((ctrl_enter & ENT_GUEST_64) == 0)
1149 		guest_cr4 &= (~X86_CR4_PCIDE);
1150 	if (guest_cr0 & X86_CR0_PG)
1151 		guest_cr0 |= X86_CR0_PE;
1152 	vmcs_write(GUEST_CR0, guest_cr0);
1153 	vmcs_write(GUEST_CR3, guest_cr3);
1154 	vmcs_write(GUEST_CR4, guest_cr4);
1155 	vmcs_write(GUEST_SYSENTER_CS,  KERNEL_CS);
1156 	vmcs_write(GUEST_SYSENTER_ESP,
1157 		(u64)(guest_syscall_stack + PAGE_SIZE - 1));
1158 	vmcs_write(GUEST_SYSENTER_EIP, (u64)(&entry_sysenter));
1159 	vmcs_write(GUEST_DR7, 0);
1160 	vmcs_write(GUEST_EFER, rdmsr(MSR_EFER));
1161 
1162 	/* 26.3.1.2 */
1163 	vmcs_write(GUEST_SEL_CS, KERNEL_CS);
1164 	vmcs_write(GUEST_SEL_SS, KERNEL_DS);
1165 	vmcs_write(GUEST_SEL_DS, KERNEL_DS);
1166 	vmcs_write(GUEST_SEL_ES, KERNEL_DS);
1167 	vmcs_write(GUEST_SEL_FS, KERNEL_DS);
1168 	vmcs_write(GUEST_SEL_GS, KERNEL_DS);
1169 	vmcs_write(GUEST_SEL_TR, TSS_MAIN);
1170 	vmcs_write(GUEST_SEL_LDTR, 0);
1171 
1172 	vmcs_write(GUEST_BASE_CS, 0);
1173 	vmcs_write(GUEST_BASE_ES, 0);
1174 	vmcs_write(GUEST_BASE_SS, 0);
1175 	vmcs_write(GUEST_BASE_DS, 0);
1176 	vmcs_write(GUEST_BASE_FS, 0);
1177 	vmcs_write(GUEST_BASE_GS, 0);
1178 	vmcs_write(GUEST_BASE_TR, tss_descr.base);
1179 	vmcs_write(GUEST_BASE_LDTR, 0);
1180 
1181 	vmcs_write(GUEST_LIMIT_CS, 0xFFFFFFFF);
1182 	vmcs_write(GUEST_LIMIT_DS, 0xFFFFFFFF);
1183 	vmcs_write(GUEST_LIMIT_ES, 0xFFFFFFFF);
1184 	vmcs_write(GUEST_LIMIT_SS, 0xFFFFFFFF);
1185 	vmcs_write(GUEST_LIMIT_FS, 0xFFFFFFFF);
1186 	vmcs_write(GUEST_LIMIT_GS, 0xFFFFFFFF);
1187 	vmcs_write(GUEST_LIMIT_LDTR, 0xffff);
1188 	vmcs_write(GUEST_LIMIT_TR, tss_descr.limit);
1189 
1190 	vmcs_write(GUEST_AR_CS, 0xa09b);
1191 	vmcs_write(GUEST_AR_DS, 0xc093);
1192 	vmcs_write(GUEST_AR_ES, 0xc093);
1193 	vmcs_write(GUEST_AR_FS, 0xc093);
1194 	vmcs_write(GUEST_AR_GS, 0xc093);
1195 	vmcs_write(GUEST_AR_SS, 0xc093);
1196 	vmcs_write(GUEST_AR_LDTR, 0x82);
1197 	vmcs_write(GUEST_AR_TR, 0x8b);
1198 
1199 	/* 26.3.1.3 */
1200 	vmcs_write(GUEST_BASE_GDTR, gdt64_desc.base);
1201 	vmcs_write(GUEST_BASE_IDTR, idt_descr.base);
1202 	vmcs_write(GUEST_LIMIT_GDTR, gdt64_desc.limit);
1203 	vmcs_write(GUEST_LIMIT_IDTR, idt_descr.limit);
1204 
1205 	/* 26.3.1.4 */
1206 	vmcs_write(GUEST_RIP, (u64)(&guest_entry));
1207 	vmcs_write(GUEST_RSP, (u64)(guest_stack + PAGE_SIZE - 1));
1208 	vmcs_write(GUEST_RFLAGS, 0x2);
1209 
1210 	/* 26.3.1.5 */
1211 	vmcs_write(GUEST_ACTV_STATE, ACTV_ACTIVE);
1212 	vmcs_write(GUEST_INTR_STATE, 0);
1213 }
1214 
1215 static int init_vmcs(struct vmcs **vmcs)
1216 {
1217 	*vmcs = alloc_page();
1218 	(*vmcs)->hdr.revision_id = basic.revision;
1219 	/* vmclear first to init vmcs */
1220 	if (vmcs_clear(*vmcs)) {
1221 		printf("%s : vmcs_clear error\n", __func__);
1222 		return 1;
1223 	}
1224 
1225 	if (make_vmcs_current(*vmcs)) {
1226 		printf("%s : make_vmcs_current error\n", __func__);
1227 		return 1;
1228 	}
1229 
1230 	/* All settings to pin/exit/enter/cpu
1231 	   control fields should be placed here */
1232 	ctrl_pin |= PIN_EXTINT | PIN_NMI | PIN_VIRT_NMI;
1233 	ctrl_exit = EXI_LOAD_EFER | EXI_HOST_64;
1234 	ctrl_enter = (ENT_LOAD_EFER | ENT_GUEST_64);
1235 	/* DIsable IO instruction VMEXIT now */
1236 	ctrl_cpu[0] &= (~(CPU_IO | CPU_IO_BITMAP));
1237 	ctrl_cpu[1] = 0;
1238 
1239 	ctrl_pin = (ctrl_pin | ctrl_pin_rev.set) & ctrl_pin_rev.clr;
1240 	ctrl_enter = (ctrl_enter | ctrl_enter_rev.set) & ctrl_enter_rev.clr;
1241 	ctrl_exit = (ctrl_exit | ctrl_exit_rev.set) & ctrl_exit_rev.clr;
1242 	ctrl_cpu[0] = (ctrl_cpu[0] | ctrl_cpu_rev[0].set) & ctrl_cpu_rev[0].clr;
1243 
1244 	init_vmcs_ctrl();
1245 	init_vmcs_host();
1246 	init_vmcs_guest();
1247 	return 0;
1248 }
1249 
1250 void enable_vmx(void)
1251 {
1252 	bool vmx_enabled =
1253 		rdmsr(MSR_IA32_FEATURE_CONTROL) &
1254 		FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1255 
1256 	if (!vmx_enabled) {
1257 		wrmsr(MSR_IA32_FEATURE_CONTROL,
1258 				FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX |
1259 				FEATURE_CONTROL_LOCKED);
1260 	}
1261 }
1262 
1263 static void init_vmx_caps(void)
1264 {
1265 	basic.val = rdmsr(MSR_IA32_VMX_BASIC);
1266 	ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PIN
1267 			: MSR_IA32_VMX_PINBASED_CTLS);
1268 	ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT
1269 			: MSR_IA32_VMX_EXIT_CTLS);
1270 	ctrl_enter_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_ENTRY
1271 			: MSR_IA32_VMX_ENTRY_CTLS);
1272 	ctrl_cpu_rev[0].val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PROC
1273 			: MSR_IA32_VMX_PROCBASED_CTLS);
1274 	if ((ctrl_cpu_rev[0].clr & CPU_SECONDARY) != 0)
1275 		ctrl_cpu_rev[1].val = rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2);
1276 	else
1277 		ctrl_cpu_rev[1].val = 0;
1278 	if ((ctrl_cpu_rev[1].clr & (CPU_EPT | CPU_VPID)) != 0)
1279 		ept_vpid.val = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
1280 	else
1281 		ept_vpid.val = 0;
1282 }
1283 
1284 void init_vmx(u64 *vmxon_region)
1285 {
1286 	ulong fix_cr0_set, fix_cr0_clr;
1287 	ulong fix_cr4_set, fix_cr4_clr;
1288 
1289 	fix_cr0_set =  rdmsr(MSR_IA32_VMX_CR0_FIXED0);
1290 	fix_cr0_clr =  rdmsr(MSR_IA32_VMX_CR0_FIXED1);
1291 	fix_cr4_set =  rdmsr(MSR_IA32_VMX_CR4_FIXED0);
1292 	fix_cr4_clr = rdmsr(MSR_IA32_VMX_CR4_FIXED1);
1293 
1294 	write_cr0((read_cr0() & fix_cr0_clr) | fix_cr0_set);
1295 	write_cr4((read_cr4() & fix_cr4_clr) | fix_cr4_set | X86_CR4_VMXE);
1296 
1297 	*vmxon_region = basic.revision;
1298 }
1299 
1300 static void alloc_bsp_vmx_pages(void)
1301 {
1302 	bsp_vmxon_region = alloc_page();
1303 	guest_stack = alloc_page();
1304 	guest_syscall_stack = alloc_page();
1305 	vmcs_root = alloc_page();
1306 }
1307 
1308 static void init_bsp_vmx(void)
1309 {
1310 	init_vmx_caps();
1311 	alloc_bsp_vmx_pages();
1312 	init_vmx(bsp_vmxon_region);
1313 }
1314 
/*
 * test_for_exception() callback: enter and immediately leave VMX operation.
 * @data is unused; the results of both calls are ignored because only a
 * raised exception is of interest to the caller.
 */
static void do_vmxon_off(void *data)
{
	vmx_on();

	vmx_off();
}
1320 
1321 static void do_write_feature_control(void *data)
1322 {
1323 	wrmsr(MSR_IA32_FEATURE_CONTROL, 0);
1324 }
1325 
1326 static int test_vmx_feature_control(void)
1327 {
1328 	u64 ia32_feature_control;
1329 	bool vmx_enabled;
1330 	bool feature_control_locked;
1331 
1332 	ia32_feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
1333 	vmx_enabled =
1334 		ia32_feature_control & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1335 	feature_control_locked =
1336 		ia32_feature_control & FEATURE_CONTROL_LOCKED;
1337 
1338 	if (vmx_enabled && feature_control_locked) {
1339 		printf("VMX enabled and locked by BIOS\n");
1340 		return 0;
1341 	} else if (feature_control_locked) {
1342 		printf("ERROR: VMX locked out by BIOS!?\n");
1343 		return 1;
1344 	}
1345 
1346 	wrmsr(MSR_IA32_FEATURE_CONTROL, 0);
1347 	report("test vmxon with FEATURE_CONTROL cleared",
1348 	       test_for_exception(GP_VECTOR, &do_vmxon_off, NULL));
1349 
1350 	wrmsr(MSR_IA32_FEATURE_CONTROL, FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
1351 	report("test vmxon without FEATURE_CONTROL lock",
1352 	       test_for_exception(GP_VECTOR, &do_vmxon_off, NULL));
1353 
1354 	wrmsr(MSR_IA32_FEATURE_CONTROL,
1355 		  FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX |
1356 		  FEATURE_CONTROL_LOCKED);
1357 
1358 	ia32_feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
1359 	vmx_enabled =
1360 		ia32_feature_control & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1361 	report("test enable VMX in FEATURE_CONTROL", vmx_enabled);
1362 
1363 	report("test FEATURE_CONTROL lock bit",
1364 	       test_for_exception(GP_VECTOR, &do_write_feature_control, NULL));
1365 
1366 	return !vmx_enabled;
1367 }
1368 
1369 static int test_vmxon(void)
1370 {
1371 	int ret, ret1;
1372 	u64 *vmxon_region;
1373 	int width = cpuid_maxphyaddr();
1374 
1375 	/* Unaligned page access */
1376 	vmxon_region = (u64 *)((intptr_t)bsp_vmxon_region + 1);
1377 	ret1 = _vmx_on(vmxon_region);
1378 	report("test vmxon with unaligned vmxon region", ret1);
1379 	if (!ret1) {
1380 		ret = 1;
1381 		goto out;
1382 	}
1383 
1384 	/* gpa bits beyond physical address width are set*/
1385 	vmxon_region = (u64 *)((intptr_t)bsp_vmxon_region | ((u64)1 << (width+1)));
1386 	ret1 = _vmx_on(vmxon_region);
1387 	report("test vmxon with bits set beyond physical address width", ret1);
1388 	if (!ret1) {
1389 		ret = 1;
1390 		goto out;
1391 	}
1392 
1393 	/* invalid revision indentifier */
1394 	*bsp_vmxon_region = 0xba9da9;
1395 	ret1 = vmx_on();
1396 	report("test vmxon with invalid revision identifier", ret1);
1397 	if (!ret1) {
1398 		ret = 1;
1399 		goto out;
1400 	}
1401 
1402 	/* and finally a valid region */
1403 	*bsp_vmxon_region = basic.revision;
1404 	ret = vmx_on();
1405 	report("test vmxon with valid vmxon region", !ret);
1406 
1407 out:
1408 	return ret;
1409 }
1410 
1411 static void test_vmptrld(void)
1412 {
1413 	struct vmcs *vmcs, *tmp_root;
1414 	int width = cpuid_maxphyaddr();
1415 
1416 	vmcs = alloc_page();
1417 	vmcs->hdr.revision_id = basic.revision;
1418 
1419 	/* Unaligned page access */
1420 	tmp_root = (struct vmcs *)((intptr_t)vmcs + 1);
1421 	report("test vmptrld with unaligned vmcs",
1422 	       make_vmcs_current(tmp_root) == 1);
1423 
1424 	/* gpa bits beyond physical address width are set*/
1425 	tmp_root = (struct vmcs *)((intptr_t)vmcs |
1426 				   ((u64)1 << (width+1)));
1427 	report("test vmptrld with vmcs address bits set beyond physical address width",
1428 	       make_vmcs_current(tmp_root) == 1);
1429 
1430 	/* Pass VMXON region */
1431 	assert(!vmcs_clear(vmcs));
1432 	assert(!make_vmcs_current(vmcs));
1433 	tmp_root = (struct vmcs *)bsp_vmxon_region;
1434 	report("test vmptrld with vmxon region",
1435 	       make_vmcs_current(tmp_root) == 1);
1436 	report("test vmptrld with vmxon region vm-instruction error",
1437 	       vmcs_read(VMX_INST_ERROR) == VMXERR_VMPTRLD_VMXON_POINTER);
1438 
1439 	report("test vmptrld with valid vmcs region", make_vmcs_current(vmcs) == 0);
1440 }
1441 
1442 static void test_vmptrst(void)
1443 {
1444 	int ret;
1445 	struct vmcs *vmcs1, *vmcs2;
1446 
1447 	vmcs1 = alloc_page();
1448 	init_vmcs(&vmcs1);
1449 	ret = vmcs_save(&vmcs2);
1450 	report("test vmptrst", (!ret) && (vmcs1 == vmcs2));
1451 }
1452 
1453 struct vmx_ctl_msr {
1454 	const char *name;
1455 	u32 index, true_index;
1456 	u32 default1;
1457 } vmx_ctl_msr[] = {
1458 	{ "MSR_IA32_VMX_PINBASED_CTLS", MSR_IA32_VMX_PINBASED_CTLS,
1459 	  MSR_IA32_VMX_TRUE_PIN, 0x16 },
1460 	{ "MSR_IA32_VMX_PROCBASED_CTLS", MSR_IA32_VMX_PROCBASED_CTLS,
1461 	  MSR_IA32_VMX_TRUE_PROC, 0x401e172 },
1462 	{ "MSR_IA32_VMX_PROCBASED_CTLS2", MSR_IA32_VMX_PROCBASED_CTLS2,
1463 	  MSR_IA32_VMX_PROCBASED_CTLS2, 0 },
1464 	{ "MSR_IA32_VMX_EXIT_CTLS", MSR_IA32_VMX_EXIT_CTLS,
1465 	  MSR_IA32_VMX_TRUE_EXIT, 0x36dff },
1466 	{ "MSR_IA32_VMX_ENTRY_CTLS", MSR_IA32_VMX_ENTRY_CTLS,
1467 	  MSR_IA32_VMX_TRUE_ENTRY, 0x11ff },
1468 };
1469 
1470 static void test_vmx_caps(void)
1471 {
1472 	u64 val, default1, fixed0, fixed1;
1473 	union vmx_ctrl_msr ctrl, true_ctrl;
1474 	unsigned int n;
1475 	bool ok;
1476 
1477 	printf("\nTest suite: VMX capability reporting\n");
1478 
1479 	report("MSR_IA32_VMX_BASIC",
1480 	       (basic.revision & (1ul << 31)) == 0 &&
1481 	       basic.size > 0 && basic.size <= 4096 &&
1482 	       (basic.type == 0 || basic.type == 6) &&
1483 	       basic.reserved1 == 0 && basic.reserved2 == 0);
1484 
1485 	val = rdmsr(MSR_IA32_VMX_MISC);
1486 	report("MSR_IA32_VMX_MISC",
1487 	       (!(ctrl_cpu_rev[1].clr & CPU_URG) || val & (1ul << 5)) &&
1488 	       ((val >> 16) & 0x1ff) <= 256 &&
1489 	       (val & 0x80007e00) == 0);
1490 
1491 	for (n = 0; n < ARRAY_SIZE(vmx_ctl_msr); n++) {
1492 		ctrl.val = rdmsr(vmx_ctl_msr[n].index);
1493 		default1 = vmx_ctl_msr[n].default1;
1494 		ok = (ctrl.set & default1) == default1;
1495 		ok = ok && (ctrl.set & ~ctrl.clr) == 0;
1496 		if (ok && basic.ctrl) {
1497 			true_ctrl.val = rdmsr(vmx_ctl_msr[n].true_index);
1498 			ok = ctrl.clr == true_ctrl.clr;
1499 			ok = ok && ctrl.set == (true_ctrl.set | default1);
1500 		}
1501 		report("%s", ok, vmx_ctl_msr[n].name);
1502 	}
1503 
1504 	fixed0 = rdmsr(MSR_IA32_VMX_CR0_FIXED0);
1505 	fixed1 = rdmsr(MSR_IA32_VMX_CR0_FIXED1);
1506 	report("MSR_IA32_VMX_IA32_VMX_CR0_FIXED0/1",
1507 	       ((fixed0 ^ fixed1) & ~fixed1) == 0);
1508 
1509 	fixed0 = rdmsr(MSR_IA32_VMX_CR4_FIXED0);
1510 	fixed1 = rdmsr(MSR_IA32_VMX_CR4_FIXED1);
1511 	report("MSR_IA32_VMX_IA32_VMX_CR4_FIXED0/1",
1512 	       ((fixed0 ^ fixed1) & ~fixed1) == 0);
1513 
1514 	val = rdmsr(MSR_IA32_VMX_VMCS_ENUM);
1515 	report("MSR_IA32_VMX_VMCS_ENUM",
1516 	       (val & VMCS_FIELD_INDEX_MASK) >= 0x2a &&
1517 	       (val & 0xfffffffffffffc01Ull) == 0);
1518 
1519 	val = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
1520 	report("MSR_IA32_VMX_EPT_VPID_CAP",
1521 	       (val & 0xfffff07ef98cbebeUll) == 0);
1522 }
1523 
1524 /* This function can only be called in guest */
1525 static void __attribute__((__used__)) hypercall(u32 hypercall_no)
1526 {
1527 	u64 val = 0;
1528 	val = (hypercall_no & HYPERCALL_MASK) | HYPERCALL_BIT;
1529 	hypercall_field = val;
1530 	asm volatile("vmcall\n\t");
1531 }
1532 
1533 static bool is_hypercall(void)
1534 {
1535 	ulong reason, hyper_bit;
1536 
1537 	reason = vmcs_read(EXI_REASON) & 0xff;
1538 	hyper_bit = hypercall_field & HYPERCALL_BIT;
1539 	if (reason == VMX_VMCALL && hyper_bit)
1540 		return true;
1541 	return false;
1542 }
1543 
1544 static int handle_hypercall(void)
1545 {
1546 	ulong hypercall_no;
1547 
1548 	hypercall_no = hypercall_field & HYPERCALL_MASK;
1549 	hypercall_field = 0;
1550 	switch (hypercall_no) {
1551 	case HYPERCALL_VMEXIT:
1552 		return VMX_TEST_VMEXIT;
1553 	case HYPERCALL_VMABORT:
1554 		return VMX_TEST_VMABORT;
1555 	case HYPERCALL_VMSKIP:
1556 		return VMX_TEST_VMSKIP;
1557 	default:
1558 		printf("ERROR : Invalid hypercall number : %ld\n", hypercall_no);
1559 	}
1560 	return VMX_TEST_EXIT;
1561 }
1562 
1563 static void continue_abort(void)
1564 {
1565 	assert(!in_guest);
1566 	printf("Host was here when guest aborted:\n");
1567 	dump_stack();
1568 	longjmp(abort_target, 1);
1569 	abort();
1570 }
1571 
1572 void __abort_test(void)
1573 {
1574 	if (in_guest)
1575 		hypercall(HYPERCALL_VMABORT);
1576 	else
1577 		longjmp(abort_target, 1);
1578 	abort();
1579 }
1580 
1581 static void continue_skip(void)
1582 {
1583 	assert(!in_guest);
1584 	longjmp(abort_target, 1);
1585 	abort();
1586 }
1587 
1588 void test_skip(const char *msg)
1589 {
1590 	printf("%s skipping test: %s\n", in_guest ? "Guest" : "Host", msg);
1591 	if (in_guest)
1592 		hypercall(HYPERCALL_VMABORT);
1593 	else
1594 		longjmp(abort_target, 1);
1595 	abort();
1596 }
1597 
1598 static int exit_handler(void)
1599 {
1600 	int ret;
1601 
1602 	current->exits++;
1603 	regs.rflags = vmcs_read(GUEST_RFLAGS);
1604 	if (is_hypercall())
1605 		ret = handle_hypercall();
1606 	else
1607 		ret = current->exit_handler();
1608 	vmcs_write(GUEST_RFLAGS, regs.rflags);
1609 
1610 	return ret;
1611 }
1612 
1613 /*
1614  * Called if vmlaunch or vmresume fails.
1615  *	@early    - failure due to "VMX controls and host-state area" (26.2)
1616  *	@vmlaunch - was this a vmlaunch or vmresume
1617  *	@rflags   - host rflags
1618  */
1619 static int
1620 entry_failure_handler(struct vmentry_failure *failure)
1621 {
1622 	if (current->entry_failure_handler)
1623 		return current->entry_failure_handler(failure);
1624 	else
1625 		return VMX_TEST_EXIT;
1626 }
1627 
1628 /*
1629  * Tries to enter the guest. Returns true if entry succeeded. Otherwise,
1630  * populates @failure.
1631  */
1632 static void vmx_enter_guest(struct vmentry_failure *failure)
1633 {
1634 	failure->early = 0;
1635 
1636 	in_guest = 1;
1637 	asm volatile (
1638 		"mov %[HOST_RSP], %%rdi\n\t"
1639 		"vmwrite %%rsp, %%rdi\n\t"
1640 		LOAD_GPR_C
1641 		"cmpb $0, %[launched]\n\t"
1642 		"jne 1f\n\t"
1643 		"vmlaunch\n\t"
1644 		"jmp 2f\n\t"
1645 		"1: "
1646 		"vmresume\n\t"
1647 		"2: "
1648 		SAVE_GPR_C
1649 		"pushf\n\t"
1650 		"pop %%rdi\n\t"
1651 		"mov %%rdi, %[failure_flags]\n\t"
1652 		"movl $1, %[failure_early]\n\t"
1653 		"jmp 3f\n\t"
1654 		"vmx_return:\n\t"
1655 		SAVE_GPR_C
1656 		"3: \n\t"
1657 		: [failure_early]"+m"(failure->early),
1658 		  [failure_flags]"=m"(failure->flags)
1659 		: [launched]"m"(launched), [HOST_RSP]"i"(HOST_RSP)
1660 		: "rdi", "memory", "cc"
1661 	);
1662 	in_guest = 0;
1663 
1664 	failure->vmlaunch = !launched;
1665 	failure->instr = launched ? "vmresume" : "vmlaunch";
1666 }
1667 
1668 static int vmx_run(void)
1669 {
1670 	while (1) {
1671 		u32 ret;
1672 		bool entered;
1673 		struct vmentry_failure failure;
1674 
1675 		vmx_enter_guest(&failure);
1676 		entered = !failure.early &&
1677 			  !(vmcs_read(EXI_REASON) & VMX_ENTRY_FAILURE);
1678 
1679 		if (entered) {
1680 			/*
1681 			 * VMCS isn't in "launched" state if there's been any
1682 			 * entry failure (early or otherwise).
1683 			 */
1684 			launched = 1;
1685 			ret = exit_handler();
1686 		} else {
1687 			ret = entry_failure_handler(&failure);
1688 		}
1689 
1690 		switch (ret) {
1691 		case VMX_TEST_RESUME:
1692 			continue;
1693 		case VMX_TEST_VMEXIT:
1694 			guest_finished = 1;
1695 			return 0;
1696 		case VMX_TEST_EXIT:
1697 			break;
1698 		default:
1699 			printf("ERROR : Invalid %s_handler return val %d.\n",
1700 			       entered ? "exit" : "entry_failure",
1701 			       ret);
1702 			break;
1703 		}
1704 
1705 		if (entered)
1706 			print_vmexit_info();
1707 		else
1708 			print_vmentry_failure_info(&failure);
1709 		abort();
1710 	}
1711 }
1712 
1713 static void run_teardown_step(struct test_teardown_step *step)
1714 {
1715 	step->func(step->data);
1716 }
1717 
1718 static int test_run(struct vmx_test *test)
1719 {
1720 	int r;
1721 
1722 	/* Validate V2 interface. */
1723 	if (test->v2) {
1724 		int ret = 0;
1725 		if (test->init || test->guest_main || test->exit_handler ||
1726 		    test->syscall_handler) {
1727 			report("V2 test cannot specify V1 callbacks.", 0);
1728 			ret = 1;
1729 		}
1730 		if (ret)
1731 			return ret;
1732 	}
1733 
1734 	if (test->name == NULL)
1735 		test->name = "(no name)";
1736 	if (vmx_on()) {
1737 		printf("%s : vmxon failed.\n", __func__);
1738 		return 1;
1739 	}
1740 
1741 	init_vmcs(&(test->vmcs));
1742 	/* Directly call test->init is ok here, init_vmcs has done
1743 	   vmcs init, vmclear and vmptrld*/
1744 	if (test->init && test->init(test->vmcs) != VMX_TEST_START)
1745 		goto out;
1746 	teardown_count = 0;
1747 	v2_guest_main = NULL;
1748 	test->exits = 0;
1749 	current = test;
1750 	regs = test->guest_regs;
1751 	vmcs_write(GUEST_RFLAGS, regs.rflags | 0x2);
1752 	launched = 0;
1753 	guest_finished = 0;
1754 	printf("\nTest suite: %s\n", test->name);
1755 
1756 	r = setjmp(abort_target);
1757 	if (r) {
1758 		assert(!in_guest);
1759 		goto out;
1760 	}
1761 
1762 
1763 	if (test->v2)
1764 		test->v2();
1765 	else
1766 		vmx_run();
1767 
1768 	while (teardown_count > 0)
1769 		run_teardown_step(&teardown_steps[--teardown_count]);
1770 
1771 	if (launched && !guest_finished)
1772 		report("Guest didn't run to completion.", 0);
1773 
1774 out:
1775 	if (vmx_off()) {
1776 		printf("%s : vmxoff failed.\n", __func__);
1777 		return 1;
1778 	}
1779 	return 0;
1780 }
1781 
1782 /*
1783  * Add a teardown step. Executed after the test's main function returns.
1784  * Teardown steps executed in reverse order.
1785  */
1786 void test_add_teardown(test_teardown_func func, void *data)
1787 {
1788 	struct test_teardown_step *step;
1789 
1790 	TEST_ASSERT_MSG(teardown_count < MAX_TEST_TEARDOWN_STEPS,
1791 			"There are already %d teardown steps.",
1792 			teardown_count);
1793 	step = &teardown_steps[teardown_count++];
1794 	step->func = func;
1795 	step->data = data;
1796 }
1797 
1798 /*
1799  * Set the target of the first enter_guest call. Can only be called once per
1800  * test. Must be called before first enter_guest call.
1801  */
1802 void test_set_guest(test_guest_func func)
1803 {
1804 	assert(current->v2);
1805 	TEST_ASSERT_MSG(!v2_guest_main, "Already set guest func.");
1806 	v2_guest_main = func;
1807 }
1808 
1809 static void check_for_guest_termination(void)
1810 {
1811 	if (is_hypercall()) {
1812 		int ret;
1813 
1814 		ret = handle_hypercall();
1815 		switch (ret) {
1816 		case VMX_TEST_VMEXIT:
1817 			guest_finished = 1;
1818 			break;
1819 		case VMX_TEST_VMABORT:
1820 			continue_abort();
1821 			break;
1822 		case VMX_TEST_VMSKIP:
1823 			continue_skip();
1824 			break;
1825 		default:
1826 			printf("ERROR : Invalid handle_hypercall return %d.\n",
1827 			       ret);
1828 			abort();
1829 		}
1830 	}
1831 }
1832 
1833 #define        ABORT_ON_EARLY_VMENTRY_FAIL     0x1
1834 #define        ABORT_ON_INVALID_GUEST_STATE    0x2
1835 
1836 /*
1837  * Enters the guest (or launches it for the first time). Error to call once the
1838  * guest has returned (i.e., run past the end of its guest() function).
1839  */
1840 static void __enter_guest(u8 abort_flag, struct vmentry_failure *failure)
1841 {
1842 	TEST_ASSERT_MSG(v2_guest_main,
1843 			"Never called test_set_guest_func!");
1844 
1845 	TEST_ASSERT_MSG(!guest_finished,
1846 			"Called enter_guest() after guest returned.");
1847 
1848 	vmx_enter_guest(failure);
1849 	if ((abort_flag & ABORT_ON_EARLY_VMENTRY_FAIL && failure->early) ||
1850 	    (abort_flag & ABORT_ON_INVALID_GUEST_STATE &&
1851 	    vmcs_read(EXI_REASON) & VMX_ENTRY_FAILURE)) {
1852 
1853 		print_vmentry_failure_info(failure);
1854 		abort();
1855 	}
1856 
1857 	if (!failure->early && !(vmcs_read(EXI_REASON) & VMX_ENTRY_FAILURE)) {
1858 		launched = 1;
1859 		check_for_guest_termination();
1860 	}
1861 }
1862 
1863 void enter_guest_with_bad_controls(void)
1864 {
1865 	struct vmentry_failure failure = {0};
1866 
1867 	TEST_ASSERT_MSG(v2_guest_main,
1868 			"Never called test_set_guest_func!");
1869 
1870 	TEST_ASSERT_MSG(!guest_finished,
1871 			"Called enter_guest() after guest returned.");
1872 
1873 	__enter_guest(ABORT_ON_INVALID_GUEST_STATE, &failure);
1874 	report("failure occurred early", failure.early);
1875 	report("FLAGS set correctly",
1876 	       (failure.flags & VMX_ENTRY_FLAGS) == X86_EFLAGS_ZF);
1877 	report("VM-Inst Error # is %d (VM entry with invalid control field(s))",
1878 	       vmcs_read(VMX_INST_ERROR) == VMXERR_ENTRY_INVALID_CONTROL_FIELD,
1879 	       VMXERR_ENTRY_INVALID_CONTROL_FIELD);
1880 
1881 	/*
1882 	 * This if statement shouldn't fire, as the entire premise of this
1883 	 * function is that VM entry is expected to fail, rather than succeed
1884 	 * and execute to termination. However, if the VM entry does
1885 	 * unexpectedly succeed, it's nice to check whether the guest has
1886 	 * terminated, to reduce the number of error messages.
1887 	 */
1888 	if (!failure.early)
1889 		check_for_guest_termination();
1890 }
1891 
1892 void enter_guest(void)
1893 {
1894 	struct vmentry_failure failure = {0};
1895 
1896 	__enter_guest(ABORT_ON_EARLY_VMENTRY_FAIL |
1897 		      ABORT_ON_INVALID_GUEST_STATE, &failure);
1898 }
1899 
1900 void enter_guest_with_invalid_guest_state(void)
1901 {
1902 	struct vmentry_failure failure = {0};
1903 
1904 	__enter_guest(ABORT_ON_EARLY_VMENTRY_FAIL, &failure);
1905 }
1906 
1907 extern struct vmx_test vmx_tests[];
1908 
1909 static bool
1910 test_wanted(const char *name, const char *filters[], int filter_count)
1911 {
1912 	int i;
1913 	bool positive = false;
1914 	bool match = false;
1915 	char clean_name[strlen(name) + 1];
1916 	char *c;
1917 	const char *n;
1918 
1919 	/* Replace spaces with underscores. */
1920 	n = name;
1921 	c = &clean_name[0];
1922 	do *c++ = (*n == ' ') ? '_' : *n;
1923 	while (*n++);
1924 
1925 	for (i = 0; i < filter_count; i++) {
1926 		const char *filter = filters[i];
1927 
1928 		if (filter[0] == '-') {
1929 			if (simple_glob(clean_name, filter + 1))
1930 				return false;
1931 		} else {
1932 			positive = true;
1933 			match |= simple_glob(clean_name, filter);
1934 		}
1935 	}
1936 
1937 	if (!positive || match) {
1938 		matched++;
1939 		return true;
1940 	} else {
1941 		return false;
1942 	}
1943 }
1944 
1945 int main(int argc, const char *argv[])
1946 {
1947 	int i = 0;
1948 
1949 	setup_vm();
1950 	smp_init();
1951 	hypercall_field = 0;
1952 
1953 	/* We want xAPIC mode to test MMIO passthrough from L1 (us) to L2.  */
1954 	reset_apic();
1955 
1956 	argv++;
1957 	argc--;
1958 
1959 	if (!this_cpu_has(X86_FEATURE_VMX)) {
1960 		printf("WARNING: vmx not supported, add '-cpu host'\n");
1961 		goto exit;
1962 	}
1963 	init_bsp_vmx();
1964 	if (test_wanted("test_vmx_feature_control", argv, argc)) {
1965 		/* Sets MSR_IA32_FEATURE_CONTROL to 0x5 */
1966 		if (test_vmx_feature_control() != 0)
1967 			goto exit;
1968 	} else {
1969 		enable_vmx();
1970 	}
1971 
1972 	if (test_wanted("test_vmxon", argv, argc)) {
1973 		/* Enables VMX */
1974 		if (test_vmxon() != 0)
1975 			goto exit;
1976 	} else {
1977 		if (vmx_on()) {
1978 			report("vmxon", 0);
1979 			goto exit;
1980 		}
1981 	}
1982 
1983 	if (test_wanted("test_vmptrld", argv, argc))
1984 		test_vmptrld();
1985 	if (test_wanted("test_vmclear", argv, argc))
1986 		test_vmclear();
1987 	if (test_wanted("test_vmptrst", argv, argc))
1988 		test_vmptrst();
1989 	if (test_wanted("test_vmwrite_vmread", argv, argc))
1990 		test_vmwrite_vmread();
1991 	if (test_wanted("test_vmcs_high", argv, argc))
1992 		test_vmcs_high();
1993 	if (test_wanted("test_vmcs_lifecycle", argv, argc))
1994 		test_vmcs_lifecycle();
1995 	if (test_wanted("test_vmx_caps", argv, argc))
1996 		test_vmx_caps();
1997 
1998 	/* Balance vmxon from test_vmxon. */
1999 	vmx_off();
2000 
2001 	for (; vmx_tests[i].name != NULL; i++) {
2002 		if (!test_wanted(vmx_tests[i].name, argv, argc))
2003 			continue;
2004 		if (test_run(&vmx_tests[i]))
2005 			goto exit;
2006 	}
2007 
2008 	if (!matched)
2009 		report("command line didn't match any tests!", matched);
2010 
2011 exit:
2012 	return report_summary();
2013 }
2014