#include "libcflat.h"
#include "desc.h"
#include "processor.h"
#include "asm/page.h"
#include "x86/vm.h"
#include "access.h"

static bool verbose = false;

typedef unsigned long pt_element_t;
static int invalid_mask;

/* Test code/data is at 32MiB, paging structures at 33MiB. */
#define AT_CODE_DATA_PHYS	  (32 * 1024 * 1024)
#define AT_PAGING_STRUCTURES_PHYS (33 * 1024 * 1024)

#define PT_BASE_ADDR_MASK ((pt_element_t)((((pt_element_t)1 << 36) - 1) & PAGE_MASK))
#define PT_PSE_BASE_ADDR_MASK (PT_BASE_ADDR_MASK & ~(1ull << 21))

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PFERR_RESERVED_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
#define PFERR_PK_MASK (1U << 5)

#define MSR_EFER 0xc0000080
#define EFER_NX_MASK            (1ull << 11)

#define PT_INDEX(address, level)       \
	  (((address) >> (12 + ((level)-1) * 9)) & 511)

/*
 * Page table access check tests.  Each number/bit represents an individual
 * test case.  The main test will bump a counter by 1 to run all permutations
 * of the below test cases (sans illegal combinations).
 *
 * Keep the PRESENT and reserved bits in the higher numbers so that they aren't
 * toggled on every test, e.g. to keep entries in the TLB.
 */
enum {
	AC_PTE_WRITABLE_BIT,
	AC_PTE_USER_BIT,
	AC_PTE_ACCESSED_BIT,
	AC_PTE_DIRTY_BIT,
	AC_PTE_NX_BIT,
	AC_PTE_PRESENT_BIT,
	AC_PTE_BIT51_BIT,
	AC_PTE_BIT36_BIT,

	AC_PDE_WRITABLE_BIT,
	AC_PDE_USER_BIT,
	AC_PDE_ACCESSED_BIT,
	AC_PDE_DIRTY_BIT,
	AC_PDE_PSE_BIT,
	AC_PDE_NX_BIT,
	AC_PDE_PRESENT_BIT,
	AC_PDE_BIT51_BIT,
	AC_PDE_BIT36_BIT,
	AC_PDE_BIT13_BIT,

	/*
	 * Special test case to DISABLE the Writable bit on the page
	 * directory pointer table entry.
	 */
	AC_PDPTE_NO_WRITABLE_BIT,

	AC_PKU_AD_BIT,
	AC_PKU_WD_BIT,
	AC_PKU_PKEY_BIT,

	AC_ACCESS_USER_BIT,
	AC_ACCESS_WRITE_BIT,
	AC_ACCESS_FETCH_BIT,
	AC_ACCESS_TWICE_BIT,

	AC_CPU_EFER_NX_BIT,
	AC_CPU_CR0_WP_BIT,
	AC_CPU_CR4_SMEP_BIT,
	AC_CPU_CR4_PKE_BIT,

	AC_FEP_BIT,

	NR_AC_FLAGS,
};

#define AC_PTE_PRESENT_MASK   (1 << AC_PTE_PRESENT_BIT)
#define AC_PTE_WRITABLE_MASK  (1 << AC_PTE_WRITABLE_BIT)
#define AC_PTE_USER_MASK      (1 << AC_PTE_USER_BIT)
#define AC_PTE_ACCESSED_MASK  (1 << AC_PTE_ACCESSED_BIT)
#define AC_PTE_DIRTY_MASK     (1 << AC_PTE_DIRTY_BIT)
#define AC_PTE_NX_MASK        (1 << AC_PTE_NX_BIT)
#define AC_PTE_BIT51_MASK     (1 << AC_PTE_BIT51_BIT)
#define AC_PTE_BIT36_MASK     (1 << AC_PTE_BIT36_BIT)

#define AC_PDE_PRESENT_MASK   (1 << AC_PDE_PRESENT_BIT)
#define AC_PDE_WRITABLE_MASK  (1 << AC_PDE_WRITABLE_BIT)
#define AC_PDE_USER_MASK      (1 << AC_PDE_USER_BIT)
#define AC_PDE_ACCESSED_MASK  (1 << AC_PDE_ACCESSED_BIT)
#define AC_PDE_DIRTY_MASK     (1 << AC_PDE_DIRTY_BIT)
#define AC_PDE_PSE_MASK       (1 << AC_PDE_PSE_BIT)
#define AC_PDE_NX_MASK        (1 << AC_PDE_NX_BIT)
#define AC_PDE_BIT51_MASK     (1 << AC_PDE_BIT51_BIT)
#define AC_PDE_BIT36_MASK     (1 << AC_PDE_BIT36_BIT)
#define AC_PDE_BIT13_MASK     (1 << AC_PDE_BIT13_BIT)

#define AC_PDPTE_NO_WRITABLE_MASK  (1 << AC_PDPTE_NO_WRITABLE_BIT)

#define AC_PKU_AD_MASK        (1 << AC_PKU_AD_BIT)
#define AC_PKU_WD_MASK        (1 << AC_PKU_WD_BIT)
#define AC_PKU_PKEY_MASK      (1 << AC_PKU_PKEY_BIT)

#define AC_ACCESS_USER_MASK   (1 << AC_ACCESS_USER_BIT)
#define AC_ACCESS_WRITE_MASK  (1 << AC_ACCESS_WRITE_BIT)
#define AC_ACCESS_FETCH_MASK  (1 << AC_ACCESS_FETCH_BIT)
#define AC_ACCESS_TWICE_MASK  (1 << AC_ACCESS_TWICE_BIT)

#define AC_CPU_EFER_NX_MASK   (1 << AC_CPU_EFER_NX_BIT)
#define AC_CPU_CR0_WP_MASK    (1 << AC_CPU_CR0_WP_BIT)
#define AC_CPU_CR4_SMEP_MASK  (1 << AC_CPU_CR4_SMEP_BIT)
#define AC_CPU_CR4_PKE_MASK   (1 << AC_CPU_CR4_PKE_BIT)

#define AC_FEP_MASK           (1 << AC_FEP_BIT)

const char *ac_names[] = {
	[AC_PTE_PRESENT_BIT] = "pte.p",
	[AC_PTE_ACCESSED_BIT] = "pte.a",
	[AC_PTE_WRITABLE_BIT] = "pte.rw",
	[AC_PTE_USER_BIT] = "pte.user",
	[AC_PTE_DIRTY_BIT] = "pte.d",
	[AC_PTE_NX_BIT] = "pte.nx",
	[AC_PTE_BIT51_BIT] = "pte.51",
	[AC_PTE_BIT36_BIT] = "pte.36",
	[AC_PDE_PRESENT_BIT] = "pde.p",
	[AC_PDE_ACCESSED_BIT] = "pde.a",
	[AC_PDE_WRITABLE_BIT] = "pde.rw",
	[AC_PDE_USER_BIT] = "pde.user",
	[AC_PDE_DIRTY_BIT] = "pde.d",
	[AC_PDE_PSE_BIT] = "pde.pse",
	[AC_PDE_NX_BIT] = "pde.nx",
	[AC_PDE_BIT51_BIT] = "pde.51",
	[AC_PDE_BIT36_BIT] = "pde.36",
	[AC_PDE_BIT13_BIT] = "pde.13",
	[AC_PDPTE_NO_WRITABLE_BIT] = "pdpte.ro",
	[AC_PKU_AD_BIT] = "pkru.ad",
	[AC_PKU_WD_BIT] = "pkru.wd",
	[AC_PKU_PKEY_BIT] = "pkey=1",
	[AC_ACCESS_WRITE_BIT] = "write",
	[AC_ACCESS_USER_BIT] = "user",
	[AC_ACCESS_FETCH_BIT] = "fetch",
	[AC_ACCESS_TWICE_BIT] = "twice",
	[AC_CPU_EFER_NX_BIT] = "efer.nx",
	[AC_CPU_CR0_WP_BIT] = "cr0.wp",
	[AC_CPU_CR4_SMEP_BIT] = "cr4.smep",
	[AC_CPU_CR4_PKE_BIT] = "cr4.pke",
	[AC_FEP_BIT] = "fep",
};

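/*
 * All physical memory used by the test is identity-mapped, so converting a
 * physical address to a usable virtual address is a plain cast.
 */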
static inline void *va(pt_element_t phys)
{
	return (void *)phys;
}

typedef struct {
	pt_element_t pt_pool_pa;
	unsigned int pt_pool_current;
	int pt_levels;
} ac_pt_env_t;

typedef struct {
	unsigned flags;
	void *virt;
	pt_element_t phys;
	pt_element_t *ptep;
	pt_element_t expected_pte;
	pt_element_t *pdep;
	pt_element_t expected_pde;
	pt_element_t ignore_pde;
	int expected_fault;
	unsigned expected_error;
	int pt_levels;

	/* 5-level paging, 1-based to avoid math. */
	pt_element_t page_tables[6];
} ac_test_t;

typedef struct {
	unsigned short limit;
	unsigned long linear_addr;
} __attribute__((packed)) descriptor_table_t;


static void ac_test_show(ac_test_t *at);

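/*
 * Shadow copies of guest control state.  The set_* helpers below write the
 * real registers/MSR only when the shadow value actually changes, avoiding
 * redundant (and, under KVM, exit-inducing) CR and MSR accesses.
 */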
static unsigned long shadow_cr0;
static unsigned long shadow_cr3;
static unsigned long shadow_cr4;
static unsigned long long shadow_efer;

typedef void (*walk_fn)(pt_element_t *ptep, int level, unsigned long virt);

/* Returns the size of the range covered by the last processed entry. */
static unsigned long walk_va(ac_test_t *at, int min_level, unsigned long virt,
			     walk_fn callback, bool leaf_only)
{
	unsigned long parent_pte = shadow_cr3;
	int i;

	for (i = at->pt_levels; i >= min_level; --i) {
		pt_element_t *parent_pt = va(parent_pte & PT_BASE_ADDR_MASK);
		unsigned int index = PT_INDEX(virt, i);
		pt_element_t *ptep = &parent_pt[index];

		assert(!leaf_only || (*ptep & PT_PRESENT_MASK));

		if (!leaf_only || i == 1 || (*ptep & PT_PAGE_SIZE_MASK))
			callback(ptep, i, virt);

		if (i == 1 || *ptep & PT_PAGE_SIZE_MASK)
			break;

		parent_pte = *ptep;
	}

	return 1ul << PGDIR_BITS(i);
}

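/* Invoke @callback on the leaf entry that maps each page in [@virt, @end). */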
static void walk_ptes(ac_test_t *at, unsigned long virt, unsigned long end,
		      walk_fn callback)
{
	unsigned long page_size;

	for ( ; virt < end; virt = ALIGN_DOWN(virt + page_size, page_size))
		page_size = walk_va(at, 1, virt, callback, true);
}

static void set_cr0_wp(int wp)
{
	unsigned long cr0 = shadow_cr0;

	cr0 &= ~X86_CR0_WP;
	if (wp)
		cr0 |= X86_CR0_WP;
	if (cr0 != shadow_cr0) {
		write_cr0(cr0);
		shadow_cr0 = cr0;
	}
}

static void clear_user_mask(pt_element_t *ptep, int level, unsigned long virt)
{
	*ptep &= ~PT_USER_MASK;

	/* Flush to avoid spurious #PF */
	invlpg((void*)virt);
}

static void set_user_mask(pt_element_t *ptep, int level, unsigned long virt)
{
	*ptep |= PT_USER_MASK;

	/* Flush to avoid spurious #PF */
	invlpg((void*)virt);
}

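/*
 * Toggling CR4.SMEP requires care: with SMEP=1 the kernel may no longer
 * execute from USER pages, so the test's own code must be made
 * supervisor-only first (and restored to USER if SMEP is being cleared or
 * the CR4 write faults).  Returns 0 on success, else the vector of the
 * fault raised by the CR4 write.
 */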
static unsigned set_cr4_smep(ac_test_t *at, int smep)
{
	extern char stext, etext;
	unsigned long code_start = (unsigned long)&stext;
	unsigned long code_end = (unsigned long)&etext;
	unsigned long cr4 = shadow_cr4;
	unsigned r;

	cr4 &= ~X86_CR4_SMEP;
	if (smep)
		cr4 |= X86_CR4_SMEP;
	if (cr4 == shadow_cr4)
		return 0;

	if (smep)
		walk_ptes(at, code_start, code_end, clear_user_mask);
	r = write_cr4_safe(cr4);
	if (r || !smep)
		walk_ptes(at, code_start, code_end, set_user_mask);
	if (!r)
		shadow_cr4 = cr4;
	return r;
}

static void set_cr4_pke(int pke)
{
	unsigned long cr4 = shadow_cr4;

	cr4 &= ~X86_CR4_PKE;
	if (pke)
		cr4 |= X86_CR4_PKE;
	if (cr4 == shadow_cr4)
		return;

	/* Check that protection keys do not affect accesses when CR4.PKE=0.  */
	if ((shadow_cr4 & X86_CR4_PKE) && !pke)
		write_pkru(0xfffffffc);
	write_cr4(cr4);
	shadow_cr4 = cr4;
}

static void set_efer_nx(int nx)
{
	unsigned long long efer = shadow_efer;

	efer &= ~EFER_NX_MASK;
	if (nx)
		efer |= EFER_NX_MASK;
	if (efer != shadow_efer) {
		wrmsr(MSR_EFER, efer);
		shadow_efer = efer;
	}
}

static void ac_env_int(ac_pt_env_t *pt_env, int page_table_levels)
{
	extern char page_fault, kernel_entry;
	set_idt_entry(14, &page_fault, 0);
	set_idt_entry(0x20, &kernel_entry, 3);

	pt_env->pt_pool_pa = AT_PAGING_STRUCTURES_PHYS;
	pt_env->pt_pool_current = 0;
	pt_env->pt_levels = page_table_levels;
}

static pt_element_t ac_test_alloc_pt(ac_pt_env_t *pt_env)
{
	pt_element_t pt;

	/*
	 * Each test needs at most pt_levels-1 structures per virtual address,
	 * and no existing scenario uses more than four addresses.
	 */
	assert(pt_env->pt_pool_current < (4 * (pt_env->pt_levels - 1)));

	pt = pt_env->pt_pool_pa + (pt_env->pt_pool_current * PAGE_SIZE);
	pt_env->pt_pool_current++;
	memset(va(pt), 0, PAGE_SIZE);
	return pt;
}

static void __ac_test_init(ac_test_t *at, unsigned long virt,
			   ac_pt_env_t *pt_env, ac_test_t *buddy)
{
	unsigned long buddy_virt = buddy ? (unsigned long)buddy->virt : 0;
	pt_element_t *root_pt = va(shadow_cr3 & PT_BASE_ADDR_MASK);
	int i;

	/*
	 * The test infrastructure, e.g. this function, must use a different
	 * top-level SPTE than the test, otherwise modifying SPTEs can affect
	 * normal behavior, e.g. crash the test due to marking code SPTEs
	 * USER when CR4.SMEP=1.
	 */
	assert(PT_INDEX(virt, pt_env->pt_levels) !=
	       PT_INDEX((unsigned long)__ac_test_init, pt_env->pt_levels));

	set_efer_nx(1);
	set_cr0_wp(1);
	at->flags = 0;
	at->virt = (void *)virt;
	at->phys = AT_CODE_DATA_PHYS;
	at->pt_levels = pt_env->pt_levels;

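	/*
	 * Levels 0 and 1 are never backed by a dedicated allocation in the
	 * loop below; poison them so any accidental use stands out.
	 */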
	at->page_tables[0] = -1ull;
	at->page_tables[1] = -1ull;

	/*
	 * Zap the existing top-level PTE as it may be reused from a previous
	 * sub-test.  This allows runtime PTE modification to assert that two
	 * overlapping walks don't try to install different paging structures.
	 */
	root_pt[PT_INDEX(virt, pt_env->pt_levels)] = 0;

	for (i = at->pt_levels; i > 1; i--) {
		/*
		 * Buddies can reuse any part of the walk that shares the same
		 * index.  This is weird, but intentional, as several tests
		 * want different walks to merge at lower levels.
		 */
		if (buddy && PT_INDEX(virt, i) == PT_INDEX(buddy_virt, i))
			at->page_tables[i] = buddy->page_tables[i];
		else
			at->page_tables[i] = ac_test_alloc_pt(pt_env);
	}
}

static void ac_test_init(ac_test_t *at, unsigned long virt, ac_pt_env_t *pt_env)
{
	__ac_test_init(at, virt, pt_env, NULL);
}

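/*
 * Advance to the next flag combination while skipping bits in invalid_mask:
 * forcing the invalid bits to 1 before the increment lets the carry ripple
 * straight through them, and the final AND clears them again.
 */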
static int ac_test_bump_one(ac_test_t *at)
{
	at->flags = ((at->flags | invalid_mask) + 1) & ~invalid_mask;
	return at->flags < (1 << NR_AC_FLAGS);
}

#define F(x)  ((flags & x##_MASK) != 0)

static bool ac_test_legal(ac_test_t *at)
{
	int flags = at->flags;
	unsigned reserved;

	if (F(AC_CPU_CR4_SMEP))
		return false;

	if (F(AC_ACCESS_FETCH) && F(AC_ACCESS_WRITE))
		return false;

	/*
	 * Since the test's code page is converted to a supervisor-only page
	 * when cr4.smep=1, we can't switch to user mode.
	 */
	if (F(AC_ACCESS_USER) && F(AC_CPU_CR4_SMEP))
		return false;

	/*
	 * Only test protection key faults if CR4.PKE=1.
	 */
	if (!F(AC_CPU_CR4_PKE) &&
		(F(AC_PKU_AD) || F(AC_PKU_WD))) {
		return false;
	}

	/*
	 * pde.bit13 checks handling of reserved bits in largepage PDEs.  It is
	 * meaningless if there is a PTE.
	 */
	if (!F(AC_PDE_PSE) && F(AC_PDE_BIT13))
		return false;

	/*
	 * Skip testing multiple reserved bits at a time to shorten the test.
	 * Reserved bit page faults are terminal and multiple reserved bits do
	 * not affect the error code; the odds of a KVM bug are super low, and
	 * the odds of actually being able to detect a bug are even lower.
	 */
	reserved = (AC_PDE_BIT51_MASK | AC_PDE_BIT36_MASK | AC_PDE_BIT13_MASK |
		   AC_PTE_BIT51_MASK | AC_PTE_BIT36_MASK);
	if (!F(AC_CPU_EFER_NX))
		reserved |= AC_PDE_NX_MASK | AC_PTE_NX_MASK;

	/* Only test one reserved bit at a time.  */
	reserved &= flags;
	if (reserved & (reserved - 1))
		return false;

	return true;
}

static int ac_test_bump(ac_test_t *at)
{
	int ret;

	do {
		ret = ac_test_bump_one(at);
	} while (ret && !ac_test_legal(at));

	return ret;
}

static pt_element_t ac_test_permissions(ac_test_t *at, unsigned flags,
					bool writable, bool user,
					bool executable)
{
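	/*
	 * With CR0.WP=0, supervisor writes ignore the Writable bit, i.e. a
	 * supervisor write to a read-only page still succeeds.
	 */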
	bool kwritable = !F(AC_CPU_CR0_WP) && !F(AC_ACCESS_USER);
	pt_element_t expected = 0;

	if (F(AC_ACCESS_USER) && !user)
		at->expected_fault = 1;

	if (F(AC_ACCESS_WRITE) && !writable && !kwritable)
		at->expected_fault = 1;

	if (F(AC_ACCESS_FETCH) && !executable)
		at->expected_fault = 1;

	if (F(AC_ACCESS_FETCH) && user && F(AC_CPU_CR4_SMEP))
		at->expected_fault = 1;

	if (user && !F(AC_ACCESS_FETCH) && F(AC_PKU_PKEY) && F(AC_CPU_CR4_PKE)) {
		if (F(AC_PKU_AD)) {
			at->expected_fault = 1;
			at->expected_error |= PFERR_PK_MASK;
		} else if (F(AC_ACCESS_WRITE) && F(AC_PKU_WD) && !kwritable) {
			at->expected_fault = 1;
			at->expected_error |= PFERR_PK_MASK;
		}
	}

	if (!at->expected_fault) {
		expected |= PT_ACCESSED_MASK;
		if (F(AC_ACCESS_WRITE))
			expected |= PT_DIRTY_MASK;
	}

	return expected;
}

static void ac_emulate_access(ac_test_t *at, unsigned flags)
{
	bool pde_valid, pte_valid;
	bool user, writable, executable;

	if (F(AC_ACCESS_USER))
		at->expected_error |= PFERR_USER_MASK;

	if (F(AC_ACCESS_WRITE))
		at->expected_error |= PFERR_WRITE_MASK;

	if (F(AC_ACCESS_FETCH))
		at->expected_error |= PFERR_FETCH_MASK;

	if (!F(AC_PDE_ACCESSED))
		at->ignore_pde = PT_ACCESSED_MASK;

	pde_valid = F(AC_PDE_PRESENT)
		&& !F(AC_PDE_BIT51) && !F(AC_PDE_BIT36) && !F(AC_PDE_BIT13)
		&& !(F(AC_PDE_NX) && !F(AC_CPU_EFER_NX));

	if (!pde_valid) {
		at->expected_fault = 1;
		if (F(AC_PDE_PRESENT)) {
			at->expected_error |= PFERR_RESERVED_MASK;
		} else {
			at->expected_error &= ~PFERR_PRESENT_MASK;
		}
		goto fault;
	}

	writable = !F(AC_PDPTE_NO_WRITABLE) && F(AC_PDE_WRITABLE);
	user = F(AC_PDE_USER);
	executable = !F(AC_PDE_NX);

	if (F(AC_PDE_PSE)) {
		at->expected_pde |= ac_test_permissions(at, flags, writable,
							user, executable);
		goto no_pte;
	}

	at->expected_pde |= PT_ACCESSED_MASK;

	pte_valid = F(AC_PTE_PRESENT)
		    && !F(AC_PTE_BIT51) && !F(AC_PTE_BIT36)
		    && !(F(AC_PTE_NX) && !F(AC_CPU_EFER_NX));

	if (!pte_valid) {
		at->expected_fault = 1;
		if (F(AC_PTE_PRESENT)) {
			at->expected_error |= PFERR_RESERVED_MASK;
		} else {
			at->expected_error &= ~PFERR_PRESENT_MASK;
		}
		goto fault;
	}

	writable &= F(AC_PTE_WRITABLE);
	user &= F(AC_PTE_USER);
	executable &= !F(AC_PTE_NX);

	at->expected_pte |= ac_test_permissions(at, flags, writable, user,
						executable);

no_pte:
fault:
	if (!at->expected_fault)
		at->ignore_pde = 0;
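	/*
	 * Architecturally, the FETCH bit is reported in the error code only
	 * when NX is enabled (EFER.NX=1) or supervisor fetches can fault on
	 * their own (CR4.SMEP=1).
	 */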
	if (!F(AC_CPU_EFER_NX) && !F(AC_CPU_CR4_SMEP))
		at->expected_error &= ~PFERR_FETCH_MASK;
}

static void __ac_set_expected_status(ac_test_t *at, bool flush)
{
	if (flush)
		invlpg(at->virt);

	if (at->ptep)
		at->expected_pte = *at->ptep;
	at->expected_pde = *at->pdep;
	at->ignore_pde = 0;
	at->expected_fault = 0;
	at->expected_error = PFERR_PRESENT_MASK;

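	/*
	 * For "twice" tests, emulate the first access (a plain supervisor
	 * read) so the expected A/D bits account for it, then reset the
	 * fault expectations so they reflect only the second, real access.
	 */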
	if (at->flags & AC_ACCESS_TWICE_MASK) {
		ac_emulate_access(at, at->flags &
				  ~AC_ACCESS_WRITE_MASK &
				  ~AC_ACCESS_FETCH_MASK &
				  ~AC_ACCESS_USER_MASK);
		at->expected_fault = 0;
		at->expected_error = PFERR_PRESENT_MASK;
		at->ignore_pde = 0;
	}

	ac_emulate_access(at, at->flags);
}

static void ac_set_expected_status(ac_test_t *at)
{
	__ac_set_expected_status(at, true);
}

static pt_element_t ac_get_pt(ac_test_t *at, int i, pt_element_t *ptep)
{
	pt_element_t pte;

	pte = *ptep;
	if (pte && !(pte & PT_PAGE_SIZE_MASK) &&
	    (pte & PT_BASE_ADDR_MASK) != at->page_tables[i]) {
		printf("\nPT collision.  VA = 0x%lx, level = %d, index = %ld, found PT = 0x%lx, want PT = 0x%lx\n",
			(unsigned long)at->virt, i,
			PT_INDEX((unsigned long)at->virt, i),
			pte, at->page_tables[i]);
		abort();
	}

	/*
	 * Preserve A/D bits to avoid writing upper level PTEs,
	 * which cannot be unsync'd when KVM uses shadow paging.
	 */
	pte = at->page_tables[i] | (pte & (PT_DIRTY_MASK | PT_ACCESSED_MASK));
	return pte;
}

static void ac_test_setup_ptes(ac_test_t *at)
{
	unsigned long parent_pte = shadow_cr3;
	int flags = at->flags;
	int i;

	at->ptep = 0;
	for (i = at->pt_levels; i >= 1 && (i >= 2 || !F(AC_PDE_PSE)); --i) {
		pt_element_t *parent_pt = va(parent_pte & PT_BASE_ADDR_MASK);
		unsigned index = PT_INDEX((unsigned long)at->virt, i);
		pt_element_t *ptep = &parent_pt[index];
		pt_element_t pte;

		switch (i) {
		case 5:
		case 4:
			pte = ac_get_pt(at, i, ptep);
			pte |= PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
			break;
		case 3:
			pte = ac_get_pt(at, i, ptep);
			pte |= PT_PRESENT_MASK | PT_USER_MASK;
			if (!F(AC_PDPTE_NO_WRITABLE))
				pte |= PT_WRITABLE_MASK;
			break;
		case 2:
			if (!F(AC_PDE_PSE)) {
				pte = ac_get_pt(at, i, ptep);

				/* The protection key is ignored on non-leaf entries.  */
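				/* pkey lives in PTE bits 62:59; use key 2 here vs. key 1 on the leaf. */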
				if (F(AC_PKU_PKEY))
					pte |= 2ull << 59;
			} else {
				pte = at->phys & PT_PSE_BASE_ADDR_MASK;
				pte |= PT_PAGE_SIZE_MASK;
				if (F(AC_PKU_PKEY))
					pte |= 1ull << 59;
			}
			if (F(AC_PDE_PRESENT))
				pte |= PT_PRESENT_MASK;
			if (F(AC_PDE_WRITABLE))
				pte |= PT_WRITABLE_MASK;
			if (F(AC_PDE_USER))
				pte |= PT_USER_MASK;
			if (F(AC_PDE_ACCESSED))
				pte |= PT_ACCESSED_MASK;
			if (F(AC_PDE_DIRTY))
				pte |= PT_DIRTY_MASK;
			if (F(AC_PDE_NX))
				pte |= PT64_NX_MASK;
			if (F(AC_PDE_BIT51))
				pte |= 1ull << 51;
			if (F(AC_PDE_BIT36))
				pte |= 1ull << 36;
			if (F(AC_PDE_BIT13))
				pte |= 1ull << 13;
			at->pdep = ptep;
			break;
		case 1:
			pte = at->phys & PT_BASE_ADDR_MASK;
			if (F(AC_PKU_PKEY))
				pte |= 1ull << 59;
			if (F(AC_PTE_PRESENT))
				pte |= PT_PRESENT_MASK;
			if (F(AC_PTE_WRITABLE))
				pte |= PT_WRITABLE_MASK;
			if (F(AC_PTE_USER))
				pte |= PT_USER_MASK;
			if (F(AC_PTE_ACCESSED))
				pte |= PT_ACCESSED_MASK;
			if (F(AC_PTE_DIRTY))
				pte |= PT_DIRTY_MASK;
			if (F(AC_PTE_NX))
				pte |= PT64_NX_MASK;
			if (F(AC_PTE_BIT51))
				pte |= 1ull << 51;
			if (F(AC_PTE_BIT36))
				pte |= 1ull << 36;
			at->ptep = ptep;
			break;
		default:
			assert(0);
		}

		if (pte != *ptep)
			*ptep = pte;

		parent_pte = pte;
	}
	ac_set_expected_status(at);
}

static void __dump_pte(pt_element_t *ptep, int level, unsigned long virt)
{
	printf("------L%d I%lu: %lx\n", level, PT_INDEX(virt, level), *ptep);
}

static void dump_mapping(ac_test_t *at)
{
	unsigned long virt = (unsigned long)at->virt;
	int flags = at->flags;

	printf("Dump mapping: address: %p\n", at->virt);
	walk_va(at, F(AC_PDE_PSE) ? 2 : 1, virt, __dump_pte, false);
}

static void ac_test_check(ac_test_t *at, bool *success_ret, bool cond,
			  const char *fmt, ...)
{
	va_list ap;
	char buf[500];

	if (!*success_ret) {
		return;
	}

	if (!cond) {
		return;
	}

	*success_ret = false;

	if (!verbose) {
		puts("\n");
		ac_test_show(at);
	}

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	printf("FAIL: %s\n", buf);
	dump_mapping(at);
}

static int pt_match(pt_element_t pte1, pt_element_t pte2, pt_element_t ignore)
{
	pte1 &= ~ignore;
	pte2 &= ~ignore;
	return pte1 == pte2;
}

static int ac_test_do_access(ac_test_t *at)
{
	static unsigned unique = 42;
	int fault = 0;
	unsigned e;
	static unsigned char user_stack[4096];
	unsigned long rsp;
	bool success = true;
	int flags = at->flags;

	++unique;
	if (!(unique & 65535)) {
		puts(".");
	}

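	/* Plant a RET at the target page so fetch tests can safely call it. */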
	*((unsigned char *)at->phys) = 0xc3; /* ret */

	unsigned r = unique;
	set_cr0_wp(F(AC_CPU_CR0_WP));
	set_efer_nx(F(AC_CPU_EFER_NX));
	set_cr4_pke(F(AC_CPU_CR4_PKE));
	if (F(AC_CPU_CR4_PKE)) {
		/* WD2=AD2=1, WD1=F(AC_PKU_WD), AD1=F(AC_PKU_AD) */
		write_pkru(0x30 | (F(AC_PKU_WD) ? 8 : 0) |
			   (F(AC_PKU_AD) ? 4 : 0));
	}

	set_cr4_smep(at, F(AC_CPU_CR4_SMEP));

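	/*
	 * For "twice" tests, prime the TLB and A/D bits with a plain read
	 * first; any fault from this warm-up access is deliberately ignored.
	 */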
	if (F(AC_ACCESS_TWICE)) {
		asm volatile ("mov $fixed2, %%rsi \n\t"
			      "cmp $0, %[fep] \n\t"
			      "jz 1f \n\t"
			      KVM_FEP
			      "1: mov (%[addr]), %[reg] \n\t"
			      "fixed2:"
			      : [reg]"=r"(r), [fault]"=a"(fault), "=b"(e)
			      : [addr]"r"(at->virt), [fep]"r"(F(AC_FEP))
			      : "rsi");
		fault = 0;
	}

	asm volatile ("mov $fixed1, %%rsi \n\t"
		      "mov %%rsp, %[rsp0] \n\t"
		      "cmp $0, %[user] \n\t"
		      "jz do_access \n\t"
		      "push %%rax; mov %[user_ds], %%ax; mov %%ax, %%ds; pop %%rax \n\t"
		      "pushq %[user_ds] \n\t"
		      "pushq %[user_stack_top] \n\t"
		      "pushfq \n\t"
		      "pushq %[user_cs] \n\t"
		      "pushq $do_access \n\t"
		      "iretq \n"
		      "do_access: \n\t"
		      "cmp $0, %[fetch] \n\t"
		      "jnz 2f \n\t"
		      "cmp $0, %[write] \n\t"
		      "jnz 1f \n\t"
		      "cmp $0, %[fep] \n\t"
		      "jz 0f \n\t"
		      KVM_FEP
		      "0: mov (%[addr]), %[reg] \n\t"
		      "jmp done \n\t"
		      "1: cmp $0, %[fep] \n\t"
		      "jz 0f \n\t"
		      KVM_FEP
		      "0: mov %[reg], (%[addr]) \n\t"
		      "jmp done \n\t"
		      "2: call *%[addr] \n\t"
		      "done: \n"
		      "fixed1: \n"
		      "int %[kernel_entry_vector] \n\t"
		      ".section .text.entry \n\t"
		      "kernel_entry: \n\t"
		      "mov %[rsp0], %%rsp \n\t"
		      "jmp back_to_kernel \n\t"
		      ".section .text \n\t"
		      "back_to_kernel:"
		      : [reg]"+r"(r), "+a"(fault), "=b"(e), "=&d"(rsp),
			[rsp0]"=m"(tss[0].rsp0)
		      : [addr]"r"(at->virt),
			[write]"r"(F(AC_ACCESS_WRITE)),
			[user]"r"(F(AC_ACCESS_USER)),
			[fetch]"r"(F(AC_ACCESS_FETCH)),
			[fep]"r"(F(AC_FEP)),
			[user_ds]"i"(USER_DS),
			[user_cs]"i"(USER_CS),
			[user_stack_top]"r"(user_stack + sizeof user_stack),
			[kernel_entry_vector]"i"(0x20)
		      : "rsi");

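	/*
	 * #PF handler: pop the error code into %rbx, replace the saved %rip
	 * with the fixup target stashed in %rsi (fixed1/fixed2 above), and
	 * set %eax to 1 to flag that a fault occurred.
	 */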
	asm volatile (".section .text.pf \n\t"
		      "page_fault: \n\t"
		      "pop %rbx \n\t"
		      "mov %rsi, (%rsp) \n\t"
		      "movl $1, %eax \n\t"
		      "iretq \n\t"
		      ".section .text");

	ac_test_check(at, &success, fault && !at->expected_fault,
		      "unexpected fault");
	ac_test_check(at, &success, !fault && at->expected_fault,
		      "unexpected access");
	ac_test_check(at, &success, fault && e != at->expected_error,
		      "error code %x expected %x", e, at->expected_error);
	if (at->ptep)
		ac_test_check(at, &success, *at->ptep != at->expected_pte,
			      "pte %lx expected %lx", *at->ptep, at->expected_pte);
	ac_test_check(at, &success,
		      !pt_match(*at->pdep, at->expected_pde, at->ignore_pde),
		      "pde %lx expected %lx", *at->pdep, at->expected_pde);

	if (success && verbose) {
		if (at->expected_fault) {
			printf("PASS (%x)\n", at->expected_error);
		} else {
			printf("PASS\n");
		}
	}
	return success;
}

static void ac_test_show(ac_test_t *at)
{
	char line[5000];

	*line = 0;
	strcat(line, "test");
	for (int i = 0; i < NR_AC_FLAGS; ++i)
		if (at->flags & (1 << i)) {
			strcat(line, " ");
			strcat(line, ac_names[i]);
		}

	strcat(line, ": ");
	printf("%s", line);
}

/*
 * This test case is used to trigger the bug which is fixed by
 * commit e09e90a5 in the kvm tree.
 */
static int corrupt_hugepage_trigger(ac_pt_env_t *pt_env)
{
	ac_test_t at1, at2;

	ac_test_init(&at1, 0xffff923400000000ul, pt_env);
	__ac_test_init(&at2, 0xffffe66600000000ul, pt_env, &at1);

	at2.flags = AC_CPU_CR0_WP_MASK | AC_PDE_PSE_MASK | AC_PDE_PRESENT_MASK;
	ac_test_setup_ptes(&at2);
	if (!ac_test_do_access(&at2))
		goto err;

	at1.flags = at2.flags | AC_PDE_WRITABLE_MASK;
	ac_test_setup_ptes(&at1);
	if (!ac_test_do_access(&at1))
		goto err;

	at1.flags |= AC_ACCESS_WRITE_MASK;
	ac_set_expected_status(&at1);
	if (!ac_test_do_access(&at1))
		goto err;

	at2.flags |= AC_ACCESS_WRITE_MASK;
	ac_set_expected_status(&at2);
	if (!ac_test_do_access(&at2))
		goto err;

	return 1;

err:
	printf("corrupt_hugepage_trigger test fail\n");
	return 0;
}

/*
 * This test case is used to trigger the bug which is fixed by
 * commit 3ddf6c06e13e in the kvm tree.
 */
static int check_pfec_on_prefetch_pte(ac_pt_env_t *pt_env)
{
	ac_test_t at1, at2;

	ac_test_init(&at1, 0xffff923406001000ul, pt_env);
	__ac_test_init(&at2, 0xffff923406003000ul, pt_env, &at1);

	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK;
	ac_test_setup_ptes(&at1);

	at2.flags = at1.flags | AC_PTE_NX_MASK;
	ac_test_setup_ptes(&at2);

	if (!ac_test_do_access(&at1)) {
		printf("%s: prepare fail\n", __FUNCTION__);
		goto err;
	}

	if (!ac_test_do_access(&at2)) {
		printf("%s: check PFEC on prefetch pte path fail\n",
		       __FUNCTION__);
		goto err;
	}

	return 1;

err:
	return 0;
}

/*
 * If a write-fault access comes from supervisor mode and CR0.WP is clear on
 * the vcpu, kvm fixes it up by adjusting the pte access - it sets the W bit
 * on the pte and clears the U bit.  This is the one chance for kvm to change
 * pte access from readonly to writable.
 *
 * Unfortunately, the pte access is the access of the 'direct' shadow page
 * table, i.e. direct sp.role.access = pte_access, so we end up creating a
 * writable spte entry in a readonly shadow page table.  As a result, the
 * Dirty bit is not tracked when two guest ptes point to the same large page.
 * Note, this has no impact other than the Dirty bit since cr0.wp is encoded
 * into sp.role.
 *
 * Note: to trigger this bug, hugepages must be disabled on the host.
 */
static int check_large_pte_dirty_for_nowp(ac_pt_env_t *pt_env)
{
	ac_test_t at1, at2;

	ac_test_init(&at1, 0xffff923403000000ul, pt_env);
	__ac_test_init(&at2, 0xffffe66606000000ul, pt_env, &at1);

	at2.flags = AC_PDE_PRESENT_MASK | AC_PDE_PSE_MASK;
	ac_test_setup_ptes(&at2);
	if (!ac_test_do_access(&at2)) {
		printf("%s: read on the first mapping fail.\n", __FUNCTION__);
		goto err;
	}

	at1.flags = at2.flags | AC_ACCESS_WRITE_MASK;
	ac_test_setup_ptes(&at1);
	if (!ac_test_do_access(&at1)) {
		printf("%s: write on the second mapping fail.\n", __FUNCTION__);
		goto err;
	}

	at2.flags |= AC_ACCESS_WRITE_MASK;
	ac_set_expected_status(&at2);
	if (!ac_test_do_access(&at2)) {
		printf("%s: write on the first mapping fail.\n", __FUNCTION__);
		goto err;
	}

	return 1;

err:
	return 0;
}

static int check_smep_andnot_wp(ac_pt_env_t *pt_env)
{
	ac_test_t at1;
	int err_prepare_andnot_wp, err_smep_andnot_wp;

	if (!this_cpu_has(X86_FEATURE_SMEP)) {
		return 1;
	}

	ac_test_init(&at1, 0xffff923406001000ul, pt_env);

	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK |
		    AC_PDE_USER_MASK | AC_PTE_USER_MASK |
		    AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK |
		    AC_CPU_CR4_SMEP_MASK |
		    AC_CPU_CR0_WP_MASK |
		    AC_ACCESS_WRITE_MASK;
	ac_test_setup_ptes(&at1);

	/*
	 * Write the read-only user page with cr0.wp=0, then execute from
	 * it; the fetch should trigger a SMEP fault.
	 */
	err_prepare_andnot_wp = ac_test_do_access(&at1);
	if (!err_prepare_andnot_wp) {
		printf("%s: SMEP prepare fail\n", __FUNCTION__);
		goto clean_up;
	}

	at1.flags &= ~AC_ACCESS_WRITE_MASK;
	at1.flags |= AC_ACCESS_FETCH_MASK;
	ac_set_expected_status(&at1);
	err_smep_andnot_wp = ac_test_do_access(&at1);

clean_up:
	set_cr4_smep(&at1, 0);

	if (!err_prepare_andnot_wp)
		goto err;
	if (!err_smep_andnot_wp) {
		printf("%s: check SMEP without wp fail\n", __FUNCTION__);
		goto err;
	}
	return 1;

err:
	return 0;
}

#define TOGGLE_CR0_WP_TEST_BASE_FLAGS \
	(AC_PDE_PRESENT_MASK | AC_PDE_ACCESSED_MASK | \
	 AC_PTE_PRESENT_MASK | AC_PTE_ACCESSED_MASK | \
	 AC_ACCESS_WRITE_MASK)

static int do_cr0_wp_access(ac_test_t *at, int flags)
{
	const bool cr0_wp = !!(flags & AC_CPU_CR0_WP_MASK);

	at->flags = TOGGLE_CR0_WP_TEST_BASE_FLAGS | flags;
	__ac_set_expected_status(at, false);

	/*
	 * Under VMX the guest might own the CR0.WP bit, requiring KVM to
	 * manually keep track of it where needed, e.g. in the guest page
	 * table walker.
	 *
	 * Load CR0.WP with the inverse value of what will be used during
	 * the access test and toggle EFER.NX to coerce KVM into rebuilding
	 * the current MMU context based on the soon-to-be-stale CR0.WP.
	 */
	set_cr0_wp(!cr0_wp);
	set_efer_nx(1);
	set_efer_nx(0);

	if (!ac_test_do_access(at)) {
		printf("%s: %ssupervisor write with CR0.WP=%d did not %s\n",
		       __FUNCTION__, (flags & AC_FEP_MASK) ? "emulated " : "",
		       cr0_wp, cr0_wp ? "FAULT" : "SUCCEED");
		return 1;
	}

	return 0;
}

static int check_toggle_cr0_wp(ac_pt_env_t *pt_env)
{
	ac_test_t at;
	int err = 0;

	ac_test_init(&at, 0xffff923042007000ul, pt_env);
	at.flags = TOGGLE_CR0_WP_TEST_BASE_FLAGS;
	ac_test_setup_ptes(&at);

	err += do_cr0_wp_access(&at, 0);
	err += do_cr0_wp_access(&at, AC_CPU_CR0_WP_MASK);
	if (!(invalid_mask & AC_FEP_MASK)) {
		err += do_cr0_wp_access(&at, AC_FEP_MASK);
		err += do_cr0_wp_access(&at, AC_FEP_MASK | AC_CPU_CR0_WP_MASK);
	}

	return err == 0;
}

static int check_effective_sp_permissions(ac_pt_env_t *pt_env)
{
	unsigned long ptr1 = 0xffff923480000000;
	unsigned long ptr2 = ptr1 + SZ_2M;
	unsigned long ptr3 = ptr1 + SZ_1G;
	unsigned long ptr4 = ptr3 + SZ_2M;
	ac_test_t at1, at2, at3, at4;
	int err_read_at1, err_write_at2;
	int err_read_at3, err_write_at4;

	/*
	 * pgd[]   pud[]        pmd[]            virtual address pointers
	 *                   /->pmd(u--)->pte1(uw-)->page1 <- ptr1 (u--)
	 *      /->pud1(uw-)--->pmd(uw-)->pte2(uw-)->page2 <- ptr2 (uw-)
	 * pgd-|
	 *      \->pud2(u--)--->pmd(u--)->pte1(uw-)->page1 <- ptr3 (u--)
	 *                   \->pmd(uw-)->pte2(uw-)->page2 <- ptr4 (u--)
	 * pud1 and pud2 point to the same pmd page.
	 */

	ac_test_init(&at1, ptr1, pt_env);
	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK |
		    AC_PDE_USER_MASK | AC_PTE_USER_MASK |
		    AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK |
		    AC_PTE_WRITABLE_MASK | AC_ACCESS_USER_MASK;
	ac_test_setup_ptes(&at1);

	__ac_test_init(&at2, ptr2, pt_env, &at1);
	at2.flags = at1.flags | AC_PDE_WRITABLE_MASK | AC_PTE_DIRTY_MASK | AC_ACCESS_WRITE_MASK;
	ac_test_setup_ptes(&at2);

	__ac_test_init(&at3, ptr3, pt_env, &at1);
	/* Override the PMD (1-based index 3) to point at ptr1's PMD. */
	at3.page_tables[3] = at1.page_tables[3];
	at3.flags = AC_PDPTE_NO_WRITABLE_MASK | at1.flags;
	ac_test_setup_ptes(&at3);

	/* Alias ptr2, only the PMD will differ; manually override the PMD. */
	__ac_test_init(&at4, ptr4, pt_env, &at2);
	at4.page_tables[3] = at1.page_tables[3];
	at4.flags = AC_PDPTE_NO_WRITABLE_MASK | at2.flags;
	ac_test_setup_ptes(&at4);

	err_read_at1 = ac_test_do_access(&at1);
	if (!err_read_at1) {
		printf("%s: read access at1 fail\n", __FUNCTION__);
		return 0;
	}

	err_write_at2 = ac_test_do_access(&at2);
	if (!err_write_at2) {
		printf("%s: write access at2 fail\n", __FUNCTION__);
		return 0;
	}

	err_read_at3 = ac_test_do_access(&at3);
	if (!err_read_at3) {
		printf("%s: read access at3 fail\n", __FUNCTION__);
		return 0;
	}

	err_write_at4 = ac_test_do_access(&at4);
	if (!err_write_at4) {
		printf("%s: write access at4 should fail\n", __FUNCTION__);
		return 0;
	}

	return 1;
}

static int ac_test_exec(ac_test_t *at, ac_pt_env_t *pt_env)
{
	int r;

	if (verbose) {
		ac_test_show(at);
	}
	ac_test_setup_ptes(at);
	r = ac_test_do_access(at);
	return r;
}

typedef int (*ac_test_fn)(ac_pt_env_t *pt_env);
const ac_test_fn ac_test_cases[] =
{
	corrupt_hugepage_trigger,
	check_pfec_on_prefetch_pte,
	check_large_pte_dirty_for_nowp,
	check_smep_andnot_wp,
	check_toggle_cr0_wp,
	check_effective_sp_permissions,
};

void ac_test_run(int pt_levels, bool force_emulation)
{
	ac_test_t at;
	ac_pt_env_t pt_env;
	int i, tests, successes;

	if (force_emulation && !is_fep_available()) {
		report_skip("Forced emulation prefix (FEP) not available");
		return;
	}

	printf("run\n");
	tests = successes = 0;

	shadow_cr0 = read_cr0();
	shadow_cr4 = read_cr4();
	shadow_cr3 = read_cr3();
	shadow_efer = rdmsr(MSR_EFER);

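	/*
	 * Bits 51 and 36 are usable as reserved-bit tests only when they lie
	 * at or above the CPU's MAXPHYADDR; otherwise they are legal PA bits
	 * and must be masked off.
	 */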
	if (cpuid_maxphyaddr() >= 52) {
		invalid_mask |= AC_PDE_BIT51_MASK;
		invalid_mask |= AC_PTE_BIT51_MASK;
	}
	if (cpuid_maxphyaddr() >= 37) {
		invalid_mask |= AC_PDE_BIT36_MASK;
		invalid_mask |= AC_PTE_BIT36_MASK;
	}

	if (!force_emulation)
		invalid_mask |= AC_FEP_MASK;

	ac_env_int(&pt_env, pt_levels);
	ac_test_init(&at, 0xffff923400000000ul, &pt_env);

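	/*
	 * Toggle CR4.PKE to sanity check the PKE path; note that disabling
	 * PKE first loads PKRU with 0xfffffffc, leaving only pkey 0 fully
	 * accessible.
	 */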
	if (this_cpu_has(X86_FEATURE_PKU)) {
		set_cr4_pke(1);
		set_cr4_pke(0);
		/* Now PKRU = 0xFFFFFFFC.  */
	} else {
		tests++;
		if (write_cr4_safe(shadow_cr4 | X86_CR4_PKE) == GP_VECTOR) {
			successes++;
			invalid_mask |= AC_PKU_AD_MASK;
			invalid_mask |= AC_PKU_WD_MASK;
			invalid_mask |= AC_PKU_PKEY_MASK;
			invalid_mask |= AC_CPU_CR4_PKE_MASK;
			printf("CR4.PKE not available, disabling PKE tests\n");
		} else {
			printf("Set PKE in CR4 - expect #GP: FAIL!\n");
			set_cr4_pke(0);
		}
	}

	if (!this_cpu_has(X86_FEATURE_SMEP)) {
		tests++;
		if (set_cr4_smep(&at, 1) == GP_VECTOR) {
			successes++;
			invalid_mask |= AC_CPU_CR4_SMEP_MASK;
			printf("CR4.SMEP not available, disabling SMEP tests\n");
		} else {
			printf("Set SMEP in CR4 - expect #GP: FAIL!\n");
			set_cr4_smep(&at, 0);
		}
	}

	/* Toggling LA57 in 64-bit mode (guaranteed for this test) is illegal. */
	if (this_cpu_has(X86_FEATURE_LA57)) {
		tests++;
		if (write_cr4_safe(shadow_cr4 ^ X86_CR4_LA57) == GP_VECTOR)
			successes++;

		/* Force a VM-Exit on KVM, which doesn't intercept LA57 itself. */
		tests++;
		if (write_cr4_safe(shadow_cr4 ^ (X86_CR4_LA57 | X86_CR4_PSE)) == GP_VECTOR)
			successes++;
	}

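	/* Main combinatorial run: execute every legal flag combination. */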
	do {
		++tests;
		successes += ac_test_exec(&at, &pt_env);
	} while (ac_test_bump(&at));

	for (i = 0; i < ARRAY_SIZE(ac_test_cases); i++) {
		ac_env_int(&pt_env, pt_levels);

		++tests;
		successes += ac_test_cases[i](&pt_env);
	}

	printf("\n%d tests, %d failures\n", tests, tests - successes);

	report(successes == tests, "%d-level paging tests%s", pt_levels,
	       force_emulation ? " (with forced emulation)" : "");
}