xref: /kvm-unit-tests/x86/access.c (revision b4e8c300aeef5bcdd329fc2b19c1a1b392280aca)
1 
2 #include "libcflat.h"
3 #include "desc.h"
4 #include "processor.h"
5 #include "asm/page.h"
6 #include "x86/vm.h"
7 
8 #define smp_id() 0
9 
10 #define true 1
11 #define false 0
12 
13 static _Bool verbose = false;
14 
15 typedef unsigned long pt_element_t;
16 static int invalid_mask;
17 static int page_table_levels;
18 
19 #define PT_BASE_ADDR_MASK ((pt_element_t)((((pt_element_t)1 << 36) - 1) & PAGE_MASK))
20 #define PT_PSE_BASE_ADDR_MASK (PT_BASE_ADDR_MASK & ~(1ull << 21))
21 
22 #define CR0_WP_MASK (1UL << 16)
23 #define CR4_SMEP_MASK (1UL << 20)
24 
25 #define PFERR_PRESENT_MASK (1U << 0)
26 #define PFERR_WRITE_MASK (1U << 1)
27 #define PFERR_USER_MASK (1U << 2)
28 #define PFERR_RESERVED_MASK (1U << 3)
29 #define PFERR_FETCH_MASK (1U << 4)
30 #define PFERR_PK_MASK (1U << 5)
31 
32 #define MSR_EFER 0xc0000080
33 #define EFER_NX_MASK		(1ull << 11)
34 
35 #define PT_INDEX(address, level)       \
36        ((address) >> (12 + ((level)-1) * 9)) & 511
37 
38 /*
39  * page table access check tests
40  */
41 
42 enum {
43     AC_PTE_PRESENT_BIT,
44     AC_PTE_WRITABLE_BIT,
45     AC_PTE_USER_BIT,
46     AC_PTE_ACCESSED_BIT,
47     AC_PTE_DIRTY_BIT,
48     AC_PTE_NX_BIT,
49     AC_PTE_BIT51_BIT,
50     AC_PTE_BIT36_BIT,
51 
52     AC_PDE_PRESENT_BIT,
53     AC_PDE_WRITABLE_BIT,
54     AC_PDE_USER_BIT,
55     AC_PDE_ACCESSED_BIT,
56     AC_PDE_DIRTY_BIT,
57     AC_PDE_PSE_BIT,
58     AC_PDE_NX_BIT,
59     AC_PDE_BIT51_BIT,
60     AC_PDE_BIT36_BIT,
61     AC_PDE_BIT13_BIT,
62 
63     /*
64      *  special test case to DISABLE writable bit on page directory
65      *  pointer table entry.
66      */
67     AC_PDPTE_NO_WRITABLE_BIT,
68 
69     AC_PKU_AD_BIT,
70     AC_PKU_WD_BIT,
71     AC_PKU_PKEY_BIT,
72 
73     AC_ACCESS_USER_BIT,
74     AC_ACCESS_WRITE_BIT,
75     AC_ACCESS_FETCH_BIT,
76     AC_ACCESS_TWICE_BIT,
77 
78     AC_CPU_EFER_NX_BIT,
79     AC_CPU_CR0_WP_BIT,
80     AC_CPU_CR4_SMEP_BIT,
81     AC_CPU_CR4_PKE_BIT,
82 
83     NR_AC_FLAGS
84 };
85 
86 #define AC_PTE_PRESENT_MASK   (1 << AC_PTE_PRESENT_BIT)
87 #define AC_PTE_WRITABLE_MASK  (1 << AC_PTE_WRITABLE_BIT)
88 #define AC_PTE_USER_MASK      (1 << AC_PTE_USER_BIT)
89 #define AC_PTE_ACCESSED_MASK  (1 << AC_PTE_ACCESSED_BIT)
90 #define AC_PTE_DIRTY_MASK     (1 << AC_PTE_DIRTY_BIT)
91 #define AC_PTE_NX_MASK        (1 << AC_PTE_NX_BIT)
92 #define AC_PTE_BIT51_MASK     (1 << AC_PTE_BIT51_BIT)
93 #define AC_PTE_BIT36_MASK     (1 << AC_PTE_BIT36_BIT)
94 
95 #define AC_PDE_PRESENT_MASK   (1 << AC_PDE_PRESENT_BIT)
96 #define AC_PDE_WRITABLE_MASK  (1 << AC_PDE_WRITABLE_BIT)
97 #define AC_PDE_USER_MASK      (1 << AC_PDE_USER_BIT)
98 #define AC_PDE_ACCESSED_MASK  (1 << AC_PDE_ACCESSED_BIT)
99 #define AC_PDE_DIRTY_MASK     (1 << AC_PDE_DIRTY_BIT)
100 #define AC_PDE_PSE_MASK       (1 << AC_PDE_PSE_BIT)
101 #define AC_PDE_NX_MASK        (1 << AC_PDE_NX_BIT)
102 #define AC_PDE_BIT51_MASK     (1 << AC_PDE_BIT51_BIT)
103 #define AC_PDE_BIT36_MASK     (1 << AC_PDE_BIT36_BIT)
104 #define AC_PDE_BIT13_MASK     (1 << AC_PDE_BIT13_BIT)
105 
106 #define AC_PDPTE_NO_WRITABLE_MASK  (1 << AC_PDPTE_NO_WRITABLE_BIT)
107 
108 #define AC_PKU_AD_MASK        (1 << AC_PKU_AD_BIT)
109 #define AC_PKU_WD_MASK        (1 << AC_PKU_WD_BIT)
110 #define AC_PKU_PKEY_MASK      (1 << AC_PKU_PKEY_BIT)
111 
112 #define AC_ACCESS_USER_MASK   (1 << AC_ACCESS_USER_BIT)
113 #define AC_ACCESS_WRITE_MASK  (1 << AC_ACCESS_WRITE_BIT)
114 #define AC_ACCESS_FETCH_MASK  (1 << AC_ACCESS_FETCH_BIT)
115 #define AC_ACCESS_TWICE_MASK  (1 << AC_ACCESS_TWICE_BIT)
116 
117 #define AC_CPU_EFER_NX_MASK   (1 << AC_CPU_EFER_NX_BIT)
118 #define AC_CPU_CR0_WP_MASK    (1 << AC_CPU_CR0_WP_BIT)
119 #define AC_CPU_CR4_SMEP_MASK  (1 << AC_CPU_CR4_SMEP_BIT)
120 #define AC_CPU_CR4_PKE_MASK   (1 << AC_CPU_CR4_PKE_BIT)
121 
122 const char *ac_names[] = {
123     [AC_PTE_PRESENT_BIT] = "pte.p",
124     [AC_PTE_ACCESSED_BIT] = "pte.a",
125     [AC_PTE_WRITABLE_BIT] = "pte.rw",
126     [AC_PTE_USER_BIT] = "pte.user",
127     [AC_PTE_DIRTY_BIT] = "pte.d",
128     [AC_PTE_NX_BIT] = "pte.nx",
129     [AC_PTE_BIT51_BIT] = "pte.51",
130     [AC_PTE_BIT36_BIT] = "pte.36",
131     [AC_PDE_PRESENT_BIT] = "pde.p",
132     [AC_PDE_ACCESSED_BIT] = "pde.a",
133     [AC_PDE_WRITABLE_BIT] = "pde.rw",
134     [AC_PDE_USER_BIT] = "pde.user",
135     [AC_PDE_DIRTY_BIT] = "pde.d",
136     [AC_PDE_PSE_BIT] = "pde.pse",
137     [AC_PDE_NX_BIT] = "pde.nx",
138     [AC_PDE_BIT51_BIT] = "pde.51",
139     [AC_PDE_BIT36_BIT] = "pde.36",
140     [AC_PDE_BIT13_BIT] = "pde.13",
141     [AC_PDPTE_NO_WRITABLE_BIT] = "pdpte.ro",
142     [AC_PKU_AD_BIT] = "pkru.ad",
143     [AC_PKU_WD_BIT] = "pkru.wd",
144     [AC_PKU_PKEY_BIT] = "pkey=1",
145     [AC_ACCESS_WRITE_BIT] = "write",
146     [AC_ACCESS_USER_BIT] = "user",
147     [AC_ACCESS_FETCH_BIT] = "fetch",
148     [AC_ACCESS_TWICE_BIT] = "twice",
149     [AC_CPU_EFER_NX_BIT] = "efer.nx",
150     [AC_CPU_CR0_WP_BIT] = "cr0.wp",
151     [AC_CPU_CR4_SMEP_BIT] = "cr4.smep",
152     [AC_CPU_CR4_PKE_BIT] = "cr4.pke",
153 };
154 
155 static inline void *va(pt_element_t phys)
156 {
157     return (void *)phys;
158 }
159 
160 typedef struct {
161     pt_element_t pt_pool;
162     unsigned pt_pool_size;
163     unsigned pt_pool_current;
164 } ac_pool_t;
165 
166 typedef struct {
167     unsigned flags;
168     void *virt;
169     pt_element_t phys;
170     pt_element_t *ptep;
171     pt_element_t expected_pte;
172     pt_element_t *pdep;
173     pt_element_t expected_pde;
174     pt_element_t ignore_pde;
175     int expected_fault;
176     unsigned expected_error;
177 } ac_test_t;
178 
179 typedef struct {
180     unsigned short limit;
181     unsigned long linear_addr;
182 } __attribute__((packed)) descriptor_table_t;
183 
184 
185 static void ac_test_show(ac_test_t *at);
186 
187 static unsigned long shadow_cr0;
188 static unsigned long shadow_cr4;
189 static unsigned long long shadow_efer;
190 
191 static void set_cr0_wp(int wp)
192 {
193     unsigned long cr0 = shadow_cr0;
194 
195     cr0 &= ~CR0_WP_MASK;
196     if (wp)
197 	cr0 |= CR0_WP_MASK;
198     if (cr0 != shadow_cr0) {
199         write_cr0(cr0);
200         shadow_cr0 = cr0;
201     }
202 }
203 
/*
 * Set or clear CR4.SMEP via a checked write.  Returns the exception vector
 * raised by the CR4 write, or 0 on success.  The shadow copy avoids
 * redundant CR4 writes.
 */
static unsigned set_cr4_smep(int smep)
{
    unsigned long cr4 = shadow_cr4;
    extern u64 ptl2[];
    unsigned r;

    cr4 &= ~CR4_SMEP_MASK;
    if (smep)
	cr4 |= CR4_SMEP_MASK;
    if (cr4 == shadow_cr4)
        return 0;

    /*
     * Clear U/S on the 2MB mapping at VA 2<<21 while SMEP is enabled —
     * presumably the region holding the running test code, so supervisor
     * fetches keep working (TODO confirm against the linker map).
     */
    if (smep)
        ptl2[2] &= ~PT_USER_MASK;
    r = write_cr4_checking(cr4);
    /* Restore the user bit if the write faulted or SMEP was turned off. */
    if (r || !smep) {
        ptl2[2] |= PT_USER_MASK;

	/* Flush to avoid spurious #PF */
	invlpg((void *)(2 << 21));
    }
    if (!r)
        shadow_cr4 = cr4;
    return r;
}
229 
230 static void set_cr4_pke(int pke)
231 {
232     unsigned long cr4 = shadow_cr4;
233 
234     cr4 &= ~X86_CR4_PKE;
235     if (pke)
236 	cr4 |= X86_CR4_PKE;
237     if (cr4 == shadow_cr4)
238         return;
239 
240     /* Check that protection keys do not affect accesses when CR4.PKE=0.  */
241     if ((shadow_cr4 & X86_CR4_PKE) && !pke)
242         write_pkru(0xfffffffc);
243     write_cr4(cr4);
244     shadow_cr4 = cr4;
245 }
246 
247 static void set_efer_nx(int nx)
248 {
249     unsigned long long efer = shadow_efer;
250 
251     efer &= ~EFER_NX_MASK;
252     if (nx)
253 	efer |= EFER_NX_MASK;
254     if (efer != shadow_efer) {
255         wrmsr(MSR_EFER, efer);
256         shadow_efer = efer;
257     }
258 }
259 
260 static void ac_env_int(ac_pool_t *pool)
261 {
262     extern char page_fault, kernel_entry;
263     set_idt_entry(14, &page_fault, 0);
264     set_idt_entry(0x20, &kernel_entry, 3);
265 
266     pool->pt_pool = 33 * 1024 * 1024;
267     pool->pt_pool_size = 120 * 1024 * 1024 - pool->pt_pool;
268     pool->pt_pool_current = 0;
269 }
270 
271 static void ac_test_init(ac_test_t *at, void *virt)
272 {
273     set_efer_nx(1);
274     set_cr0_wp(1);
275     at->flags = 0;
276     at->virt = virt;
277     at->phys = 32 * 1024 * 1024;
278 }
279 
280 static int ac_test_bump_one(ac_test_t *at)
281 {
282     at->flags = ((at->flags | invalid_mask) + 1) & ~invalid_mask;
283     return at->flags < (1 << NR_AC_FLAGS);
284 }
285 
286 #define F(x)  ((flags & x##_MASK) != 0)
287 
288 static _Bool ac_test_legal(ac_test_t *at)
289 {
290     int flags = at->flags;
291     unsigned reserved;
292 
293     if (F(AC_ACCESS_FETCH) && F(AC_ACCESS_WRITE))
294 	return false;
295 
296     /*
297      * Since we convert current page to kernel page when cr4.smep=1,
298      * we can't switch to user mode.
299      */
300     if (F(AC_ACCESS_USER) && F(AC_CPU_CR4_SMEP))
301 	return false;
302 
303     /*
304      * Only test protection key faults if CR4.PKE=1.
305      */
306     if (!F(AC_CPU_CR4_PKE) &&
307         (F(AC_PKU_AD) || F(AC_PKU_WD))) {
308 	return false;
309     }
310 
311     /*
312      * pde.bit13 checks handling of reserved bits in largepage PDEs.  It is
313      * meaningless if there is a PTE.
314      */
315     if (!F(AC_PDE_PSE) && F(AC_PDE_BIT13))
316         return false;
317 
318     /*
319      * Shorten the test by avoiding testing too many reserved bit combinations.
320      * Skip testing multiple reserved bits to shorten the test. Reserved bit
321      * page faults are terminal and multiple reserved bits do not affect the
322      * error code; the odds of a KVM bug are super low, and the odds of actually
323      * being able to detect a bug are even lower.
324      */
325     reserved = (AC_PDE_BIT51_MASK | AC_PDE_BIT36_MASK | AC_PDE_BIT13_MASK |
326 	        AC_PTE_BIT51_MASK | AC_PTE_BIT36_MASK);
327     if (!F(AC_CPU_EFER_NX))
328         reserved |= AC_PDE_NX_MASK | AC_PTE_NX_MASK;
329 
330     /* Only test one reserved bit at a time.  */
331     reserved &= flags;
332     if (reserved & (reserved - 1))
333         return false;
334 
335     return true;
336 }
337 
338 static int ac_test_bump(ac_test_t *at)
339 {
340     int ret;
341 
342     ret = ac_test_bump_one(at);
343     while (ret && !ac_test_legal(at))
344 	ret = ac_test_bump_one(at);
345     return ret;
346 }
347 
348 static pt_element_t ac_test_alloc_pt(ac_pool_t *pool)
349 {
350     pt_element_t ret = pool->pt_pool + pool->pt_pool_current;
351     pool->pt_pool_current += PAGE_SIZE;
352     memset(va(ret), 0, PAGE_SIZE);
353     return ret;
354 }
355 
356 static _Bool ac_test_enough_room(ac_pool_t *pool)
357 {
358     return pool->pt_pool_current + 5 * PAGE_SIZE <= pool->pt_pool_size;
359 }
360 
361 static void ac_test_reset_pt_pool(ac_pool_t *pool)
362 {
363     pool->pt_pool_current = 0;
364 }
365 
/*
 * Evaluate the access described by @flags against a final translation with
 * the given effective writable/user/executable permissions.  Sets
 * at->expected_fault and the PK bit of at->expected_error as appropriate,
 * and returns the A/D bits the hardware walk is expected to set in the
 * leaf entry.
 */
static pt_element_t ac_test_permissions(ac_test_t *at, unsigned flags,
                                        bool writable, bool user,
                                        bool executable)
{
    /* With CR0.WP=0, supervisor-mode writes ignore the W permission. */
    bool kwritable = !F(AC_CPU_CR0_WP) && !F(AC_ACCESS_USER);
    pt_element_t expected = 0;

    if (F(AC_ACCESS_USER) && !user)
	at->expected_fault = 1;

    if (F(AC_ACCESS_WRITE) && !writable && !kwritable)
	at->expected_fault = 1;

    if (F(AC_ACCESS_FETCH) && !executable)
	at->expected_fault = 1;

    /* SMEP: supervisor fetch from a user-accessible page faults. */
    if (F(AC_ACCESS_FETCH) && user && F(AC_CPU_CR4_SMEP))
        at->expected_fault = 1;

    /* Protection keys apply only to user-accessible pages on data access. */
    if (user && !F(AC_ACCESS_FETCH) && F(AC_PKU_PKEY) && F(AC_CPU_CR4_PKE)) {
        if (F(AC_PKU_AD)) {
            at->expected_fault = 1;
            at->expected_error |= PFERR_PK_MASK;
        } else if (F(AC_ACCESS_WRITE) && F(AC_PKU_WD) && !kwritable) {
            at->expected_fault = 1;
            at->expected_error |= PFERR_PK_MASK;
        }
    }

    /* A faulting access must leave both Accessed and Dirty untouched. */
    if (!at->expected_fault) {
        expected |= PT_ACCESSED_MASK;
        if (F(AC_ACCESS_WRITE))
            expected |= PT_DIRTY_MASK;
    }

    return expected;
}
403 
/*
 * Emulate the page walk for the access described by @flags, accumulating
 * the expected fault status, #PF error code and A/D-bit updates into @at.
 * Walks PDE first, then (unless PSE) the PTE, bailing out as soon as an
 * entry is not-present or has reserved bits set.
 */
static void ac_emulate_access(ac_test_t *at, unsigned flags)
{
    bool pde_valid, pte_valid;
    bool user, writable, executable;

    /* Error-code bits that describe the access itself. */
    if (F(AC_ACCESS_USER))
	at->expected_error |= PFERR_USER_MASK;

    if (F(AC_ACCESS_WRITE))
	at->expected_error |= PFERR_WRITE_MASK;

    if (F(AC_ACCESS_FETCH))
	at->expected_error |= PFERR_FETCH_MASK;

    /* If PDE.A started clear, don't require a particular final value. */
    if (!F(AC_PDE_ACCESSED))
        at->ignore_pde = PT_ACCESSED_MASK;

    /* PDE is valid when present with no reserved bits set. */
    pde_valid = F(AC_PDE_PRESENT)
        && !F(AC_PDE_BIT51) && !F(AC_PDE_BIT36) && !F(AC_PDE_BIT13)
        && !(F(AC_PDE_NX) && !F(AC_CPU_EFER_NX));

    if (!pde_valid) {
        at->expected_fault = 1;
	if (F(AC_PDE_PRESENT)) {
            /* Present but malformed: reserved-bit fault. */
            at->expected_error |= PFERR_RESERVED_MASK;
        } else {
            at->expected_error &= ~PFERR_PRESENT_MASK;
        }
	goto fault;
    }

    /* Effective permissions accumulated down the walk. */
    writable = !F(AC_PDPTE_NO_WRITABLE) && F(AC_PDE_WRITABLE);
    user = F(AC_PDE_USER);
    executable = !F(AC_PDE_NX);

    if (F(AC_PDE_PSE)) {
        /* Large page: the PDE is the leaf entry. */
        at->expected_pde |= ac_test_permissions(at, flags, writable, user,
                                                executable);
	goto no_pte;
    }

    /* A successful PDE lookup always sets Accessed on the PDE. */
    at->expected_pde |= PT_ACCESSED_MASK;

    pte_valid = F(AC_PTE_PRESENT)
        && !F(AC_PTE_BIT51) && !F(AC_PTE_BIT36)
        && !(F(AC_PTE_NX) && !F(AC_CPU_EFER_NX));

    if (!pte_valid) {
        at->expected_fault = 1;
	if (F(AC_PTE_PRESENT)) {
            at->expected_error |= PFERR_RESERVED_MASK;
        } else {
            at->expected_error &= ~PFERR_PRESENT_MASK;
        }
	goto fault;
    }

    /* Combine PTE permissions with those inherited from the PDE. */
    writable &= F(AC_PTE_WRITABLE);
    user &= F(AC_PTE_USER);
    executable &= !F(AC_PTE_NX);

    at->expected_pte |= ac_test_permissions(at, flags, writable, user,
                                            executable);

no_pte:
fault:
    /* On success the PDE must match exactly; stop ignoring any bits. */
    if (!at->expected_fault)
        at->ignore_pde = 0;
    if (!F(AC_CPU_EFER_NX) && !F(AC_CPU_CR4_SMEP))
        at->expected_error &= ~PFERR_FETCH_MASK;
}
475 
/*
 * Recompute the expected fault/error/entry state for @at from the current
 * page-table contents.  With AC_ACCESS_TWICE, a plain supervisor read is
 * emulated first so that its A/D-bit side effects are folded into the
 * expected entries before the real access is emulated.
 */
static void ac_set_expected_status(ac_test_t *at)
{
    /* Drop any stale TLB entry for the test address. */
    invlpg(at->virt);

    if (at->ptep)
	at->expected_pte = *at->ptep;
    at->expected_pde = *at->pdep;
    at->ignore_pde = 0;
    at->expected_fault = 0;
    at->expected_error = PFERR_PRESENT_MASK;

    if (at->flags & AC_ACCESS_TWICE_MASK) {
        /* First access: same flags minus write/fetch/user. */
        ac_emulate_access(at, at->flags & ~AC_ACCESS_WRITE_MASK
                          & ~AC_ACCESS_FETCH_MASK & ~AC_ACCESS_USER_MASK);
        /* Keep only the entry-state effects; reset the fault expectation. */
        at->expected_fault = 0;
	at->expected_error = PFERR_PRESENT_MASK;
        at->ignore_pde = 0;
    }

    ac_emulate_access(at, at->flags);
}
497 
/*
 * Build the page-table walk for at->virt according to at->flags.
 *
 * @reuse:   reuse table pages already installed at levels 2/1 instead of
 *           overwriting them (used to alias two walks onto shared tables).
 * @pd_page: if non-zero, physical page to use for the level-3 entry's
 *           target instead of allocating from the pool.
 * @pt_page: likewise for the level-2 entry's target (the page table).
 *
 * On return at->pdep/at->ptep point at the installed PDE/PTE and the
 * expected status has been recomputed.
 */
static void __ac_setup_specific_pages(ac_test_t *at, ac_pool_t *pool, bool reuse,
				      u64 pd_page, u64 pt_page)

{
    unsigned long root = read_cr3();
    int flags = at->flags;
    bool skip = true;

    if (!ac_test_enough_room(pool))
	ac_test_reset_pt_pool(pool);

    at->ptep = 0;
    /* Walk from the root level down; stop at level 2 for PSE mappings. */
    for (int i = page_table_levels; i >= 1 && (i >= 2 || !F(AC_PDE_PSE)); --i) {
	pt_element_t *vroot = va(root & PT_BASE_ADDR_MASK);
	unsigned index = PT_INDEX((unsigned long)at->virt, i);
	pt_element_t pte = 0;

	/*
	 * Reuse existing page tables along the path to the test code and data
	 * (which is in the bottom 2MB).
	 */
	if (skip && i >= 2 && index == 0) {
	    goto next;
	}
	skip = false;
	/* Caller asked to share already-installed lower-level tables. */
	if (reuse && vroot[index]) {
	    switch (i) {
	    case 2:
		at->pdep = &vroot[index];
		break;
	    case 1:
		at->ptep = &vroot[index];
		break;
	    }
	    goto next;
	}

	switch (i) {
	case 5:
	case 4:
	    /* Upper levels: always present, writable and user-accessible. */
	    pte = ac_test_alloc_pt(pool);
	    pte |= PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
	    break;
	case 3:
	    pte = pd_page ? pd_page : ac_test_alloc_pt(pool);
	    pte |= PT_PRESENT_MASK | PT_USER_MASK;
	    if (!F(AC_PDPTE_NO_WRITABLE))
		pte |= PT_WRITABLE_MASK;
	    break;
	case 2:
	    if (!F(AC_PDE_PSE)) {
		pte = pt_page ? pt_page : ac_test_alloc_pt(pool);
		/* The protection key is ignored on non-leaf entries.  */
                if (F(AC_PKU_PKEY))
                    pte |= 2ull << 59;
	    } else {
		/* Large page: the PDE maps at->phys directly. */
		pte = at->phys & PT_PSE_BASE_ADDR_MASK;
		pte |= PT_PAGE_SIZE_MASK;
                if (F(AC_PKU_PKEY))
                    pte |= 1ull << 59;
	    }
	    /* Apply every PDE attribute requested by the flags. */
	    if (F(AC_PDE_PRESENT))
		pte |= PT_PRESENT_MASK;
	    if (F(AC_PDE_WRITABLE))
		pte |= PT_WRITABLE_MASK;
	    if (F(AC_PDE_USER))
		pte |= PT_USER_MASK;
	    if (F(AC_PDE_ACCESSED))
		pte |= PT_ACCESSED_MASK;
	    if (F(AC_PDE_DIRTY))
		pte |= PT_DIRTY_MASK;
	    if (F(AC_PDE_NX))
		pte |= PT64_NX_MASK;
	    if (F(AC_PDE_BIT51))
		pte |= 1ull << 51;
	    if (F(AC_PDE_BIT36))
                pte |= 1ull << 36;
	    if (F(AC_PDE_BIT13))
		pte |= 1ull << 13;
	    at->pdep = &vroot[index];
	    break;
	case 1:
	    /* Leaf PTE mapping at->phys, with the requested attributes. */
	    pte = at->phys & PT_BASE_ADDR_MASK;
	    if (F(AC_PKU_PKEY))
		pte |= 1ull << 59;
	    if (F(AC_PTE_PRESENT))
		pte |= PT_PRESENT_MASK;
	    if (F(AC_PTE_WRITABLE))
		pte |= PT_WRITABLE_MASK;
	    if (F(AC_PTE_USER))
		pte |= PT_USER_MASK;
	    if (F(AC_PTE_ACCESSED))
		pte |= PT_ACCESSED_MASK;
	    if (F(AC_PTE_DIRTY))
		pte |= PT_DIRTY_MASK;
	    if (F(AC_PTE_NX))
		pte |= PT64_NX_MASK;
	    if (F(AC_PTE_BIT51))
		pte |= 1ull << 51;
	    if (F(AC_PTE_BIT36))
                pte |= 1ull << 36;
	    at->ptep = &vroot[index];
	    break;
	}
	vroot[index] = pte;
 next:
	root = vroot[index];
    }
    ac_set_expected_status(at);
}
608 
609 static void ac_test_setup_pte(ac_test_t *at, ac_pool_t *pool)
610 {
611 	__ac_setup_specific_pages(at, pool, false, 0, 0);
612 }
613 
614 static void ac_setup_specific_pages(ac_test_t *at, ac_pool_t *pool,
615 				    u64 pd_page, u64 pt_page)
616 {
617 	return __ac_setup_specific_pages(at, pool, false, pd_page, pt_page);
618 }
619 
620 static void dump_mapping(ac_test_t *at)
621 {
622 	unsigned long root = read_cr3();
623         int flags = at->flags;
624 	int i;
625 
626 	printf("Dump mapping: address: %p\n", at->virt);
627 	for (i = page_table_levels ; i >= 1 && (i >= 2 || !F(AC_PDE_PSE)); --i) {
628 		pt_element_t *vroot = va(root & PT_BASE_ADDR_MASK);
629 		unsigned index = PT_INDEX((unsigned long)at->virt, i);
630 		pt_element_t pte = vroot[index];
631 
632 		printf("------L%d: %lx\n", i, pte);
633 		root = vroot[index];
634 	}
635 }
636 
637 static void ac_test_check(ac_test_t *at, _Bool *success_ret, _Bool cond,
638                           const char *fmt, ...)
639 {
640     va_list ap;
641     char buf[500];
642 
643     if (!*success_ret) {
644         return;
645     }
646 
647     if (!cond) {
648         return;
649     }
650 
651     *success_ret = false;
652 
653     if (!verbose) {
654         puts("\n");
655         ac_test_show(at);
656     }
657 
658     va_start(ap, fmt);
659     vsnprintf(buf, sizeof(buf), fmt, ap);
660     va_end(ap);
661     printf("FAIL: %s\n", buf);
662     dump_mapping(at);
663 }
664 
665 static int pt_match(pt_element_t pte1, pt_element_t pte2, pt_element_t ignore)
666 {
667     pte1 &= ~ignore;
668     pte2 &= ~ignore;
669     return pte1 == pte2;
670 }
671 
672 static int ac_test_do_access(ac_test_t *at)
673 {
674     static unsigned unique = 42;
675     int fault = 0;
676     unsigned e;
677     static unsigned char user_stack[4096];
678     unsigned long rsp;
679     _Bool success = true;
680     int flags = at->flags;
681 
682     ++unique;
683     if (!(unique & 65535)) {
684         puts(".");
685     }
686 
687     *((unsigned char *)at->phys) = 0xc3; /* ret */
688 
689     unsigned r = unique;
690     set_cr0_wp(F(AC_CPU_CR0_WP));
691     set_efer_nx(F(AC_CPU_EFER_NX));
692     set_cr4_pke(F(AC_CPU_CR4_PKE));
693     if (F(AC_CPU_CR4_PKE)) {
694         /* WD2=AD2=1, WD1=F(AC_PKU_WD), AD1=F(AC_PKU_AD) */
695         write_pkru(0x30 | (F(AC_PKU_WD) ? 8 : 0) |
696                    (F(AC_PKU_AD) ? 4 : 0));
697     }
698 
699     set_cr4_smep(F(AC_CPU_CR4_SMEP));
700 
701     if (F(AC_ACCESS_TWICE)) {
702 	asm volatile (
703 	    "lea fixed2(%%rip), %%rsi \n\t"
704 	    "mov (%[addr]), %[reg] \n\t"
705 	    "fixed2:"
706 	    : [reg]"=r"(r), [fault]"=a"(fault), "=b"(e)
707 	    : [addr]"r"(at->virt)
708 	    : "rsi"
709 	    );
710 	fault = 0;
711     }
712 
713     asm volatile ("lea fixed1(%%rip), %%rsi \n\t"
714 		  "mov %%rsp, %[rsp0] \n\t"
715 		  "cmp $0, %[user] \n\t"
716 		  "jz do_access \n\t"
717 		  "push %%rax; mov %[user_ds], %%ax; mov %%ax, %%ds; pop %%rax  \n\t"
718 		  "pushq %[user_ds] \n\t"
719 		  "pushq %[user_stack_top] \n\t"
720 		  "pushfq \n\t"
721 		  "pushq %[user_cs] \n\t"
722 		  "lea do_access(%%rip), %%r8\n\t"
723 		  "pushq %%r8\n\t"
724 		  "iretq \n"
725 		  "do_access: \n\t"
726 		  "cmp $0, %[fetch] \n\t"
727 		  "jnz 2f \n\t"
728 		  "cmp $0, %[write] \n\t"
729 		  "jnz 1f \n\t"
730 		  "mov (%[addr]), %[reg] \n\t"
731 		  "jmp done \n\t"
732 		  "1: mov %[reg], (%[addr]) \n\t"
733 		  "jmp done \n\t"
734 		  "2: call *%[addr] \n\t"
735 		  "done: \n"
736 		  "fixed1: \n"
737 		  "int %[kernel_entry_vector] \n\t"
738 		  ".section .text.entry \n\t"
739 		  "kernel_entry: \n\t"
740 		  "mov %[rsp0], %%rsp \n\t"
741 		  "jmp back_to_kernel \n\t"
742 		  ".section .text \n\t"
743 		  "back_to_kernel:"
744 		  : [reg]"+r"(r), "+a"(fault), "=b"(e), "=&d"(rsp),
745 		    [rsp0]"=m"(tss[0].rsp0)
746 		  : [addr]"r"(at->virt),
747 		    [write]"r"(F(AC_ACCESS_WRITE)),
748 		    [user]"r"(F(AC_ACCESS_USER)),
749 		    [fetch]"r"(F(AC_ACCESS_FETCH)),
750 		    [user_ds]"i"(USER_DS),
751 		    [user_cs]"i"(USER_CS),
752 		    [user_stack_top]"r"(user_stack + sizeof user_stack),
753 		    [kernel_entry_vector]"i"(0x20)
754 		  : "rsi", "r8");
755 
756     asm volatile (".section .text.pf \n\t"
757 		  "page_fault: \n\t"
758 		  "pop %rbx \n\t"
759 		  "mov %rsi, (%rsp) \n\t"
760 		  "movl $1, %eax \n\t"
761 		  "iretq \n\t"
762 		  ".section .text");
763 
764     ac_test_check(at, &success, fault && !at->expected_fault,
765                   "unexpected fault");
766     ac_test_check(at, &success, !fault && at->expected_fault,
767                   "unexpected access");
768     ac_test_check(at, &success, fault && e != at->expected_error,
769                   "error code %x expected %x", e, at->expected_error);
770     if (at->ptep)
771         ac_test_check(at, &success, *at->ptep != at->expected_pte,
772                       "pte %x expected %x", *at->ptep, at->expected_pte);
773     ac_test_check(at, &success,
774                   !pt_match(*at->pdep, at->expected_pde, at->ignore_pde),
775                   "pde %x expected %x", *at->pdep, at->expected_pde);
776 
777     if (success && verbose) {
778 	if (at->expected_fault) {
779             printf("PASS (%x)\n", at->expected_error);
780 	} else {
781             printf("PASS\n");
782 	}
783     }
784     return success;
785 }
786 
787 static void ac_test_show(ac_test_t *at)
788 {
789     char line[5000];
790 
791     *line = 0;
792     strcat(line, "test");
793     for (int i = 0; i < NR_AC_FLAGS; ++i)
794 	if (at->flags & (1 << i)) {
795 	    strcat(line, " ");
796 	    strcat(line, ac_names[i]);
797 	}
798 
799     strcat(line, ": ");
800     printf("%s", line);
801 }
802 
/*
 * This test case is used to trigger the bug which is fixed by
 * commit e09e90a5 in the kvm tree
 */
807 static int corrupt_hugepage_triger(ac_pool_t *pool)
808 {
809     ac_test_t at1, at2;
810 
811     ac_test_init(&at1, (void *)(0x123400000000));
812     ac_test_init(&at2, (void *)(0x666600000000));
813 
814     at2.flags = AC_CPU_CR0_WP_MASK | AC_PDE_PSE_MASK | AC_PDE_PRESENT_MASK;
815     ac_test_setup_pte(&at2, pool);
816     if (!ac_test_do_access(&at2))
817         goto err;
818 
819     at1.flags = at2.flags | AC_PDE_WRITABLE_MASK;
820     ac_test_setup_pte(&at1, pool);
821     if (!ac_test_do_access(&at1))
822         goto err;
823 
824     at1.flags |= AC_ACCESS_WRITE_MASK;
825     ac_set_expected_status(&at1);
826     if (!ac_test_do_access(&at1))
827         goto err;
828 
829     at2.flags |= AC_ACCESS_WRITE_MASK;
830     ac_set_expected_status(&at2);
831     if (!ac_test_do_access(&at2))
832         goto err;
833 
834     return 1;
835 
836 err:
837     printf("corrupt_hugepage_triger test fail\n");
838     return 0;
839 }
840 
/*
 * This test case is used to trigger the bug which is fixed by
 * commit 3ddf6c06e13e in the kvm tree
 */
845 static int check_pfec_on_prefetch_pte(ac_pool_t *pool)
846 {
847 	ac_test_t at1, at2;
848 
849 	ac_test_init(&at1, (void *)(0x123406001000));
850 	ac_test_init(&at2, (void *)(0x123406003000));
851 
852 	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK;
853 	ac_setup_specific_pages(&at1, pool, 30 * 1024 * 1024, 30 * 1024 * 1024);
854 
855         at2.flags = at1.flags | AC_PTE_NX_MASK;
856 	ac_setup_specific_pages(&at2, pool, 30 * 1024 * 1024, 30 * 1024 * 1024);
857 
858 	if (!ac_test_do_access(&at1)) {
859 		printf("%s: prepare fail\n", __FUNCTION__);
860 		goto err;
861 	}
862 
863 	if (!ac_test_do_access(&at2)) {
864 		printf("%s: check PFEC on prefetch pte path fail\n",
865 			__FUNCTION__);
866 		goto err;
867 	}
868 
869 	return 1;
870 
871 err:
872     return 0;
873 }
874 
/*
 * If the write-fault access is from supervisor and CR0.WP is not set on the
 * vcpu, kvm will fix it by adjusting pte access - it sets the W bit on pte
 * and clears U bit. This is the chance that kvm can change pte access from
 * readonly to writable.
 *
 * Unfortunately, the pte access is the access of the 'direct' shadow page
 * table, meaning direct sp.role.access = pte_access, so we will create a
 * writable spte entry on the readonly shadow page table. It will cause the
 * Dirty bit not to be tracked when two guest ptes point to the same large
 * page. Note, it has no impact other than the Dirty bit since cr0.wp is
 * encoded into sp.role.
 *
 * Note: to trigger this bug, hugepage should be disabled on host.
 */
890 static int check_large_pte_dirty_for_nowp(ac_pool_t *pool)
891 {
892 	ac_test_t at1, at2;
893 
894 	ac_test_init(&at1, (void *)(0x123403000000));
895 	ac_test_init(&at2, (void *)(0x666606000000));
896 
897         at2.flags = AC_PDE_PRESENT_MASK | AC_PDE_PSE_MASK;
898 	ac_test_setup_pte(&at2, pool);
899 	if (!ac_test_do_access(&at2)) {
900 		printf("%s: read on the first mapping fail.\n", __FUNCTION__);
901 		goto err;
902 	}
903 
904         at1.flags = at2.flags | AC_ACCESS_WRITE_MASK;
905 	ac_test_setup_pte(&at1, pool);
906 	if (!ac_test_do_access(&at1)) {
907 		printf("%s: write on the second mapping fail.\n", __FUNCTION__);
908 		goto err;
909 	}
910 
911 	at2.flags |= AC_ACCESS_WRITE_MASK;
912 	ac_set_expected_status(&at2);
913 	if (!ac_test_do_access(&at2)) {
914 		printf("%s: write on the first mapping fail.\n", __FUNCTION__);
915 		goto err;
916 	}
917 
918 	return 1;
919 
920 err:
921 	return 0;
922 }
923 
/*
 * Regression test: a supervisor write with SMEP enabled must not leave a
 * mapping behind that later allows a supervisor fetch from the same
 * user-accessible page.  Skipped (reported as pass) when SMEP is absent.
 */
static int check_smep_andnot_wp(ac_pool_t *pool)
{
	ac_test_t at1;
	int err_prepare_andnot_wp, err_smep_andnot_wp;

	if (!this_cpu_has(X86_FEATURE_SMEP)) {
	    return 1;
	}

	ac_test_init(&at1, (void *)(0x123406001000));

	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK |
            AC_PDE_USER_MASK | AC_PTE_USER_MASK |
            AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK |
            AC_CPU_CR4_SMEP_MASK |
            AC_CPU_CR0_WP_MASK |
            AC_ACCESS_WRITE_MASK;
	ac_test_setup_pte(&at1, pool);

	/*
	 * Here we write the ro user page when
	 * cr0.wp=0, then we execute it and SMEP
	 * fault should happen.
	 *
	 * NOTE(review): the flags above set AC_CPU_CR0_WP (CR0.WP=1),
	 * which appears to contradict "cr0.wp=0" here — confirm against
	 * upstream history whether the comment is stale.
	 */
	err_prepare_andnot_wp = ac_test_do_access(&at1);
	if (!err_prepare_andnot_wp) {
		printf("%s: SMEP prepare fail\n", __FUNCTION__);
		goto clean_up;
	}

	/* Retry the same mapping as a fetch: SMEP must fault it. */
        at1.flags &= ~AC_ACCESS_WRITE_MASK;
        at1.flags |= AC_ACCESS_FETCH_MASK;
        ac_set_expected_status(&at1);
        err_smep_andnot_wp = ac_test_do_access(&at1);

clean_up:
	set_cr4_smep(0);

	if (!err_prepare_andnot_wp)
		goto err;
	if (!err_smep_andnot_wp) {
		printf("%s: check SMEP without wp fail\n", __FUNCTION__);
		goto err;
	}
	return 1;

err:
	return 0;
}
973 
/*
 * Verify that permissions are evaluated along each walk independently when
 * two PUD entries with different permissions share the same PMD page:
 * identical leaf entries must still yield different effective permissions.
 */
static int check_effective_sp_permissions(ac_pool_t *pool)
{
	unsigned long ptr1 = 0x123480000000;
	unsigned long ptr2 = ptr1 + SZ_2M;
	unsigned long ptr3 = ptr1 + SZ_1G;
	unsigned long ptr4 = ptr3 + SZ_2M;
	pt_element_t pmd = ac_test_alloc_pt(pool);
	ac_test_t at1, at2, at3, at4;
	int err_read_at1, err_write_at2;
	int err_read_at3, err_write_at4;

	/*
	 * pgd[]   pud[]        pmd[]            virtual address pointers
	 *                   /->pmd1(u--)->pte1(uw-)->page1 <- ptr1 (u--)
	 *      /->pud1(uw-)--->pmd2(uw-)->pte2(uw-)->page2 <- ptr2 (uw-)
	 * pgd-|           (shared pmd[] as above)
	 *      \->pud2(u--)--->pmd1(u--)->pte1(uw-)->page1 <- ptr3 (u--)
	 *                   \->pmd2(uw-)->pte2(uw-)->page2 <- ptr4 (u--)
	 * pud1 and pud2 point to the same pmd page.
	 */

	/* User read via pmd1 (non-writable PDE, writable PTE). */
	ac_test_init(&at1, (void *)(ptr1));
	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK |
		    AC_PDE_USER_MASK | AC_PTE_USER_MASK |
		    AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK |
		    AC_PTE_WRITABLE_MASK | AC_ACCESS_USER_MASK;
	__ac_setup_specific_pages(&at1, pool, false, pmd, 0);

	/* User write via pmd2 (fully writable path); shares the pmd page. */
	ac_test_init(&at2, (void *)(ptr2));
	at2.flags = at1.flags | AC_PDE_WRITABLE_MASK | AC_PTE_DIRTY_MASK | AC_ACCESS_WRITE_MASK;
	__ac_setup_specific_pages(&at2, pool, true, pmd, 0);

	/* Same pmd1 walk, but through the read-only pud2. */
	ac_test_init(&at3, (void *)(ptr3));
	at3.flags = AC_PDPTE_NO_WRITABLE_MASK | at1.flags;
	__ac_setup_specific_pages(&at3, pool, true, pmd, 0);

	/* Same pmd2 walk through pud2: write must now be disallowed. */
	ac_test_init(&at4, (void *)(ptr4));
	at4.flags = AC_PDPTE_NO_WRITABLE_MASK | at2.flags;
	__ac_setup_specific_pages(&at4, pool, true, pmd, 0);

	err_read_at1 = ac_test_do_access(&at1);
	if (!err_read_at1) {
		printf("%s: read access at1 fail\n", __FUNCTION__);
		return 0;
	}

	err_write_at2 = ac_test_do_access(&at2);
	if (!err_write_at2) {
		printf("%s: write access at2 fail\n", __FUNCTION__);
		return 0;
	}

	err_read_at3 = ac_test_do_access(&at3);
	if (!err_read_at3) {
		printf("%s: read access at3 fail\n", __FUNCTION__);
		return 0;
	}

	err_write_at4 = ac_test_do_access(&at4);
	if (!err_write_at4) {
		printf("%s: write access at4 should fail\n", __FUNCTION__);
		return 0;
	}

	return 1;
}
1040 
1041 static int ac_test_exec(ac_test_t *at, ac_pool_t *pool)
1042 {
1043     int r;
1044 
1045     if (verbose) {
1046         ac_test_show(at);
1047     }
1048     ac_test_setup_pte(at, pool);
1049     r = ac_test_do_access(at);
1050     return r;
1051 }
1052 
/* Signature shared by the hand-written regression tests below. */
typedef int (*ac_test_fn)(ac_pool_t *pool);
/* Regression tests run after the exhaustive flag-combination sweep. */
const ac_test_fn ac_test_cases[] =
{
	corrupt_hugepage_triger,
	check_pfec_on_prefetch_pte,
	check_large_pte_dirty_for_nowp,
	check_smep_andnot_wp,
	check_effective_sp_permissions,
};
1062 
/*
 * Main driver: probe CPU features (masking off untestable flag bits),
 * sweep every legal flag combination, then run the targeted regression
 * tests.  Returns non-zero iff every test passed.
 */
static int ac_test_run(void)
{
    ac_test_t at;
    ac_pool_t pool;
    int i, tests, successes;

    printf("run\n");
    tests = successes = 0;

    /* Seed the shadow copies from the live register values. */
    shadow_cr0 = read_cr0();
    shadow_cr4 = read_cr4();
    shadow_efer = rdmsr(MSR_EFER);

    /* Bits that are not reserved on this CPU cannot be fault-tested. */
    if (cpuid_maxphyaddr() >= 52) {
        invalid_mask |= AC_PDE_BIT51_MASK;
        invalid_mask |= AC_PTE_BIT51_MASK;
    }
    if (cpuid_maxphyaddr() >= 37) {
        invalid_mask |= AC_PDE_BIT36_MASK;
        invalid_mask |= AC_PTE_BIT36_MASK;
    }

    if (this_cpu_has(X86_FEATURE_PKU)) {
        set_cr4_pke(1);
        set_cr4_pke(0);
        /* Now PKRU = 0xFFFFFFFF.  */
    } else {
	/* Without PKU, setting CR4.PKE must raise #GP. */
	tests++;
	if (write_cr4_checking(shadow_cr4 | X86_CR4_PKE) == GP_VECTOR) {
            successes++;
            invalid_mask |= AC_PKU_AD_MASK;
            invalid_mask |= AC_PKU_WD_MASK;
            invalid_mask |= AC_PKU_PKEY_MASK;
            invalid_mask |= AC_CPU_CR4_PKE_MASK;
            printf("CR4.PKE not available, disabling PKE tests\n");
	} else {
            printf("Set PKE in CR4 - expect #GP: FAIL!\n");
            set_cr4_pke(0);
	}
    }

    if (!this_cpu_has(X86_FEATURE_SMEP)) {
	/* Without SMEP, setting CR4.SMEP must raise #GP. */
	tests++;
	if (set_cr4_smep(1) == GP_VECTOR) {
            successes++;
            invalid_mask |= AC_CPU_CR4_SMEP_MASK;
            printf("CR4.SMEP not available, disabling SMEP tests\n");
	} else {
            printf("Set SMEP in CR4 - expect #GP: FAIL!\n");
            set_cr4_smep(0);
	}
    }

    /* Toggling LA57 in 64-bit mode (guaranteed for this test) is illegal. */
    if (this_cpu_has(X86_FEATURE_LA57)) {
        tests++;
        if (write_cr4_checking(shadow_cr4 ^ X86_CR4_LA57) == GP_VECTOR)
            successes++;

        /* Force a VM-Exit on KVM, which doesn't intercept LA57 itself. */
        tests++;
        if (write_cr4_checking(shadow_cr4 ^ (X86_CR4_LA57 | X86_CR4_PSE)) == GP_VECTOR)
            successes++;
    }

    /* Exhaustive sweep over all legal flag combinations. */
    ac_env_int(&pool);
    ac_test_init(&at, (void *)(0x123400000000 + 16 * smp_id()));
    do {
	++tests;
	successes += ac_test_exec(&at, &pool);
    } while (ac_test_bump(&at));

    for (i = 0; i < ARRAY_SIZE(ac_test_cases); i++) {
	++tests;
	successes += ac_test_cases[i](&pool);
    }

    printf("\n%d tests, %d failures\n", tests, tests - successes);

    return successes == tests;
}
1144 
/*
 * Run the full suite with 4-level paging, then again with 5-level paging
 * when LA57 is available (non-UEFI builds only).  Exit status 0 iff the
 * last run passed.
 */
int main(void)
{
    int r;

    printf("starting test\n\n");
    page_table_levels = 4;
    r = ac_test_run();

#ifndef TARGET_EFI
    /*
     * Not supported yet for UEFI, because setting up 5
     * level page table requires entering real mode.
     */
    if (this_cpu_has(X86_FEATURE_LA57)) {
        page_table_levels = 5;
        printf("starting 5-level paging test.\n\n");
        setup_5level_page_table();
        r = ac_test_run();
    }
#endif

    return r ? 0 : 1;
}
1168