#include "libcflat.h"
#include "desc.h"
#include "processor.h"
#include "asm/page.h"
#include "x86/vm.h"

/* Single-CPU test: hard-code the SMP id. */
#define smp_id() 0

#define true 1
#define false 0

/* When set, print each test combination before running it. */
static _Bool verbose = false;

typedef unsigned long pt_element_t;
/* AC_* flag bits that are invalid on this CPU and must be skipped. */
static int invalid_mask;
/* 4 or 5, selected in main() based on LA57 support. */
static int page_table_levels;

/* Physical-address field of a non-PSE paging entry (bits 12..35 here). */
#define PT_BASE_ADDR_MASK ((pt_element_t)((((pt_element_t)1 << 36) - 1) & PAGE_MASK))
/* Physical-address field of a 2M (PSE) PDE: bit 21 is part of the address gap. */
#define PT_PSE_BASE_ADDR_MASK (PT_BASE_ADDR_MASK & ~(1ull << 21))

#define CR0_WP_MASK (1UL << 16)
#define CR4_SMEP_MASK (1UL << 20)

/* Page-fault error-code bits (Intel SDM Vol. 3A, "Page-Fault Exceptions"). */
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PFERR_RESERVED_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
#define PFERR_PK_MASK (1U << 5)

#define MSR_EFER 0xc0000080
#define EFER_NX_MASK (1ull << 11)

/* Index of @address into the paging structure at @level (1 = PTE level). */
#define PT_INDEX(address, level) \
	((address) >> (12 + ((level)-1) * 9)) & 511

/*
 * page table access check tests
 */

/*
 * One bit per knob the test can turn: PTE/PDE permission and reserved
 * bits, access type, and CPU control-register state.  A test case is an
 * arbitrary combination of these bits; ac_test_bump() iterates them all.
 * The bit positions are significant - ac_names[] and the *_MASK macros
 * below index by them - so the order must not change.
 */
enum {
	AC_PTE_PRESENT_BIT,
	AC_PTE_WRITABLE_BIT,
	AC_PTE_USER_BIT,
	AC_PTE_ACCESSED_BIT,
	AC_PTE_DIRTY_BIT,
	AC_PTE_NX_BIT,
	AC_PTE_BIT51_BIT,
	AC_PTE_BIT36_BIT,

	AC_PDE_PRESENT_BIT,
	AC_PDE_WRITABLE_BIT,
	AC_PDE_USER_BIT,
	AC_PDE_ACCESSED_BIT,
	AC_PDE_DIRTY_BIT,
	AC_PDE_PSE_BIT,
	AC_PDE_NX_BIT,
	AC_PDE_BIT51_BIT,
	AC_PDE_BIT36_BIT,
	AC_PDE_BIT13_BIT,

	/*
	 * special test case to DISABLE writable bit on page directory
	 * pointer table entry.
	 */
	AC_PDPTE_NO_WRITABLE_BIT,

	AC_PKU_AD_BIT,
	AC_PKU_WD_BIT,
	AC_PKU_PKEY_BIT,

	AC_ACCESS_USER_BIT,
	AC_ACCESS_WRITE_BIT,
	AC_ACCESS_FETCH_BIT,
	AC_ACCESS_TWICE_BIT,

	AC_CPU_EFER_NX_BIT,
	AC_CPU_CR0_WP_BIT,
	AC_CPU_CR4_SMEP_BIT,
	AC_CPU_CR4_PKE_BIT,

	NR_AC_FLAGS
};

#define AC_PTE_PRESENT_MASK   (1 << AC_PTE_PRESENT_BIT)
#define AC_PTE_WRITABLE_MASK  (1 << AC_PTE_WRITABLE_BIT)
#define AC_PTE_USER_MASK      (1 << AC_PTE_USER_BIT)
#define AC_PTE_ACCESSED_MASK  (1 << AC_PTE_ACCESSED_BIT)
#define AC_PTE_DIRTY_MASK     (1 << AC_PTE_DIRTY_BIT)
#define AC_PTE_NX_MASK        (1 << AC_PTE_NX_BIT)
#define AC_PTE_BIT51_MASK     (1 << AC_PTE_BIT51_BIT)
#define AC_PTE_BIT36_MASK     (1 << AC_PTE_BIT36_BIT)

#define AC_PDE_PRESENT_MASK   (1 << AC_PDE_PRESENT_BIT)
#define AC_PDE_WRITABLE_MASK  (1 << AC_PDE_WRITABLE_BIT)
#define AC_PDE_USER_MASK      (1 << AC_PDE_USER_BIT)
#define AC_PDE_ACCESSED_MASK  (1 << AC_PDE_ACCESSED_BIT)
#define AC_PDE_DIRTY_MASK     (1 << AC_PDE_DIRTY_BIT)
#define AC_PDE_PSE_MASK       (1 << AC_PDE_PSE_BIT)
#define AC_PDE_NX_MASK        (1 << AC_PDE_NX_BIT)
#define AC_PDE_BIT51_MASK     (1 << AC_PDE_BIT51_BIT)
#define AC_PDE_BIT36_MASK     (1 << AC_PDE_BIT36_BIT)
#define AC_PDE_BIT13_MASK     (1 << AC_PDE_BIT13_BIT)

#define AC_PDPTE_NO_WRITABLE_MASK (1 << AC_PDPTE_NO_WRITABLE_BIT)

#define AC_PKU_AD_MASK        (1 << AC_PKU_AD_BIT)
#define AC_PKU_WD_MASK        (1 << AC_PKU_WD_BIT)
#define AC_PKU_PKEY_MASK      (1 << AC_PKU_PKEY_BIT)

#define AC_ACCESS_USER_MASK   (1 << AC_ACCESS_USER_BIT)
#define AC_ACCESS_WRITE_MASK  (1 << AC_ACCESS_WRITE_BIT)
#define AC_ACCESS_FETCH_MASK  (1 << AC_ACCESS_FETCH_BIT)
#define AC_ACCESS_TWICE_MASK  (1 << AC_ACCESS_TWICE_BIT)

#define AC_CPU_EFER_NX_MASK   (1 << AC_CPU_EFER_NX_BIT)
#define AC_CPU_CR0_WP_MASK    (1 << AC_CPU_CR0_WP_BIT)
#define AC_CPU_CR4_SMEP_MASK  (1 << AC_CPU_CR4_SMEP_BIT)
#define AC_CPU_CR4_PKE_MASK   (1 << AC_CPU_CR4_PKE_BIT)
const char *ac_names[] = { 123 [AC_PTE_PRESENT_BIT] = "pte.p", 124 [AC_PTE_ACCESSED_BIT] = "pte.a", 125 [AC_PTE_WRITABLE_BIT] = "pte.rw", 126 [AC_PTE_USER_BIT] = "pte.user", 127 [AC_PTE_DIRTY_BIT] = "pte.d", 128 [AC_PTE_NX_BIT] = "pte.nx", 129 [AC_PTE_BIT51_BIT] = "pte.51", 130 [AC_PTE_BIT36_BIT] = "pte.36", 131 [AC_PDE_PRESENT_BIT] = "pde.p", 132 [AC_PDE_ACCESSED_BIT] = "pde.a", 133 [AC_PDE_WRITABLE_BIT] = "pde.rw", 134 [AC_PDE_USER_BIT] = "pde.user", 135 [AC_PDE_DIRTY_BIT] = "pde.d", 136 [AC_PDE_PSE_BIT] = "pde.pse", 137 [AC_PDE_NX_BIT] = "pde.nx", 138 [AC_PDE_BIT51_BIT] = "pde.51", 139 [AC_PDE_BIT36_BIT] = "pde.36", 140 [AC_PDE_BIT13_BIT] = "pde.13", 141 [AC_PDPTE_NO_WRITABLE_BIT] = "pdpte.ro", 142 [AC_PKU_AD_BIT] = "pkru.ad", 143 [AC_PKU_WD_BIT] = "pkru.wd", 144 [AC_PKU_PKEY_BIT] = "pkey=1", 145 [AC_ACCESS_WRITE_BIT] = "write", 146 [AC_ACCESS_USER_BIT] = "user", 147 [AC_ACCESS_FETCH_BIT] = "fetch", 148 [AC_ACCESS_TWICE_BIT] = "twice", 149 [AC_CPU_EFER_NX_BIT] = "efer.nx", 150 [AC_CPU_CR0_WP_BIT] = "cr0.wp", 151 [AC_CPU_CR4_SMEP_BIT] = "cr4.smep", 152 [AC_CPU_CR4_PKE_BIT] = "cr4.pke", 153 }; 154 155 static inline void *va(pt_element_t phys) 156 { 157 return (void *)phys; 158 } 159 160 typedef struct { 161 pt_element_t pt_pool; 162 unsigned pt_pool_size; 163 unsigned pt_pool_current; 164 } ac_pool_t; 165 166 typedef struct { 167 unsigned flags; 168 void *virt; 169 pt_element_t phys; 170 pt_element_t *ptep; 171 pt_element_t expected_pte; 172 pt_element_t *pdep; 173 pt_element_t expected_pde; 174 pt_element_t ignore_pde; 175 int expected_fault; 176 unsigned expected_error; 177 } ac_test_t; 178 179 typedef struct { 180 unsigned short limit; 181 unsigned long linear_addr; 182 } __attribute__((packed)) descriptor_table_t; 183 184 185 static void ac_test_show(ac_test_t *at); 186 187 static unsigned long shadow_cr0; 188 static unsigned long shadow_cr4; 189 static unsigned long long shadow_efer; 190 191 static void set_cr0_wp(int wp) 192 { 193 unsigned long cr0 = 
shadow_cr0; 194 195 cr0 &= ~CR0_WP_MASK; 196 if (wp) 197 cr0 |= CR0_WP_MASK; 198 if (cr0 != shadow_cr0) { 199 write_cr0(cr0); 200 shadow_cr0 = cr0; 201 } 202 } 203 204 static unsigned set_cr4_smep(int smep) 205 { 206 unsigned long cr4 = shadow_cr4; 207 extern u64 ptl2[]; 208 unsigned r; 209 210 cr4 &= ~CR4_SMEP_MASK; 211 if (smep) 212 cr4 |= CR4_SMEP_MASK; 213 if (cr4 == shadow_cr4) 214 return 0; 215 216 if (smep) 217 ptl2[2] &= ~PT_USER_MASK; 218 r = write_cr4_checking(cr4); 219 if (r || !smep) { 220 ptl2[2] |= PT_USER_MASK; 221 222 /* Flush to avoid spurious #PF */ 223 invlpg((void *)(2 << 21)); 224 } 225 if (!r) 226 shadow_cr4 = cr4; 227 return r; 228 } 229 230 static void set_cr4_pke(int pke) 231 { 232 unsigned long cr4 = shadow_cr4; 233 234 cr4 &= ~X86_CR4_PKE; 235 if (pke) 236 cr4 |= X86_CR4_PKE; 237 if (cr4 == shadow_cr4) 238 return; 239 240 /* Check that protection keys do not affect accesses when CR4.PKE=0. */ 241 if ((shadow_cr4 & X86_CR4_PKE) && !pke) 242 write_pkru(0xfffffffc); 243 write_cr4(cr4); 244 shadow_cr4 = cr4; 245 } 246 247 static void set_efer_nx(int nx) 248 { 249 unsigned long long efer = shadow_efer; 250 251 efer &= ~EFER_NX_MASK; 252 if (nx) 253 efer |= EFER_NX_MASK; 254 if (efer != shadow_efer) { 255 wrmsr(MSR_EFER, efer); 256 shadow_efer = efer; 257 } 258 } 259 260 static void ac_env_int(ac_pool_t *pool) 261 { 262 extern char page_fault, kernel_entry; 263 set_idt_entry(14, &page_fault, 0); 264 set_idt_entry(0x20, &kernel_entry, 3); 265 266 pool->pt_pool = 33 * 1024 * 1024; 267 pool->pt_pool_size = 120 * 1024 * 1024 - pool->pt_pool; 268 pool->pt_pool_current = 0; 269 } 270 271 static void ac_test_init(ac_test_t *at, void *virt) 272 { 273 set_efer_nx(1); 274 set_cr0_wp(1); 275 at->flags = 0; 276 at->virt = virt; 277 at->phys = 32 * 1024 * 1024; 278 } 279 280 static int ac_test_bump_one(ac_test_t *at) 281 { 282 at->flags = ((at->flags | invalid_mask) + 1) & ~invalid_mask; 283 return at->flags < (1 << NR_AC_FLAGS); 284 } 285 286 #define F(x) 
((flags & x##_MASK) != 0) 287 288 static _Bool ac_test_legal(ac_test_t *at) 289 { 290 int flags = at->flags; 291 unsigned reserved; 292 293 if (F(AC_ACCESS_FETCH) && F(AC_ACCESS_WRITE)) 294 return false; 295 296 /* 297 * Since we convert current page to kernel page when cr4.smep=1, 298 * we can't switch to user mode. 299 */ 300 if (F(AC_ACCESS_USER) && F(AC_CPU_CR4_SMEP)) 301 return false; 302 303 /* 304 * Only test protection key faults if CR4.PKE=1. 305 */ 306 if (!F(AC_CPU_CR4_PKE) && 307 (F(AC_PKU_AD) || F(AC_PKU_WD))) { 308 return false; 309 } 310 311 /* 312 * pde.bit13 checks handling of reserved bits in largepage PDEs. It is 313 * meaningless if there is a PTE. 314 */ 315 if (!F(AC_PDE_PSE) && F(AC_PDE_BIT13)) 316 return false; 317 318 /* 319 * Shorten the test by avoiding testing too many reserved bit combinations. 320 * Skip testing multiple reserved bits to shorten the test. Reserved bit 321 * page faults are terminal and multiple reserved bits do not affect the 322 * error code; the odds of a KVM bug are super low, and the odds of actually 323 * being able to detect a bug are even lower. 324 */ 325 reserved = (AC_PDE_BIT51_MASK | AC_PDE_BIT36_MASK | AC_PDE_BIT13_MASK | 326 AC_PTE_BIT51_MASK | AC_PTE_BIT36_MASK); 327 if (!F(AC_CPU_EFER_NX)) 328 reserved |= AC_PDE_NX_MASK | AC_PTE_NX_MASK; 329 330 /* Only test one reserved bit at a time. 
*/ 331 reserved &= flags; 332 if (reserved & (reserved - 1)) 333 return false; 334 335 return true; 336 } 337 338 static int ac_test_bump(ac_test_t *at) 339 { 340 int ret; 341 342 ret = ac_test_bump_one(at); 343 while (ret && !ac_test_legal(at)) 344 ret = ac_test_bump_one(at); 345 return ret; 346 } 347 348 static pt_element_t ac_test_alloc_pt(ac_pool_t *pool) 349 { 350 pt_element_t ret = pool->pt_pool + pool->pt_pool_current; 351 pool->pt_pool_current += PAGE_SIZE; 352 memset(va(ret), 0, PAGE_SIZE); 353 return ret; 354 } 355 356 static _Bool ac_test_enough_room(ac_pool_t *pool) 357 { 358 return pool->pt_pool_current + 5 * PAGE_SIZE <= pool->pt_pool_size; 359 } 360 361 static void ac_test_reset_pt_pool(ac_pool_t *pool) 362 { 363 pool->pt_pool_current = 0; 364 } 365 366 static pt_element_t ac_test_permissions(ac_test_t *at, unsigned flags, 367 bool writable, bool user, 368 bool executable) 369 { 370 bool kwritable = !F(AC_CPU_CR0_WP) && !F(AC_ACCESS_USER); 371 pt_element_t expected = 0; 372 373 if (F(AC_ACCESS_USER) && !user) 374 at->expected_fault = 1; 375 376 if (F(AC_ACCESS_WRITE) && !writable && !kwritable) 377 at->expected_fault = 1; 378 379 if (F(AC_ACCESS_FETCH) && !executable) 380 at->expected_fault = 1; 381 382 if (F(AC_ACCESS_FETCH) && user && F(AC_CPU_CR4_SMEP)) 383 at->expected_fault = 1; 384 385 if (user && !F(AC_ACCESS_FETCH) && F(AC_PKU_PKEY) && F(AC_CPU_CR4_PKE)) { 386 if (F(AC_PKU_AD)) { 387 at->expected_fault = 1; 388 at->expected_error |= PFERR_PK_MASK; 389 } else if (F(AC_ACCESS_WRITE) && F(AC_PKU_WD) && !kwritable) { 390 at->expected_fault = 1; 391 at->expected_error |= PFERR_PK_MASK; 392 } 393 } 394 395 if (!at->expected_fault) { 396 expected |= PT_ACCESSED_MASK; 397 if (F(AC_ACCESS_WRITE)) 398 expected |= PT_DIRTY_MASK; 399 } 400 401 return expected; 402 } 403 404 static void ac_emulate_access(ac_test_t *at, unsigned flags) 405 { 406 bool pde_valid, pte_valid; 407 bool user, writable, executable; 408 409 if (F(AC_ACCESS_USER)) 410 
at->expected_error |= PFERR_USER_MASK; 411 412 if (F(AC_ACCESS_WRITE)) 413 at->expected_error |= PFERR_WRITE_MASK; 414 415 if (F(AC_ACCESS_FETCH)) 416 at->expected_error |= PFERR_FETCH_MASK; 417 418 if (!F(AC_PDE_ACCESSED)) 419 at->ignore_pde = PT_ACCESSED_MASK; 420 421 pde_valid = F(AC_PDE_PRESENT) 422 && !F(AC_PDE_BIT51) && !F(AC_PDE_BIT36) && !F(AC_PDE_BIT13) 423 && !(F(AC_PDE_NX) && !F(AC_CPU_EFER_NX)); 424 425 if (!pde_valid) { 426 at->expected_fault = 1; 427 if (F(AC_PDE_PRESENT)) { 428 at->expected_error |= PFERR_RESERVED_MASK; 429 } else { 430 at->expected_error &= ~PFERR_PRESENT_MASK; 431 } 432 goto fault; 433 } 434 435 writable = !F(AC_PDPTE_NO_WRITABLE) && F(AC_PDE_WRITABLE); 436 user = F(AC_PDE_USER); 437 executable = !F(AC_PDE_NX); 438 439 if (F(AC_PDE_PSE)) { 440 at->expected_pde |= ac_test_permissions(at, flags, writable, user, 441 executable); 442 goto no_pte; 443 } 444 445 at->expected_pde |= PT_ACCESSED_MASK; 446 447 pte_valid = F(AC_PTE_PRESENT) 448 && !F(AC_PTE_BIT51) && !F(AC_PTE_BIT36) 449 && !(F(AC_PTE_NX) && !F(AC_CPU_EFER_NX)); 450 451 if (!pte_valid) { 452 at->expected_fault = 1; 453 if (F(AC_PTE_PRESENT)) { 454 at->expected_error |= PFERR_RESERVED_MASK; 455 } else { 456 at->expected_error &= ~PFERR_PRESENT_MASK; 457 } 458 goto fault; 459 } 460 461 writable &= F(AC_PTE_WRITABLE); 462 user &= F(AC_PTE_USER); 463 executable &= !F(AC_PTE_NX); 464 465 at->expected_pte |= ac_test_permissions(at, flags, writable, user, 466 executable); 467 468 no_pte: 469 fault: 470 if (!at->expected_fault) 471 at->ignore_pde = 0; 472 if (!F(AC_CPU_EFER_NX) && !F(AC_CPU_CR4_SMEP)) 473 at->expected_error &= ~PFERR_FETCH_MASK; 474 } 475 476 static void ac_set_expected_status(ac_test_t *at) 477 { 478 invlpg(at->virt); 479 480 if (at->ptep) 481 at->expected_pte = *at->ptep; 482 at->expected_pde = *at->pdep; 483 at->ignore_pde = 0; 484 at->expected_fault = 0; 485 at->expected_error = PFERR_PRESENT_MASK; 486 487 if (at->flags & AC_ACCESS_TWICE_MASK) { 488 
ac_emulate_access(at, at->flags & ~AC_ACCESS_WRITE_MASK 489 & ~AC_ACCESS_FETCH_MASK & ~AC_ACCESS_USER_MASK); 490 at->expected_fault = 0; 491 at->expected_error = PFERR_PRESENT_MASK; 492 at->ignore_pde = 0; 493 } 494 495 ac_emulate_access(at, at->flags); 496 } 497 498 static void __ac_setup_specific_pages(ac_test_t *at, ac_pool_t *pool, bool reuse, 499 u64 pd_page, u64 pt_page) 500 501 { 502 unsigned long root = read_cr3(); 503 int flags = at->flags; 504 bool skip = true; 505 506 if (!ac_test_enough_room(pool)) 507 ac_test_reset_pt_pool(pool); 508 509 at->ptep = 0; 510 for (int i = page_table_levels; i >= 1 && (i >= 2 || !F(AC_PDE_PSE)); --i) { 511 pt_element_t *vroot = va(root & PT_BASE_ADDR_MASK); 512 unsigned index = PT_INDEX((unsigned long)at->virt, i); 513 pt_element_t pte = 0; 514 515 /* 516 * Reuse existing page tables along the path to the test code and data 517 * (which is in the bottom 2MB). 518 */ 519 if (skip && i >= 2 && index == 0) { 520 goto next; 521 } 522 skip = false; 523 if (reuse && vroot[index]) { 524 switch (i) { 525 case 2: 526 at->pdep = &vroot[index]; 527 break; 528 case 1: 529 at->ptep = &vroot[index]; 530 break; 531 } 532 goto next; 533 } 534 535 switch (i) { 536 case 5: 537 case 4: 538 pte = ac_test_alloc_pt(pool); 539 pte |= PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 540 break; 541 case 3: 542 pte = pd_page ? pd_page : ac_test_alloc_pt(pool); 543 pte |= PT_PRESENT_MASK | PT_USER_MASK; 544 if (!F(AC_PDPTE_NO_WRITABLE)) 545 pte |= PT_WRITABLE_MASK; 546 break; 547 case 2: 548 if (!F(AC_PDE_PSE)) { 549 pte = pt_page ? pt_page : ac_test_alloc_pt(pool); 550 /* The protection key is ignored on non-leaf entries. 
*/ 551 if (F(AC_PKU_PKEY)) 552 pte |= 2ull << 59; 553 } else { 554 pte = at->phys & PT_PSE_BASE_ADDR_MASK; 555 pte |= PT_PAGE_SIZE_MASK; 556 if (F(AC_PKU_PKEY)) 557 pte |= 1ull << 59; 558 } 559 if (F(AC_PDE_PRESENT)) 560 pte |= PT_PRESENT_MASK; 561 if (F(AC_PDE_WRITABLE)) 562 pte |= PT_WRITABLE_MASK; 563 if (F(AC_PDE_USER)) 564 pte |= PT_USER_MASK; 565 if (F(AC_PDE_ACCESSED)) 566 pte |= PT_ACCESSED_MASK; 567 if (F(AC_PDE_DIRTY)) 568 pte |= PT_DIRTY_MASK; 569 if (F(AC_PDE_NX)) 570 pte |= PT64_NX_MASK; 571 if (F(AC_PDE_BIT51)) 572 pte |= 1ull << 51; 573 if (F(AC_PDE_BIT36)) 574 pte |= 1ull << 36; 575 if (F(AC_PDE_BIT13)) 576 pte |= 1ull << 13; 577 at->pdep = &vroot[index]; 578 break; 579 case 1: 580 pte = at->phys & PT_BASE_ADDR_MASK; 581 if (F(AC_PKU_PKEY)) 582 pte |= 1ull << 59; 583 if (F(AC_PTE_PRESENT)) 584 pte |= PT_PRESENT_MASK; 585 if (F(AC_PTE_WRITABLE)) 586 pte |= PT_WRITABLE_MASK; 587 if (F(AC_PTE_USER)) 588 pte |= PT_USER_MASK; 589 if (F(AC_PTE_ACCESSED)) 590 pte |= PT_ACCESSED_MASK; 591 if (F(AC_PTE_DIRTY)) 592 pte |= PT_DIRTY_MASK; 593 if (F(AC_PTE_NX)) 594 pte |= PT64_NX_MASK; 595 if (F(AC_PTE_BIT51)) 596 pte |= 1ull << 51; 597 if (F(AC_PTE_BIT36)) 598 pte |= 1ull << 36; 599 at->ptep = &vroot[index]; 600 break; 601 } 602 vroot[index] = pte; 603 next: 604 root = vroot[index]; 605 } 606 ac_set_expected_status(at); 607 } 608 609 static void ac_test_setup_pte(ac_test_t *at, ac_pool_t *pool) 610 { 611 __ac_setup_specific_pages(at, pool, false, 0, 0); 612 } 613 614 static void ac_setup_specific_pages(ac_test_t *at, ac_pool_t *pool, 615 u64 pd_page, u64 pt_page) 616 { 617 return __ac_setup_specific_pages(at, pool, false, pd_page, pt_page); 618 } 619 620 static void dump_mapping(ac_test_t *at) 621 { 622 unsigned long root = read_cr3(); 623 int flags = at->flags; 624 int i; 625 626 printf("Dump mapping: address: %p\n", at->virt); 627 for (i = page_table_levels ; i >= 1 && (i >= 2 || !F(AC_PDE_PSE)); --i) { 628 pt_element_t *vroot = va(root & PT_BASE_ADDR_MASK); 
629 unsigned index = PT_INDEX((unsigned long)at->virt, i); 630 pt_element_t pte = vroot[index]; 631 632 printf("------L%d: %lx\n", i, pte); 633 root = vroot[index]; 634 } 635 } 636 637 static void ac_test_check(ac_test_t *at, _Bool *success_ret, _Bool cond, 638 const char *fmt, ...) 639 { 640 va_list ap; 641 char buf[500]; 642 643 if (!*success_ret) { 644 return; 645 } 646 647 if (!cond) { 648 return; 649 } 650 651 *success_ret = false; 652 653 if (!verbose) { 654 puts("\n"); 655 ac_test_show(at); 656 } 657 658 va_start(ap, fmt); 659 vsnprintf(buf, sizeof(buf), fmt, ap); 660 va_end(ap); 661 printf("FAIL: %s\n", buf); 662 dump_mapping(at); 663 } 664 665 static int pt_match(pt_element_t pte1, pt_element_t pte2, pt_element_t ignore) 666 { 667 pte1 &= ~ignore; 668 pte2 &= ~ignore; 669 return pte1 == pte2; 670 } 671 672 static int ac_test_do_access(ac_test_t *at) 673 { 674 static unsigned unique = 42; 675 int fault = 0; 676 unsigned e; 677 static unsigned char user_stack[4096]; 678 unsigned long rsp; 679 _Bool success = true; 680 int flags = at->flags; 681 682 ++unique; 683 if (!(unique & 65535)) { 684 puts("."); 685 } 686 687 *((unsigned char *)at->phys) = 0xc3; /* ret */ 688 689 unsigned r = unique; 690 set_cr0_wp(F(AC_CPU_CR0_WP)); 691 set_efer_nx(F(AC_CPU_EFER_NX)); 692 set_cr4_pke(F(AC_CPU_CR4_PKE)); 693 if (F(AC_CPU_CR4_PKE)) { 694 /* WD2=AD2=1, WD1=F(AC_PKU_WD), AD1=F(AC_PKU_AD) */ 695 write_pkru(0x30 | (F(AC_PKU_WD) ? 8 : 0) | 696 (F(AC_PKU_AD) ? 
4 : 0)); 697 } 698 699 set_cr4_smep(F(AC_CPU_CR4_SMEP)); 700 701 if (F(AC_ACCESS_TWICE)) { 702 asm volatile ( 703 "lea fixed2(%%rip), %%rsi \n\t" 704 "mov (%[addr]), %[reg] \n\t" 705 "fixed2:" 706 : [reg]"=r"(r), [fault]"=a"(fault), "=b"(e) 707 : [addr]"r"(at->virt) 708 : "rsi" 709 ); 710 fault = 0; 711 } 712 713 asm volatile ("lea fixed1(%%rip), %%rsi \n\t" 714 "mov %%rsp, %[rsp0] \n\t" 715 "cmp $0, %[user] \n\t" 716 "jz do_access \n\t" 717 "push %%rax; mov %[user_ds], %%ax; mov %%ax, %%ds; pop %%rax \n\t" 718 "pushq %[user_ds] \n\t" 719 "pushq %[user_stack_top] \n\t" 720 "pushfq \n\t" 721 "pushq %[user_cs] \n\t" 722 "lea do_access(%%rip), %%r8\n\t" 723 "pushq %%r8\n\t" 724 "iretq \n" 725 "do_access: \n\t" 726 "cmp $0, %[fetch] \n\t" 727 "jnz 2f \n\t" 728 "cmp $0, %[write] \n\t" 729 "jnz 1f \n\t" 730 "mov (%[addr]), %[reg] \n\t" 731 "jmp done \n\t" 732 "1: mov %[reg], (%[addr]) \n\t" 733 "jmp done \n\t" 734 "2: call *%[addr] \n\t" 735 "done: \n" 736 "fixed1: \n" 737 "int %[kernel_entry_vector] \n\t" 738 ".section .text.entry \n\t" 739 "kernel_entry: \n\t" 740 "mov %[rsp0], %%rsp \n\t" 741 "jmp back_to_kernel \n\t" 742 ".section .text \n\t" 743 "back_to_kernel:" 744 : [reg]"+r"(r), "+a"(fault), "=b"(e), "=&d"(rsp), 745 [rsp0]"=m"(tss[0].rsp0) 746 : [addr]"r"(at->virt), 747 [write]"r"(F(AC_ACCESS_WRITE)), 748 [user]"r"(F(AC_ACCESS_USER)), 749 [fetch]"r"(F(AC_ACCESS_FETCH)), 750 [user_ds]"i"(USER_DS), 751 [user_cs]"i"(USER_CS), 752 [user_stack_top]"r"(user_stack + sizeof user_stack), 753 [kernel_entry_vector]"i"(0x20) 754 : "rsi", "r8"); 755 756 asm volatile (".section .text.pf \n\t" 757 "page_fault: \n\t" 758 "pop %rbx \n\t" 759 "mov %rsi, (%rsp) \n\t" 760 "movl $1, %eax \n\t" 761 "iretq \n\t" 762 ".section .text"); 763 764 ac_test_check(at, &success, fault && !at->expected_fault, 765 "unexpected fault"); 766 ac_test_check(at, &success, !fault && at->expected_fault, 767 "unexpected access"); 768 ac_test_check(at, &success, fault && e != at->expected_error, 769 
"error code %x expected %x", e, at->expected_error); 770 if (at->ptep) 771 ac_test_check(at, &success, *at->ptep != at->expected_pte, 772 "pte %x expected %x", *at->ptep, at->expected_pte); 773 ac_test_check(at, &success, 774 !pt_match(*at->pdep, at->expected_pde, at->ignore_pde), 775 "pde %x expected %x", *at->pdep, at->expected_pde); 776 777 if (success && verbose) { 778 if (at->expected_fault) { 779 printf("PASS (%x)\n", at->expected_error); 780 } else { 781 printf("PASS\n"); 782 } 783 } 784 return success; 785 } 786 787 static void ac_test_show(ac_test_t *at) 788 { 789 char line[5000]; 790 791 *line = 0; 792 strcat(line, "test"); 793 for (int i = 0; i < NR_AC_FLAGS; ++i) 794 if (at->flags & (1 << i)) { 795 strcat(line, " "); 796 strcat(line, ac_names[i]); 797 } 798 799 strcat(line, ": "); 800 printf("%s", line); 801 } 802 803 /* 804 * This test case is used to triger the bug which is fixed by 805 * commit e09e90a5 in the kvm tree 806 */ 807 static int corrupt_hugepage_triger(ac_pool_t *pool) 808 { 809 ac_test_t at1, at2; 810 811 ac_test_init(&at1, (void *)(0x123400000000)); 812 ac_test_init(&at2, (void *)(0x666600000000)); 813 814 at2.flags = AC_CPU_CR0_WP_MASK | AC_PDE_PSE_MASK | AC_PDE_PRESENT_MASK; 815 ac_test_setup_pte(&at2, pool); 816 if (!ac_test_do_access(&at2)) 817 goto err; 818 819 at1.flags = at2.flags | AC_PDE_WRITABLE_MASK; 820 ac_test_setup_pte(&at1, pool); 821 if (!ac_test_do_access(&at1)) 822 goto err; 823 824 at1.flags |= AC_ACCESS_WRITE_MASK; 825 ac_set_expected_status(&at1); 826 if (!ac_test_do_access(&at1)) 827 goto err; 828 829 at2.flags |= AC_ACCESS_WRITE_MASK; 830 ac_set_expected_status(&at2); 831 if (!ac_test_do_access(&at2)) 832 goto err; 833 834 return 1; 835 836 err: 837 printf("corrupt_hugepage_triger test fail\n"); 838 return 0; 839 } 840 841 /* 842 * This test case is used to triger the bug which is fixed by 843 * commit 3ddf6c06e13e in the kvm tree 844 */ 845 static int check_pfec_on_prefetch_pte(ac_pool_t *pool) 846 { 847 
ac_test_t at1, at2; 848 849 ac_test_init(&at1, (void *)(0x123406001000)); 850 ac_test_init(&at2, (void *)(0x123406003000)); 851 852 at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK; 853 ac_setup_specific_pages(&at1, pool, 30 * 1024 * 1024, 30 * 1024 * 1024); 854 855 at2.flags = at1.flags | AC_PTE_NX_MASK; 856 ac_setup_specific_pages(&at2, pool, 30 * 1024 * 1024, 30 * 1024 * 1024); 857 858 if (!ac_test_do_access(&at1)) { 859 printf("%s: prepare fail\n", __FUNCTION__); 860 goto err; 861 } 862 863 if (!ac_test_do_access(&at2)) { 864 printf("%s: check PFEC on prefetch pte path fail\n", 865 __FUNCTION__); 866 goto err; 867 } 868 869 return 1; 870 871 err: 872 return 0; 873 } 874 875 /* 876 * If the write-fault access is from supervisor and CR0.WP is not set on the 877 * vcpu, kvm will fix it by adjusting pte access - it sets the W bit on pte 878 * and clears U bit. This is the chance that kvm can change pte access from 879 * readonly to writable. 880 * 881 * Unfortunately, the pte access is the access of 'direct' shadow page table, 882 * means direct sp.role.access = pte_access, then we will create a writable 883 * spte entry on the readonly shadow page table. It will cause Dirty bit is 884 * not tracked when two guest ptes point to the same large page. Note, it 885 * does not have other impact except Dirty bit since cr0.wp is encoded into 886 * sp.role. 887 * 888 * Note: to trigger this bug, hugepage should be disabled on host. 
889 */ 890 static int check_large_pte_dirty_for_nowp(ac_pool_t *pool) 891 { 892 ac_test_t at1, at2; 893 894 ac_test_init(&at1, (void *)(0x123403000000)); 895 ac_test_init(&at2, (void *)(0x666606000000)); 896 897 at2.flags = AC_PDE_PRESENT_MASK | AC_PDE_PSE_MASK; 898 ac_test_setup_pte(&at2, pool); 899 if (!ac_test_do_access(&at2)) { 900 printf("%s: read on the first mapping fail.\n", __FUNCTION__); 901 goto err; 902 } 903 904 at1.flags = at2.flags | AC_ACCESS_WRITE_MASK; 905 ac_test_setup_pte(&at1, pool); 906 if (!ac_test_do_access(&at1)) { 907 printf("%s: write on the second mapping fail.\n", __FUNCTION__); 908 goto err; 909 } 910 911 at2.flags |= AC_ACCESS_WRITE_MASK; 912 ac_set_expected_status(&at2); 913 if (!ac_test_do_access(&at2)) { 914 printf("%s: write on the first mapping fail.\n", __FUNCTION__); 915 goto err; 916 } 917 918 return 1; 919 920 err: 921 return 0; 922 } 923 924 static int check_smep_andnot_wp(ac_pool_t *pool) 925 { 926 ac_test_t at1; 927 int err_prepare_andnot_wp, err_smep_andnot_wp; 928 929 if (!this_cpu_has(X86_FEATURE_SMEP)) { 930 return 1; 931 } 932 933 ac_test_init(&at1, (void *)(0x123406001000)); 934 935 at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK | 936 AC_PDE_USER_MASK | AC_PTE_USER_MASK | 937 AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK | 938 AC_CPU_CR4_SMEP_MASK | 939 AC_CPU_CR0_WP_MASK | 940 AC_ACCESS_WRITE_MASK; 941 ac_test_setup_pte(&at1, pool); 942 943 /* 944 * Here we write the ro user page when 945 * cr0.wp=0, then we execute it and SMEP 946 * fault should happen. 
947 */ 948 err_prepare_andnot_wp = ac_test_do_access(&at1); 949 if (!err_prepare_andnot_wp) { 950 printf("%s: SMEP prepare fail\n", __FUNCTION__); 951 goto clean_up; 952 } 953 954 at1.flags &= ~AC_ACCESS_WRITE_MASK; 955 at1.flags |= AC_ACCESS_FETCH_MASK; 956 ac_set_expected_status(&at1); 957 err_smep_andnot_wp = ac_test_do_access(&at1); 958 959 clean_up: 960 set_cr4_smep(0); 961 962 if (!err_prepare_andnot_wp) 963 goto err; 964 if (!err_smep_andnot_wp) { 965 printf("%s: check SMEP without wp fail\n", __FUNCTION__); 966 goto err; 967 } 968 return 1; 969 970 err: 971 return 0; 972 } 973 974 static int check_effective_sp_permissions(ac_pool_t *pool) 975 { 976 unsigned long ptr1 = 0x123480000000; 977 unsigned long ptr2 = ptr1 + SZ_2M; 978 unsigned long ptr3 = ptr1 + SZ_1G; 979 unsigned long ptr4 = ptr3 + SZ_2M; 980 pt_element_t pmd = ac_test_alloc_pt(pool); 981 ac_test_t at1, at2, at3, at4; 982 int err_read_at1, err_write_at2; 983 int err_read_at3, err_write_at4; 984 985 /* 986 * pgd[] pud[] pmd[] virtual address pointers 987 * /->pmd1(u--)->pte1(uw-)->page1 <- ptr1 (u--) 988 * /->pud1(uw-)--->pmd2(uw-)->pte2(uw-)->page2 <- ptr2 (uw-) 989 * pgd-| (shared pmd[] as above) 990 * \->pud2(u--)--->pmd1(u--)->pte1(uw-)->page1 <- ptr3 (u--) 991 * \->pmd2(uw-)->pte2(uw-)->page2 <- ptr4 (u--) 992 * pud1 and pud2 point to the same pmd page. 
993 */ 994 995 ac_test_init(&at1, (void *)(ptr1)); 996 at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK | 997 AC_PDE_USER_MASK | AC_PTE_USER_MASK | 998 AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK | 999 AC_PTE_WRITABLE_MASK | AC_ACCESS_USER_MASK; 1000 __ac_setup_specific_pages(&at1, pool, false, pmd, 0); 1001 1002 ac_test_init(&at2, (void *)(ptr2)); 1003 at2.flags = at1.flags | AC_PDE_WRITABLE_MASK | AC_PTE_DIRTY_MASK | AC_ACCESS_WRITE_MASK; 1004 __ac_setup_specific_pages(&at2, pool, true, pmd, 0); 1005 1006 ac_test_init(&at3, (void *)(ptr3)); 1007 at3.flags = AC_PDPTE_NO_WRITABLE_MASK | at1.flags; 1008 __ac_setup_specific_pages(&at3, pool, true, pmd, 0); 1009 1010 ac_test_init(&at4, (void *)(ptr4)); 1011 at4.flags = AC_PDPTE_NO_WRITABLE_MASK | at2.flags; 1012 __ac_setup_specific_pages(&at4, pool, true, pmd, 0); 1013 1014 err_read_at1 = ac_test_do_access(&at1); 1015 if (!err_read_at1) { 1016 printf("%s: read access at1 fail\n", __FUNCTION__); 1017 return 0; 1018 } 1019 1020 err_write_at2 = ac_test_do_access(&at2); 1021 if (!err_write_at2) { 1022 printf("%s: write access at2 fail\n", __FUNCTION__); 1023 return 0; 1024 } 1025 1026 err_read_at3 = ac_test_do_access(&at3); 1027 if (!err_read_at3) { 1028 printf("%s: read access at3 fail\n", __FUNCTION__); 1029 return 0; 1030 } 1031 1032 err_write_at4 = ac_test_do_access(&at4); 1033 if (!err_write_at4) { 1034 printf("%s: write access at4 should fail\n", __FUNCTION__); 1035 return 0; 1036 } 1037 1038 return 1; 1039 } 1040 1041 static int ac_test_exec(ac_test_t *at, ac_pool_t *pool) 1042 { 1043 int r; 1044 1045 if (verbose) { 1046 ac_test_show(at); 1047 } 1048 ac_test_setup_pte(at, pool); 1049 r = ac_test_do_access(at); 1050 return r; 1051 } 1052 1053 typedef int (*ac_test_fn)(ac_pool_t *pool); 1054 const ac_test_fn ac_test_cases[] = 1055 { 1056 corrupt_hugepage_triger, 1057 check_pfec_on_prefetch_pte, 1058 check_large_pte_dirty_for_nowp, 1059 check_smep_andnot_wp, 1060 check_effective_sp_permissions, 1061 }; 1062 1063 
static int ac_test_run(void) 1064 { 1065 ac_test_t at; 1066 ac_pool_t pool; 1067 int i, tests, successes; 1068 1069 printf("run\n"); 1070 tests = successes = 0; 1071 1072 shadow_cr0 = read_cr0(); 1073 shadow_cr4 = read_cr4(); 1074 shadow_efer = rdmsr(MSR_EFER); 1075 1076 if (cpuid_maxphyaddr() >= 52) { 1077 invalid_mask |= AC_PDE_BIT51_MASK; 1078 invalid_mask |= AC_PTE_BIT51_MASK; 1079 } 1080 if (cpuid_maxphyaddr() >= 37) { 1081 invalid_mask |= AC_PDE_BIT36_MASK; 1082 invalid_mask |= AC_PTE_BIT36_MASK; 1083 } 1084 1085 if (this_cpu_has(X86_FEATURE_PKU)) { 1086 set_cr4_pke(1); 1087 set_cr4_pke(0); 1088 /* Now PKRU = 0xFFFFFFFF. */ 1089 } else { 1090 tests++; 1091 if (write_cr4_checking(shadow_cr4 | X86_CR4_PKE) == GP_VECTOR) { 1092 successes++; 1093 invalid_mask |= AC_PKU_AD_MASK; 1094 invalid_mask |= AC_PKU_WD_MASK; 1095 invalid_mask |= AC_PKU_PKEY_MASK; 1096 invalid_mask |= AC_CPU_CR4_PKE_MASK; 1097 printf("CR4.PKE not available, disabling PKE tests\n"); 1098 } else { 1099 printf("Set PKE in CR4 - expect #GP: FAIL!\n"); 1100 set_cr4_pke(0); 1101 } 1102 } 1103 1104 if (!this_cpu_has(X86_FEATURE_SMEP)) { 1105 tests++; 1106 if (set_cr4_smep(1) == GP_VECTOR) { 1107 successes++; 1108 invalid_mask |= AC_CPU_CR4_SMEP_MASK; 1109 printf("CR4.SMEP not available, disabling SMEP tests\n"); 1110 } else { 1111 printf("Set SMEP in CR4 - expect #GP: FAIL!\n"); 1112 set_cr4_smep(0); 1113 } 1114 } 1115 1116 /* Toggling LA57 in 64-bit mode (guaranteed for this test) is illegal. */ 1117 if (this_cpu_has(X86_FEATURE_LA57)) { 1118 tests++; 1119 if (write_cr4_checking(shadow_cr4 ^ X86_CR4_LA57) == GP_VECTOR) 1120 successes++; 1121 1122 /* Force a VM-Exit on KVM, which doesn't intercept LA57 itself. 
*/ 1123 tests++; 1124 if (write_cr4_checking(shadow_cr4 ^ (X86_CR4_LA57 | X86_CR4_PSE)) == GP_VECTOR) 1125 successes++; 1126 } 1127 1128 ac_env_int(&pool); 1129 ac_test_init(&at, (void *)(0x123400000000 + 16 * smp_id())); 1130 do { 1131 ++tests; 1132 successes += ac_test_exec(&at, &pool); 1133 } while (ac_test_bump(&at)); 1134 1135 for (i = 0; i < ARRAY_SIZE(ac_test_cases); i++) { 1136 ++tests; 1137 successes += ac_test_cases[i](&pool); 1138 } 1139 1140 printf("\n%d tests, %d failures\n", tests, tests - successes); 1141 1142 return successes == tests; 1143 } 1144 1145 int main(void) 1146 { 1147 int r; 1148 1149 printf("starting test\n\n"); 1150 page_table_levels = 4; 1151 r = ac_test_run(); 1152 1153 #ifndef TARGET_EFI 1154 /* 1155 * Not supported yet for UEFI, because setting up 5 1156 * level page table requires entering real mode. 1157 */ 1158 if (this_cpu_has(X86_FEATURE_LA57)) { 1159 page_table_levels = 5; 1160 printf("starting 5-level paging test.\n\n"); 1161 setup_5level_page_table(); 1162 r = ac_test_run(); 1163 } 1164 #endif 1165 1166 return r ? 0 : 1; 1167 } 1168