1 #include "libcflat.h" 2 #include "desc.h" 3 #include "processor.h" 4 #include "asm/page.h" 5 #include "x86/vm.h" 6 #include "access.h" 7 8 #define true 1 9 #define false 0 10 11 static _Bool verbose = false; 12 13 typedef unsigned long pt_element_t; 14 static int invalid_mask; 15 16 /* Test code/data is at 32MiB, paging structures at 33MiB. */ 17 #define AT_CODE_DATA_PHYS 32 * 1024 * 1024 18 #define AT_PAGING_STRUCTURES_PHYS 33 * 1024 * 1024 19 20 #define PT_BASE_ADDR_MASK ((pt_element_t)((((pt_element_t)1 << 36) - 1) & PAGE_MASK)) 21 #define PT_PSE_BASE_ADDR_MASK (PT_BASE_ADDR_MASK & ~(1ull << 21)) 22 23 #define CR0_WP_MASK (1UL << 16) 24 #define CR4_SMEP_MASK (1UL << 20) 25 26 #define PFERR_PRESENT_MASK (1U << 0) 27 #define PFERR_WRITE_MASK (1U << 1) 28 #define PFERR_USER_MASK (1U << 2) 29 #define PFERR_RESERVED_MASK (1U << 3) 30 #define PFERR_FETCH_MASK (1U << 4) 31 #define PFERR_PK_MASK (1U << 5) 32 33 #define MSR_EFER 0xc0000080 34 #define EFER_NX_MASK (1ull << 11) 35 36 #define PT_INDEX(address, level) \ 37 (((address) >> (12 + ((level)-1) * 9)) & 511) 38 39 /* 40 * Page table access check tests. Each number/bit represent an individual 41 * test case. The main test will bump a counter by 1 to run all permutations 42 * of the below test cases (sans illegal combinations). 43 * 44 * Keep the PRESENT and reserved bits in the higher numbers so that they aren't 45 * toggled on every test, e.g. to keep entries in the TLB. 46 */ 47 enum { 48 AC_PTE_WRITABLE_BIT, 49 AC_PTE_USER_BIT, 50 AC_PTE_ACCESSED_BIT, 51 AC_PTE_DIRTY_BIT, 52 AC_PTE_NX_BIT, 53 AC_PTE_PRESENT_BIT, 54 AC_PTE_BIT51_BIT, 55 AC_PTE_BIT36_BIT, 56 57 AC_PDE_WRITABLE_BIT, 58 AC_PDE_USER_BIT, 59 AC_PDE_ACCESSED_BIT, 60 AC_PDE_DIRTY_BIT, 61 AC_PDE_PSE_BIT, 62 AC_PDE_NX_BIT, 63 AC_PDE_PRESENT_BIT, 64 AC_PDE_BIT51_BIT, 65 AC_PDE_BIT36_BIT, 66 AC_PDE_BIT13_BIT, 67 68 /* 69 * special test case to DISABLE writable bit on page directory 70 * pointer table entry. 71 */ 72 AC_PDPTE_NO_WRITABLE_BIT, 73 74 AC_PKU_AD_BIT, 75 AC_PKU_WD_BIT, 76 AC_PKU_PKEY_BIT, 77 78 AC_ACCESS_USER_BIT, 79 AC_ACCESS_WRITE_BIT, 80 AC_ACCESS_FETCH_BIT, 81 AC_ACCESS_TWICE_BIT, 82 83 AC_CPU_EFER_NX_BIT, 84 AC_CPU_CR0_WP_BIT, 85 AC_CPU_CR4_SMEP_BIT, 86 AC_CPU_CR4_PKE_BIT, 87 88 NR_AC_FLAGS 89 }; 90 91 #define AC_PTE_PRESENT_MASK (1 << AC_PTE_PRESENT_BIT) 92 #define AC_PTE_WRITABLE_MASK (1 << AC_PTE_WRITABLE_BIT) 93 #define AC_PTE_USER_MASK (1 << AC_PTE_USER_BIT) 94 #define AC_PTE_ACCESSED_MASK (1 << AC_PTE_ACCESSED_BIT) 95 #define AC_PTE_DIRTY_MASK (1 << AC_PTE_DIRTY_BIT) 96 #define AC_PTE_NX_MASK (1 << AC_PTE_NX_BIT) 97 #define AC_PTE_BIT51_MASK (1 << AC_PTE_BIT51_BIT) 98 #define AC_PTE_BIT36_MASK (1 << AC_PTE_BIT36_BIT) 99 100 #define AC_PDE_PRESENT_MASK (1 << AC_PDE_PRESENT_BIT) 101 #define AC_PDE_WRITABLE_MASK (1 << AC_PDE_WRITABLE_BIT) 102 #define AC_PDE_USER_MASK (1 << AC_PDE_USER_BIT) 103 #define AC_PDE_ACCESSED_MASK (1 << AC_PDE_ACCESSED_BIT) 104 #define AC_PDE_DIRTY_MASK (1 << AC_PDE_DIRTY_BIT) 105 #define AC_PDE_PSE_MASK (1 << AC_PDE_PSE_BIT) 106 #define AC_PDE_NX_MASK (1 << AC_PDE_NX_BIT) 107 #define AC_PDE_BIT51_MASK (1 << AC_PDE_BIT51_BIT) 108 #define AC_PDE_BIT36_MASK (1 << AC_PDE_BIT36_BIT) 109 #define AC_PDE_BIT13_MASK (1 << AC_PDE_BIT13_BIT) 110 111 #define AC_PDPTE_NO_WRITABLE_MASK (1 << AC_PDPTE_NO_WRITABLE_BIT) 112 113 #define AC_PKU_AD_MASK (1 << AC_PKU_AD_BIT) 114 #define AC_PKU_WD_MASK (1 << AC_PKU_WD_BIT) 115 #define AC_PKU_PKEY_MASK (1 << AC_PKU_PKEY_BIT) 116 117 #define AC_ACCESS_USER_MASK (1 << AC_ACCESS_USER_BIT) 118 #define AC_ACCESS_WRITE_MASK (1 << AC_ACCESS_WRITE_BIT) 119 #define AC_ACCESS_FETCH_MASK (1 << AC_ACCESS_FETCH_BIT) 120 #define AC_ACCESS_TWICE_MASK (1 << AC_ACCESS_TWICE_BIT) 121 122 #define AC_CPU_EFER_NX_MASK (1 << AC_CPU_EFER_NX_BIT) 123 #define AC_CPU_CR0_WP_MASK (1 << AC_CPU_CR0_WP_BIT) 124 #define AC_CPU_CR4_SMEP_MASK (1 << AC_CPU_CR4_SMEP_BIT) 125 #define AC_CPU_CR4_PKE_MASK (1 << AC_CPU_CR4_PKE_BIT) 126 127 const char *ac_names[] = { 128 [AC_PTE_PRESENT_BIT] = "pte.p", 129 [AC_PTE_ACCESSED_BIT] = "pte.a", 130 [AC_PTE_WRITABLE_BIT] = "pte.rw", 131 [AC_PTE_USER_BIT] = "pte.user", 132 [AC_PTE_DIRTY_BIT] = "pte.d", 133 [AC_PTE_NX_BIT] = "pte.nx", 134 [AC_PTE_BIT51_BIT] = "pte.51", 135 [AC_PTE_BIT36_BIT] = "pte.36", 136 [AC_PDE_PRESENT_BIT] = "pde.p", 137 [AC_PDE_ACCESSED_BIT] = "pde.a", 138 [AC_PDE_WRITABLE_BIT] = "pde.rw", 139 [AC_PDE_USER_BIT] = "pde.user", 140 [AC_PDE_DIRTY_BIT] = "pde.d", 141 [AC_PDE_PSE_BIT] = "pde.pse", 142 [AC_PDE_NX_BIT] = "pde.nx", 143 [AC_PDE_BIT51_BIT] = "pde.51", 144 [AC_PDE_BIT36_BIT] = "pde.36", 145 [AC_PDE_BIT13_BIT] = "pde.13", 146 [AC_PDPTE_NO_WRITABLE_BIT] = "pdpte.ro", 147 [AC_PKU_AD_BIT] = "pkru.ad", 148 [AC_PKU_WD_BIT] = "pkru.wd", 149 [AC_PKU_PKEY_BIT] = "pkey=1", 150 [AC_ACCESS_WRITE_BIT] = "write", 151 [AC_ACCESS_USER_BIT] = "user", 152 [AC_ACCESS_FETCH_BIT] = "fetch", 153 [AC_ACCESS_TWICE_BIT] = "twice", 154 [AC_CPU_EFER_NX_BIT] = "efer.nx", 155 [AC_CPU_CR0_WP_BIT] = "cr0.wp", 156 [AC_CPU_CR4_SMEP_BIT] = "cr4.smep", 157 [AC_CPU_CR4_PKE_BIT] = "cr4.pke", 158 }; 159 160 static inline void *va(pt_element_t phys) 161 { 162 return (void *)phys; 163 } 164 165 typedef struct { 166 pt_element_t pt_pool_pa; 167 unsigned int pt_pool_current; 168 int pt_levels; 169 } ac_pt_env_t; 170 171 typedef struct { 172 unsigned flags; 173 void *virt; 174 pt_element_t phys; 175 pt_element_t *ptep; 176 pt_element_t expected_pte; 177 pt_element_t *pdep; 178 pt_element_t expected_pde; 179 pt_element_t ignore_pde; 180 int expected_fault; 181 unsigned expected_error; 182 int pt_levels; 183 184 /* 5-level paging, 1-based to avoid math. */ 185 pt_element_t page_tables[6]; 186 } ac_test_t; 187 188 typedef struct { 189 unsigned short limit; 190 unsigned long linear_addr; 191 } __attribute__((packed)) descriptor_table_t; 192 193 194 static void ac_test_show(ac_test_t *at); 195 196 static unsigned long shadow_cr0; 197 static unsigned long shadow_cr3; 198 static unsigned long shadow_cr4; 199 static unsigned long long shadow_efer; 200 201 typedef void (*walk_fn)(pt_element_t *ptep, int level, unsigned long virt); 202 203 /* Returns the size of the range covered by the last processed entry. */ 204 static unsigned long walk_va(ac_test_t *at, int min_level, unsigned long virt, 205 walk_fn callback, bool leaf_only) 206 { 207 unsigned long parent_pte = shadow_cr3; 208 int i; 209 210 for (i = at->pt_levels; i >= min_level; --i) { 211 pt_element_t *parent_pt = va(parent_pte & PT_BASE_ADDR_MASK); 212 unsigned int index = PT_INDEX(virt, i); 213 pt_element_t *ptep = &parent_pt[index]; 214 215 assert(!leaf_only || (*ptep & PT_PRESENT_MASK)); 216 217 if (!leaf_only || i == 1 || (*ptep & PT_PAGE_SIZE_MASK)) 218 callback(ptep, i, virt); 219 220 if (i == 1 || *ptep & PT_PAGE_SIZE_MASK) 221 break; 222 223 parent_pte = *ptep; 224 } 225 226 return 1ul << PGDIR_BITS(i); 227 } 228 229 static void walk_ptes(ac_test_t *at, unsigned long virt, unsigned long end, 230 walk_fn callback) 231 { 232 unsigned long page_size; 233 234 for ( ; virt < end; virt = ALIGN_DOWN(virt + page_size, page_size)) 235 page_size = walk_va(at, 1, virt, callback, true); 236 } 237 238 static void set_cr0_wp(int wp) 239 { 240 unsigned long cr0 = shadow_cr0; 241 242 cr0 &= ~CR0_WP_MASK; 243 if (wp) 244 cr0 |= CR0_WP_MASK; 245 if (cr0 != shadow_cr0) { 246 write_cr0(cr0); 247 shadow_cr0 = cr0; 248 } 249 } 250 251 static void clear_user_mask(pt_element_t *ptep, int level, unsigned long virt) 252 { 253 *ptep &= ~PT_USER_MASK; 254 } 255 256 static void set_user_mask(pt_element_t *ptep, int level, unsigned long virt) 257 { 258 *ptep |= PT_USER_MASK; 259 260 /* Flush to avoid spurious #PF */ 261 invlpg((void*)virt); 262 } 263 264 static unsigned set_cr4_smep(ac_test_t *at, int smep) 265 { 266 extern char stext, etext; 267 unsigned long code_start = (unsigned long)&stext; 268 unsigned long code_end = (unsigned long)&etext; 269 unsigned long cr4 = shadow_cr4; 270 unsigned r; 271 272 cr4 &= ~CR4_SMEP_MASK; 273 if (smep) 274 cr4 |= CR4_SMEP_MASK; 275 if (cr4 == shadow_cr4) 276 return 0; 277 278 if (smep) 279 walk_ptes(at, code_start, code_end, clear_user_mask); 280 r = write_cr4_checking(cr4); 281 if (r || !smep) 282 walk_ptes(at, code_start, code_end, set_user_mask); 283 if (!r) 284 shadow_cr4 = cr4; 285 return r; 286 } 287 288 static void set_cr4_pke(int pke) 289 { 290 unsigned long cr4 = shadow_cr4; 291 292 cr4 &= ~X86_CR4_PKE; 293 if (pke) 294 cr4 |= X86_CR4_PKE; 295 if (cr4 == shadow_cr4) 296 return; 297 298 /* Check that protection keys do not affect accesses when CR4.PKE=0. */ 299 if ((shadow_cr4 & X86_CR4_PKE) && !pke) 300 write_pkru(0xfffffffc); 301 write_cr4(cr4); 302 shadow_cr4 = cr4; 303 } 304 305 static void set_efer_nx(int nx) 306 { 307 unsigned long long efer = shadow_efer; 308 309 efer &= ~EFER_NX_MASK; 310 if (nx) 311 efer |= EFER_NX_MASK; 312 if (efer != shadow_efer) { 313 wrmsr(MSR_EFER, efer); 314 shadow_efer = efer; 315 } 316 } 317 318 static void ac_env_int(ac_pt_env_t *pt_env, int page_table_levels) 319 { 320 extern char page_fault, kernel_entry; 321 set_idt_entry(14, &page_fault, 0); 322 set_idt_entry(0x20, &kernel_entry, 3); 323 324 pt_env->pt_pool_pa = AT_PAGING_STRUCTURES_PHYS; 325 pt_env->pt_pool_current = 0; 326 pt_env->pt_levels = page_table_levels; 327 } 328 329 static pt_element_t ac_test_alloc_pt(ac_pt_env_t *pt_env) 330 { 331 pt_element_t pt; 332 333 /* 334 * Each test needs at most pt_levels-1 structures per virtual address, 335 * and no existing scenario uses more than four addresses. 336 */ 337 assert(pt_env->pt_pool_current < (4 * (pt_env->pt_levels - 1))); 338 339 pt = pt_env->pt_pool_pa + (pt_env->pt_pool_current * PAGE_SIZE); 340 pt_env->pt_pool_current++; 341 memset(va(pt), 0, PAGE_SIZE); 342 return pt; 343 } 344 345 static void __ac_test_init(ac_test_t *at, unsigned long virt, 346 ac_pt_env_t *pt_env, ac_test_t *buddy) 347 { 348 unsigned long buddy_virt = buddy ? (unsigned long)buddy->virt : 0; 349 pt_element_t *root_pt = va(shadow_cr3 & PT_BASE_ADDR_MASK); 350 int i; 351 352 /* 353 * The test infrastructure, e.g. this function, must use a different 354 * top-level SPTE than the test, otherwise modifying SPTEs can affect 355 * normal behavior, e.g. crash the test due to marking code SPTEs 356 * USER when CR4.SMEP=1. 357 */ 358 assert(PT_INDEX(virt, pt_env->pt_levels) != 359 PT_INDEX((unsigned long)__ac_test_init, pt_env->pt_levels)); 360 361 set_efer_nx(1); 362 set_cr0_wp(1); 363 at->flags = 0; 364 at->virt = (void *)virt; 365 at->phys = AT_CODE_DATA_PHYS; 366 at->pt_levels = pt_env->pt_levels; 367 368 at->page_tables[0] = -1ull; 369 at->page_tables[1] = -1ull; 370 371 /* 372 * Zap the existing top-level PTE as it may be reused from a previous 373 * sub-test. This allows runtime PTE modification to assert that two 374 * overlapping walks don't try to install different paging structures. 375 */ 376 root_pt[PT_INDEX(virt, pt_env->pt_levels)] = 0; 377 378 for (i = at->pt_levels; i > 1; i--) { 379 /* 380 * Buddies can reuse any part of the walk that share the same 381 * index. This is weird, but intentional, as several tests 382 * want different walks to merge at lower levels. 383 */ 384 if (buddy && PT_INDEX(virt, i) == PT_INDEX(buddy_virt, i)) 385 at->page_tables[i] = buddy->page_tables[i]; 386 else 387 at->page_tables[i] = ac_test_alloc_pt(pt_env); 388 } 389 } 390 391 static void ac_test_init(ac_test_t *at, unsigned long virt, ac_pt_env_t *pt_env) 392 { 393 __ac_test_init(at, virt, pt_env, NULL); 394 } 395 396 static int ac_test_bump_one(ac_test_t *at) 397 { 398 at->flags = ((at->flags | invalid_mask) + 1) & ~invalid_mask; 399 return at->flags < (1 << NR_AC_FLAGS); 400 } 401 402 #define F(x) ((flags & x##_MASK) != 0) 403 404 static _Bool ac_test_legal(ac_test_t *at) 405 { 406 int flags = at->flags; 407 unsigned reserved; 408 409 if (F(AC_CPU_CR4_SMEP)) 410 return false; 411 412 if (F(AC_ACCESS_FETCH) && F(AC_ACCESS_WRITE)) 413 return false; 414 415 /* 416 * Since we convert current page to kernel page when cr4.smep=1, 417 * we can't switch to user mode. 418 */ 419 if (F(AC_ACCESS_USER) && F(AC_CPU_CR4_SMEP)) 420 return false; 421 422 /* 423 * Only test protection key faults if CR4.PKE=1. 424 */ 425 if (!F(AC_CPU_CR4_PKE) && 426 (F(AC_PKU_AD) || F(AC_PKU_WD))) { 427 return false; 428 } 429 430 /* 431 * pde.bit13 checks handling of reserved bits in largepage PDEs. It is 432 * meaningless if there is a PTE. 433 */ 434 if (!F(AC_PDE_PSE) && F(AC_PDE_BIT13)) 435 return false; 436 437 /* 438 * Shorten the test by avoiding testing too many reserved bit combinations. 439 * Skip testing multiple reserved bits to shorten the test. Reserved bit 440 * page faults are terminal and multiple reserved bits do not affect the 441 * error code; the odds of a KVM bug are super low, and the odds of actually 442 * being able to detect a bug are even lower. 443 */ 444 reserved = (AC_PDE_BIT51_MASK | AC_PDE_BIT36_MASK | AC_PDE_BIT13_MASK | 445 AC_PTE_BIT51_MASK | AC_PTE_BIT36_MASK); 446 if (!F(AC_CPU_EFER_NX)) 447 reserved |= AC_PDE_NX_MASK | AC_PTE_NX_MASK; 448 449 /* Only test one reserved bit at a time. */ 450 reserved &= flags; 451 if (reserved & (reserved - 1)) 452 return false; 453 454 return true; 455 } 456 457 static int ac_test_bump(ac_test_t *at) 458 { 459 int ret; 460 461 do { 462 ret = ac_test_bump_one(at); 463 } while (ret && !ac_test_legal(at)); 464 465 return ret; 466 } 467 468 static pt_element_t ac_test_permissions(ac_test_t *at, unsigned flags, 469 bool writable, bool user, 470 bool executable) 471 { 472 bool kwritable = !F(AC_CPU_CR0_WP) && !F(AC_ACCESS_USER); 473 pt_element_t expected = 0; 474 475 if (F(AC_ACCESS_USER) && !user) 476 at->expected_fault = 1; 477 478 if (F(AC_ACCESS_WRITE) && !writable && !kwritable) 479 at->expected_fault = 1; 480 481 if (F(AC_ACCESS_FETCH) && !executable) 482 at->expected_fault = 1; 483 484 if (F(AC_ACCESS_FETCH) && user && F(AC_CPU_CR4_SMEP)) 485 at->expected_fault = 1; 486 487 if (user && !F(AC_ACCESS_FETCH) && F(AC_PKU_PKEY) && F(AC_CPU_CR4_PKE)) { 488 if (F(AC_PKU_AD)) { 489 at->expected_fault = 1; 490 at->expected_error |= PFERR_PK_MASK; 491 } else if (F(AC_ACCESS_WRITE) && F(AC_PKU_WD) && !kwritable) { 492 at->expected_fault = 1; 493 at->expected_error |= PFERR_PK_MASK; 494 } 495 } 496 497 if (!at->expected_fault) { 498 expected |= PT_ACCESSED_MASK; 499 if (F(AC_ACCESS_WRITE)) 500 expected |= PT_DIRTY_MASK; 501 } 502 503 return expected; 504 } 505 506 static void ac_emulate_access(ac_test_t *at, unsigned flags) 507 { 508 bool pde_valid, pte_valid; 509 bool user, writable, executable; 510 511 if (F(AC_ACCESS_USER)) 512 at->expected_error |= PFERR_USER_MASK; 513 514 if (F(AC_ACCESS_WRITE)) 515 at->expected_error |= PFERR_WRITE_MASK; 516 517 if (F(AC_ACCESS_FETCH)) 518 at->expected_error |= PFERR_FETCH_MASK; 519 520 if (!F(AC_PDE_ACCESSED)) 521 at->ignore_pde = PT_ACCESSED_MASK; 522 523 pde_valid = F(AC_PDE_PRESENT) 524 && !F(AC_PDE_BIT51) && !F(AC_PDE_BIT36) && !F(AC_PDE_BIT13) 525 && !(F(AC_PDE_NX) && !F(AC_CPU_EFER_NX)); 526 527 if (!pde_valid) { 528 at->expected_fault = 1; 529 if (F(AC_PDE_PRESENT)) { 530 at->expected_error |= PFERR_RESERVED_MASK; 531 } else { 532 at->expected_error &= ~PFERR_PRESENT_MASK; 533 } 534 goto fault; 535 } 536 537 writable = !F(AC_PDPTE_NO_WRITABLE) && F(AC_PDE_WRITABLE); 538 user = F(AC_PDE_USER); 539 executable = !F(AC_PDE_NX); 540 541 if (F(AC_PDE_PSE)) { 542 at->expected_pde |= ac_test_permissions(at, flags, writable, 543 user, executable); 544 goto no_pte; 545 } 546 547 at->expected_pde |= PT_ACCESSED_MASK; 548 549 pte_valid = F(AC_PTE_PRESENT) 550 && !F(AC_PTE_BIT51) && !F(AC_PTE_BIT36) 551 && !(F(AC_PTE_NX) && !F(AC_CPU_EFER_NX)); 552 553 if (!pte_valid) { 554 at->expected_fault = 1; 555 if (F(AC_PTE_PRESENT)) { 556 at->expected_error |= PFERR_RESERVED_MASK; 557 } else { 558 at->expected_error &= ~PFERR_PRESENT_MASK; 559 } 560 goto fault; 561 } 562 563 writable &= F(AC_PTE_WRITABLE); 564 user &= F(AC_PTE_USER); 565 executable &= !F(AC_PTE_NX); 566 567 at->expected_pte |= ac_test_permissions(at, flags, writable, user, 568 executable); 569 570 no_pte: 571 fault: 572 if (!at->expected_fault) 573 at->ignore_pde = 0; 574 if (!F(AC_CPU_EFER_NX) && !F(AC_CPU_CR4_SMEP)) 575 at->expected_error &= ~PFERR_FETCH_MASK; 576 } 577 578 static void ac_set_expected_status(ac_test_t *at) 579 { 580 invlpg(at->virt); 581 582 if (at->ptep) 583 at->expected_pte = *at->ptep; 584 at->expected_pde = *at->pdep; 585 at->ignore_pde = 0; 586 at->expected_fault = 0; 587 at->expected_error = PFERR_PRESENT_MASK; 588 589 if (at->flags & AC_ACCESS_TWICE_MASK) { 590 ac_emulate_access(at, at->flags & 591 ~AC_ACCESS_WRITE_MASK & 592 ~AC_ACCESS_FETCH_MASK & 593 ~AC_ACCESS_USER_MASK); 594 at->expected_fault = 0; 595 at->expected_error = PFERR_PRESENT_MASK; 596 at->ignore_pde = 0; 597 } 598 599 ac_emulate_access(at, at->flags); 600 } 601 602 static pt_element_t ac_get_pt(ac_test_t *at, int i, pt_element_t *ptep) 603 { 604 pt_element_t pte; 605 606 pte = *ptep; 607 if (pte && !(pte & PT_PAGE_SIZE_MASK) && 608 (pte & PT_BASE_ADDR_MASK) != at->page_tables[i]) { 609 printf("\nPT collision. VA = 0x%lx, level = %d, index = %ld, found PT = 0x%lx, want PT = 0x%lx\n", 610 (unsigned long)at->virt, i, 611 PT_INDEX((unsigned long)at->virt, i), 612 pte, at->page_tables[i]); 613 abort(); 614 } 615 616 /* 617 * Preserve A/D bits to avoid writing upper level PTEs, 618 * which cannot be unsyc'd when KVM uses shadow paging. 619 */ 620 pte = at->page_tables[i] | (pte & (PT_DIRTY_MASK | PT_ACCESSED_MASK)); 621 return pte; 622 } 623 624 static void ac_test_setup_ptes(ac_test_t *at) 625 { 626 unsigned long parent_pte = shadow_cr3; 627 int flags = at->flags; 628 int i; 629 630 at->ptep = 0; 631 for (i = at->pt_levels; i >= 1 && (i >= 2 || !F(AC_PDE_PSE)); --i) { 632 pt_element_t *parent_pt = va(parent_pte & PT_BASE_ADDR_MASK); 633 unsigned index = PT_INDEX((unsigned long)at->virt, i); 634 pt_element_t *ptep = &parent_pt[index]; 635 pt_element_t pte; 636 637 switch (i) { 638 case 5: 639 case 4: 640 pte = ac_get_pt(at, i, ptep); 641 pte |= PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 642 break; 643 case 3: 644 pte = ac_get_pt(at, i, ptep); 645 pte |= PT_PRESENT_MASK | PT_USER_MASK; 646 if (!F(AC_PDPTE_NO_WRITABLE)) 647 pte |= PT_WRITABLE_MASK; 648 break; 649 case 2: 650 if (!F(AC_PDE_PSE)) { 651 pte = ac_get_pt(at, i, ptep); 652 653 /* The protection key is ignored on non-leaf entries. */ 654 if (F(AC_PKU_PKEY)) 655 pte |= 2ull << 59; 656 } else { 657 pte = at->phys & PT_PSE_BASE_ADDR_MASK; 658 pte |= PT_PAGE_SIZE_MASK; 659 if (F(AC_PKU_PKEY)) 660 pte |= 1ull << 59; 661 } 662 if (F(AC_PDE_PRESENT)) 663 pte |= PT_PRESENT_MASK; 664 if (F(AC_PDE_WRITABLE)) 665 pte |= PT_WRITABLE_MASK; 666 if (F(AC_PDE_USER)) 667 pte |= PT_USER_MASK; 668 if (F(AC_PDE_ACCESSED)) 669 pte |= PT_ACCESSED_MASK; 670 if (F(AC_PDE_DIRTY)) 671 pte |= PT_DIRTY_MASK; 672 if (F(AC_PDE_NX)) 673 pte |= PT64_NX_MASK; 674 if (F(AC_PDE_BIT51)) 675 pte |= 1ull << 51; 676 if (F(AC_PDE_BIT36)) 677 pte |= 1ull << 36; 678 if (F(AC_PDE_BIT13)) 679 pte |= 1ull << 13; 680 at->pdep = ptep; 681 break; 682 case 1: 683 pte = at->phys & PT_BASE_ADDR_MASK; 684 if (F(AC_PKU_PKEY)) 685 pte |= 1ull << 59; 686 if (F(AC_PTE_PRESENT)) 687 pte |= PT_PRESENT_MASK; 688 if (F(AC_PTE_WRITABLE)) 689 pte |= PT_WRITABLE_MASK; 690 if (F(AC_PTE_USER)) 691 pte |= PT_USER_MASK; 692 if (F(AC_PTE_ACCESSED)) 693 pte |= PT_ACCESSED_MASK; 694 if (F(AC_PTE_DIRTY)) 695 pte |= PT_DIRTY_MASK; 696 if (F(AC_PTE_NX)) 697 pte |= PT64_NX_MASK; 698 if (F(AC_PTE_BIT51)) 699 pte |= 1ull << 51; 700 if (F(AC_PTE_BIT36)) 701 pte |= 1ull << 36; 702 at->ptep = ptep; 703 break; 704 default: 705 assert(0); 706 } 707 708 if (pte != *ptep) 709 *ptep = pte; 710 711 parent_pte = pte; 712 } 713 ac_set_expected_status(at); 714 } 715 716 static void __dump_pte(pt_element_t *ptep, int level, unsigned long virt) 717 { 718 printf("------L%d I%lu: %lx\n", level, PT_INDEX(virt, level), *ptep); 719 } 720 721 static void dump_mapping(ac_test_t *at) 722 { 723 unsigned long virt = (unsigned long)at->virt; 724 int flags = at->flags; 725 726 printf("Dump mapping: address: %p\n", at->virt); 727 walk_va(at, F(AC_PDE_PSE) ? 2 : 1, virt, __dump_pte, false); 728 } 729 730 static void ac_test_check(ac_test_t *at, _Bool *success_ret, _Bool cond, 731 const char *fmt, ...) 732 { 733 va_list ap; 734 char buf[500]; 735 736 if (!*success_ret) { 737 return; 738 } 739 740 if (!cond) { 741 return; 742 } 743 744 *success_ret = false; 745 746 if (!verbose) { 747 puts("\n"); 748 ac_test_show(at); 749 } 750 751 va_start(ap, fmt); 752 vsnprintf(buf, sizeof(buf), fmt, ap); 753 va_end(ap); 754 printf("FAIL: %s\n", buf); 755 dump_mapping(at); 756 } 757 758 static int pt_match(pt_element_t pte1, pt_element_t pte2, pt_element_t ignore) 759 { 760 pte1 &= ~ignore; 761 pte2 &= ~ignore; 762 return pte1 == pte2; 763 } 764 765 static int ac_test_do_access(ac_test_t *at) 766 { 767 static unsigned unique = 42; 768 int fault = 0; 769 unsigned e; 770 static unsigned char user_stack[4096]; 771 unsigned long rsp; 772 _Bool success = true; 773 int flags = at->flags; 774 775 ++unique; 776 if (!(unique & 65535)) { 777 puts("."); 778 } 779 780 *((unsigned char *)at->phys) = 0xc3; /* ret */ 781 782 unsigned r = unique; 783 set_cr0_wp(F(AC_CPU_CR0_WP)); 784 set_efer_nx(F(AC_CPU_EFER_NX)); 785 set_cr4_pke(F(AC_CPU_CR4_PKE)); 786 if (F(AC_CPU_CR4_PKE)) { 787 /* WD2=AD2=1, WD1=F(AC_PKU_WD), AD1=F(AC_PKU_AD) */ 788 write_pkru(0x30 | (F(AC_PKU_WD) ? 8 : 0) | 789 (F(AC_PKU_AD) ? 4 : 0)); 790 } 791 792 set_cr4_smep(at, F(AC_CPU_CR4_SMEP)); 793 794 if (F(AC_ACCESS_TWICE)) { 795 asm volatile ("mov $fixed2, %%rsi \n\t" 796 "mov (%[addr]), %[reg] \n\t" 797 "fixed2:" 798 : [reg]"=r"(r), [fault]"=a"(fault), "=b"(e) 799 : [addr]"r"(at->virt) 800 : "rsi"); 801 fault = 0; 802 } 803 804 asm volatile ("mov $fixed1, %%rsi \n\t" 805 "mov %%rsp, %[rsp0] \n\t" 806 "cmp $0, %[user] \n\t" 807 "jz do_access \n\t" 808 "push %%rax; mov %[user_ds], %%ax; mov %%ax, %%ds; pop %%rax \n\t" 809 "pushq %[user_ds] \n\t" 810 "pushq %[user_stack_top] \n\t" 811 "pushfq \n\t" 812 "pushq %[user_cs] \n\t" 813 "pushq $do_access \n\t" 814 "iretq \n" 815 "do_access: \n\t" 816 "cmp $0, %[fetch] \n\t" 817 "jnz 2f \n\t" 818 "cmp $0, %[write] \n\t" 819 "jnz 1f \n\t" 820 "mov (%[addr]), %[reg] \n\t" 821 "jmp done \n\t" 822 "1: mov %[reg], (%[addr]) \n\t" 823 "jmp done \n\t" 824 "2: call *%[addr] \n\t" 825 "done: \n" 826 "fixed1: \n" 827 "int %[kernel_entry_vector] \n\t" 828 ".section .text.entry \n\t" 829 "kernel_entry: \n\t" 830 "mov %[rsp0], %%rsp \n\t" 831 "jmp back_to_kernel \n\t" 832 ".section .text \n\t" 833 "back_to_kernel:" 834 : [reg]"+r"(r), "+a"(fault), "=b"(e), "=&d"(rsp), 835 [rsp0]"=m"(tss[0].rsp0) 836 : [addr]"r"(at->virt), 837 [write]"r"(F(AC_ACCESS_WRITE)), 838 [user]"r"(F(AC_ACCESS_USER)), 839 [fetch]"r"(F(AC_ACCESS_FETCH)), 840 [user_ds]"i"(USER_DS), 841 [user_cs]"i"(USER_CS), 842 [user_stack_top]"r"(user_stack + sizeof user_stack), 843 [kernel_entry_vector]"i"(0x20) 844 : "rsi"); 845 846 asm volatile (".section .text.pf \n\t" 847 "page_fault: \n\t" 848 "pop %rbx \n\t" 849 "mov %rsi, (%rsp) \n\t" 850 "movl $1, %eax \n\t" 851 "iretq \n\t" 852 ".section .text"); 853 854 ac_test_check(at, &success, fault && !at->expected_fault, 855 "unexpected fault"); 856 ac_test_check(at, &success, !fault && at->expected_fault, 857 "unexpected access"); 858 ac_test_check(at, &success, fault && e != at->expected_error, 859 "error code %x expected %x", e, at->expected_error); 860 if (at->ptep) 861 ac_test_check(at, &success, *at->ptep != at->expected_pte, 862 "pte %x expected %x", *at->ptep, at->expected_pte); 863 ac_test_check(at, &success, 864 !pt_match(*at->pdep, at->expected_pde, at->ignore_pde), 865 "pde %x expected %x", *at->pdep, at->expected_pde); 866 867 if (success && verbose) { 868 if (at->expected_fault) { 869 printf("PASS (%x)\n", at->expected_error); 870 } else { 871 printf("PASS\n"); 872 } 873 } 874 return success; 875 } 876 877 static void ac_test_show(ac_test_t *at) 878 { 879 char line[5000]; 880 881 *line = 0; 882 strcat(line, "test"); 883 for (int i = 0; i < NR_AC_FLAGS; ++i) 884 if (at->flags & (1 << i)) { 885 strcat(line, " "); 886 strcat(line, ac_names[i]); 887 } 888 889 strcat(line, ": "); 890 printf("%s", line); 891 } 892 893 /* 894 * This test case is used to triger the bug which is fixed by 895 * commit e09e90a5 in the kvm tree 896 */ 897 static int corrupt_hugepage_triger(ac_pt_env_t *pt_env) 898 { 899 ac_test_t at1, at2; 900 901 ac_test_init(&at1, 0xffff923400000000ul, pt_env); 902 __ac_test_init(&at2, 0xffffe66600000000ul, pt_env, &at1); 903 904 at2.flags = AC_CPU_CR0_WP_MASK | AC_PDE_PSE_MASK | AC_PDE_PRESENT_MASK; 905 ac_test_setup_ptes(&at2); 906 if (!ac_test_do_access(&at2)) 907 goto err; 908 909 at1.flags = at2.flags | AC_PDE_WRITABLE_MASK; 910 ac_test_setup_ptes(&at1); 911 if (!ac_test_do_access(&at1)) 912 goto err; 913 914 at1.flags |= AC_ACCESS_WRITE_MASK; 915 ac_set_expected_status(&at1); 916 if (!ac_test_do_access(&at1)) 917 goto err; 918 919 at2.flags |= AC_ACCESS_WRITE_MASK; 920 ac_set_expected_status(&at2); 921 if (!ac_test_do_access(&at2)) 922 goto err; 923 924 return 1; 925 926 err: 927 printf("corrupt_hugepage_triger test fail\n"); 928 return 0; 929 } 930 931 /* 932 * This test case is used to triger the bug which is fixed by 933 * commit 3ddf6c06e13e in the kvm tree 934 */ 935 static int check_pfec_on_prefetch_pte(ac_pt_env_t *pt_env) 936 { 937 ac_test_t at1, at2; 938 939 ac_test_init(&at1, 0xffff923406001000ul, pt_env); 940 __ac_test_init(&at2, 0xffff923406003000ul, pt_env, &at1); 941 942 at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK; 943 ac_test_setup_ptes(&at1); 944 945 at2.flags = at1.flags | AC_PTE_NX_MASK; 946 ac_test_setup_ptes(&at2); 947 948 if (!ac_test_do_access(&at1)) { 949 printf("%s: prepare fail\n", __FUNCTION__); 950 goto err; 951 } 952 953 if (!ac_test_do_access(&at2)) { 954 printf("%s: check PFEC on prefetch pte path fail\n", 955 __FUNCTION__); 956 goto err; 957 } 958 959 return 1; 960 961 err: 962 return 0; 963 } 964 965 /* 966 * If the write-fault access is from supervisor and CR0.WP is not set on the 967 * vcpu, kvm will fix it by adjusting pte access - it sets the W bit on pte 968 * and clears U bit. This is the chance that kvm can change pte access from 969 * readonly to writable. 970 * 971 * Unfortunately, the pte access is the access of 'direct' shadow page table, 972 * means direct sp.role.access = pte_access, then we will create a writable 973 * spte entry on the readonly shadow page table. It will cause Dirty bit is 974 * not tracked when two guest ptes point to the same large page. Note, it 975 * does not have other impact except Dirty bit since cr0.wp is encoded into 976 * sp.role. 977 * 978 * Note: to trigger this bug, hugepage should be disabled on host. 979 */ 980 static int check_large_pte_dirty_for_nowp(ac_pt_env_t *pt_env) 981 { 982 ac_test_t at1, at2; 983 984 ac_test_init(&at1, 0xffff923403000000ul, pt_env); 985 __ac_test_init(&at2, 0xffffe66606000000ul, pt_env, &at1); 986 987 at2.flags = AC_PDE_PRESENT_MASK | AC_PDE_PSE_MASK; 988 ac_test_setup_ptes(&at2); 989 if (!ac_test_do_access(&at2)) { 990 printf("%s: read on the first mapping fail.\n", __FUNCTION__); 991 goto err; 992 } 993 994 at1.flags = at2.flags | AC_ACCESS_WRITE_MASK; 995 ac_test_setup_ptes(&at1); 996 if (!ac_test_do_access(&at1)) { 997 printf("%s: write on the second mapping fail.\n", __FUNCTION__); 998 goto err; 999 } 1000 1001 at2.flags |= AC_ACCESS_WRITE_MASK; 1002 ac_set_expected_status(&at2); 1003 if (!ac_test_do_access(&at2)) { 1004 printf("%s: write on the first mapping fail.\n", __FUNCTION__); 1005 goto err; 1006 } 1007 1008 return 1; 1009 1010 err: 1011 return 0; 1012 } 1013 1014 static int check_smep_andnot_wp(ac_pt_env_t *pt_env) 1015 { 1016 ac_test_t at1; 1017 int err_prepare_andnot_wp, err_smep_andnot_wp; 1018 1019 if (!this_cpu_has(X86_FEATURE_SMEP)) { 1020 return 1; 1021 } 1022 1023 ac_test_init(&at1, 0xffff923406001000ul, pt_env); 1024 1025 at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK | 1026 AC_PDE_USER_MASK | AC_PTE_USER_MASK | 1027 AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK | 1028 AC_CPU_CR4_SMEP_MASK | 1029 AC_CPU_CR0_WP_MASK | 1030 AC_ACCESS_WRITE_MASK; 1031 ac_test_setup_ptes(&at1); 1032 1033 /* 1034 * Here we write the ro user page when 1035 * cr0.wp=0, then we execute it and SMEP 1036 * fault should happen. 1037 */ 1038 err_prepare_andnot_wp = ac_test_do_access(&at1); 1039 if (!err_prepare_andnot_wp) { 1040 printf("%s: SMEP prepare fail\n", __FUNCTION__); 1041 goto clean_up; 1042 } 1043 1044 at1.flags &= ~AC_ACCESS_WRITE_MASK; 1045 at1.flags |= AC_ACCESS_FETCH_MASK; 1046 ac_set_expected_status(&at1); 1047 err_smep_andnot_wp = ac_test_do_access(&at1); 1048 1049 clean_up: 1050 set_cr4_smep(&at1, 0); 1051 1052 if (!err_prepare_andnot_wp) 1053 goto err; 1054 if (!err_smep_andnot_wp) { 1055 printf("%s: check SMEP without wp fail\n", __FUNCTION__); 1056 goto err; 1057 } 1058 return 1; 1059 1060 err: 1061 return 0; 1062 } 1063 1064 static int check_effective_sp_permissions(ac_pt_env_t *pt_env) 1065 { 1066 unsigned long ptr1 = 0xffff923480000000; 1067 unsigned long ptr2 = ptr1 + SZ_2M; 1068 unsigned long ptr3 = ptr1 + SZ_1G; 1069 unsigned long ptr4 = ptr3 + SZ_2M; 1070 ac_test_t at1, at2, at3, at4; 1071 int err_read_at1, err_write_at2; 1072 int err_read_at3, err_write_at4; 1073 1074 /* 1075 * pgd[] pud[] pmd[] virtual address pointers 1076 * /->pmd(u--)->pte1(uw-)->page1 <- ptr1 (u--) 1077 * /->pud1(uw-)--->pmd(uw-)->pte2(uw-)->page2 <- ptr2 (uw-) 1078 * pgd-| 1079 * \->pud2(u--)--->pmd(u--)->pte1(uw-)->page1 <- ptr3 (u--) 1080 * \->pmd(uw-)->pte2(uw-)->page2 <- ptr4 (u--) 1081 * pud1 and pud2 point to the same pmd page. 1082 */ 1083 1084 ac_test_init(&at1, ptr1, pt_env); 1085 at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK | 1086 AC_PDE_USER_MASK | AC_PTE_USER_MASK | 1087 AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK | 1088 AC_PTE_WRITABLE_MASK | AC_ACCESS_USER_MASK; 1089 ac_test_setup_ptes(&at1); 1090 1091 __ac_test_init(&at2, ptr2, pt_env, &at1); 1092 at2.flags = at1.flags | AC_PDE_WRITABLE_MASK | AC_PTE_DIRTY_MASK | AC_ACCESS_WRITE_MASK; 1093 ac_test_setup_ptes(&at2); 1094 1095 __ac_test_init(&at3, ptr3, pt_env, &at1); 1096 /* Override the PMD (1-based index) to point at ptr1's PMD. */ 1097 at3.page_tables[3] = at1.page_tables[3]; 1098 at3.flags = AC_PDPTE_NO_WRITABLE_MASK | at1.flags; 1099 ac_test_setup_ptes(&at3); 1100 1101 /* Alias ptr2, only the PMD will differ; manually override the PMD. */ 1102 __ac_test_init(&at4, ptr4, pt_env, &at2); 1103 at4.page_tables[3] = at1.page_tables[3]; 1104 at4.flags = AC_PDPTE_NO_WRITABLE_MASK | at2.flags; 1105 ac_test_setup_ptes(&at4); 1106 1107 err_read_at1 = ac_test_do_access(&at1); 1108 if (!err_read_at1) { 1109 printf("%s: read access at1 fail\n", __FUNCTION__); 1110 return 0; 1111 } 1112 1113 err_write_at2 = ac_test_do_access(&at2); 1114 if (!err_write_at2) { 1115 printf("%s: write access at2 fail\n", __FUNCTION__); 1116 return 0; 1117 } 1118 1119 err_read_at3 = ac_test_do_access(&at3); 1120 if (!err_read_at3) { 1121 printf("%s: read access at3 fail\n", __FUNCTION__); 1122 return 0; 1123 } 1124 1125 err_write_at4 = ac_test_do_access(&at4); 1126 if (!err_write_at4) { 1127 printf("%s: write access at4 should fail\n", __FUNCTION__); 1128 return 0; 1129 } 1130 1131 return 1; 1132 } 1133 1134 static int ac_test_exec(ac_test_t *at, ac_pt_env_t *pt_env) 1135 { 1136 int r; 1137 1138 if (verbose) { 1139 ac_test_show(at); 1140 } 1141 ac_test_setup_ptes(at); 1142 r = ac_test_do_access(at); 1143 return r; 1144 } 1145 1146 typedef int (*ac_test_fn)(ac_pt_env_t *pt_env); 1147 const ac_test_fn ac_test_cases[] = 1148 { 1149 corrupt_hugepage_triger, 1150 check_pfec_on_prefetch_pte, 1151 check_large_pte_dirty_for_nowp, 1152 check_smep_andnot_wp, 1153 check_effective_sp_permissions, 1154 }; 1155 1156 int ac_test_run(int pt_levels) 1157 { 1158 ac_test_t at; 1159 ac_pt_env_t pt_env; 1160 int i, tests, successes; 1161 1162 printf("run\n"); 1163 tests = successes = 0; 1164 1165 shadow_cr0 = read_cr0(); 1166 shadow_cr4 = read_cr4(); 1167 shadow_cr3 = read_cr3(); 1168 shadow_efer = rdmsr(MSR_EFER); 1169 1170 if (cpuid_maxphyaddr() >= 52) { 1171 invalid_mask |= AC_PDE_BIT51_MASK; 1172 invalid_mask |= AC_PTE_BIT51_MASK; 1173 } 1174 if (cpuid_maxphyaddr() >= 37) { 1175 invalid_mask |= AC_PDE_BIT36_MASK; 1176 invalid_mask |= AC_PTE_BIT36_MASK; 1177 } 1178 1179 ac_env_int(&pt_env, pt_levels); 1180 ac_test_init(&at, 0xffff923400000000ul, &pt_env); 1181 1182 if (this_cpu_has(X86_FEATURE_PKU)) { 1183 set_cr4_pke(1); 1184 set_cr4_pke(0); 1185 /* Now PKRU = 0xFFFFFFFF. */ 1186 } else { 1187 tests++; 1188 if (write_cr4_checking(shadow_cr4 | X86_CR4_PKE) == GP_VECTOR) { 1189 successes++; 1190 invalid_mask |= AC_PKU_AD_MASK; 1191 invalid_mask |= AC_PKU_WD_MASK; 1192 invalid_mask |= AC_PKU_PKEY_MASK; 1193 invalid_mask |= AC_CPU_CR4_PKE_MASK; 1194 printf("CR4.PKE not available, disabling PKE tests\n"); 1195 } else { 1196 printf("Set PKE in CR4 - expect #GP: FAIL!\n"); 1197 set_cr4_pke(0); 1198 } 1199 } 1200 1201 if (!this_cpu_has(X86_FEATURE_SMEP)) { 1202 tests++; 1203 if (set_cr4_smep(&at, 1) == GP_VECTOR) { 1204 successes++; 1205 invalid_mask |= AC_CPU_CR4_SMEP_MASK; 1206 printf("CR4.SMEP not available, disabling SMEP tests\n"); 1207 } else { 1208 printf("Set SMEP in CR4 - expect #GP: FAIL!\n"); 1209 set_cr4_smep(&at, 0); 1210 } 1211 } 1212 1213 /* Toggling LA57 in 64-bit mode (guaranteed for this test) is illegal. */ 1214 if (this_cpu_has(X86_FEATURE_LA57)) { 1215 tests++; 1216 if (write_cr4_checking(shadow_cr4 ^ X86_CR4_LA57) == GP_VECTOR) 1217 successes++; 1218 1219 /* Force a VM-Exit on KVM, which doesn't intercept LA57 itself. */ 1220 tests++; 1221 if (write_cr4_checking(shadow_cr4 ^ (X86_CR4_LA57 | X86_CR4_PSE)) == GP_VECTOR) 1222 successes++; 1223 } 1224 1225 do { 1226 ++tests; 1227 successes += ac_test_exec(&at, &pt_env); 1228 } while (ac_test_bump(&at)); 1229 1230 for (i = 0; i < ARRAY_SIZE(ac_test_cases); i++) { 1231 ac_env_int(&pt_env, pt_levels); 1232 1233 ++tests; 1234 successes += ac_test_cases[i](&pt_env); 1235 } 1236 1237 printf("\n%d tests, %d failures\n", tests, tests - successes); 1238 1239 return successes == tests; 1240 } 1241