/*
 * x86 exception helpers - system code
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/cpu_ldst.h"
#include "exec/cputlb.h"
#include "exec/page-protection.h"
#include "exec/tlb-flags.h"
#include "exec/tswap.h"
#include "tcg/helper-tcg.h"

typedef struct TranslateParams {
    target_ulong addr;
    target_ulong cr3;
    int pg_mode;
    int mmu_idx;
    int ptw_idx;
    MMUAccessType access_type;
} TranslateParams;

typedef struct TranslateResult {
    hwaddr paddr;
    int prot;
    int page_size;
} TranslateResult;

typedef enum TranslateFaultStage2 {
    S2_NONE,
    S2_GPA,
    S2_GPT,
} TranslateFaultStage2;

typedef struct TranslateFault {
    int exception_index;
    int error_code;
    target_ulong cr2;
    TranslateFaultStage2 stage2;
} TranslateFault;

typedef struct PTETranslate {
    CPUX86State *env;
    TranslateFault *err;
    int ptw_idx;
    void *haddr;
    hwaddr gaddr;
} PTETranslate;

static bool ptw_translate(PTETranslate *inout, hwaddr addr)
{
    int flags;

    inout->gaddr = addr;
    flags = probe_access_full_mmu(inout->env, addr, 0, MMU_DATA_STORE,
                                  inout->ptw_idx, &inout->haddr, NULL);

    if (unlikely(flags & TLB_INVALID_MASK)) {
        TranslateFault *err = inout->err;

        assert(inout->ptw_idx == MMU_NESTED_IDX);
        *err = (TranslateFault){
            .error_code = inout->env->error_code,
            .cr2 = addr,
            .stage2 = S2_GPT,
        };
        return false;
    }
    return true;
}

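/*
 * Load a page table entry: use the direct host pointer obtained by
 * ptw_translate() when one is available, otherwise go through the
 * page table walk MMU index.
 */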
static inline uint32_t ptw_ldl(const PTETranslate *in, uint64_t ra)
{
    if (likely(in->haddr)) {
        return ldl_p(in->haddr);
    }
    return cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, ra);
}

static inline uint64_t ptw_ldq(const PTETranslate *in, uint64_t ra)
{
    if (likely(in->haddr)) {
        return ldq_p(in->haddr);
    }
    return cpu_ldq_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, ra);
}

/*
 * Note that we can use a 32-bit cmpxchg for all page table entries,
 * even 64-bit ones, because PG_PRESENT_MASK, PG_ACCESSED_MASK and
 * PG_DIRTY_MASK are all in the low 32 bits.
 */
static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new)
{
    uint32_t cmp;

    CPUState *cpu = env_cpu(in->env);
    /* We are in cpu_exec, and start_exclusive can't be called directly. */
    g_assert(cpu->running);
    cpu_exec_end(cpu);
    /* Does x86 really perform a rmw cycle on mmio for ptw? */
    start_exclusive();
    cmp = cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0);
    if (cmp == old) {
        cpu_stl_mmuidx_ra(in->env, in->gaddr, new, in->ptw_idx, 0);
    }
    end_exclusive();
    cpu_exec_start(cpu);
    return cmp == old;
}

static inline bool ptw_setl(const PTETranslate *in, uint32_t old, uint32_t set)
{
    if (set & ~old) {
        uint32_t new = old | set;
        if (likely(in->haddr)) {
            old = cpu_to_le32(old);
            new = cpu_to_le32(new);
            return qatomic_cmpxchg((uint32_t *)in->haddr, old, new) == old;
        }
        return ptw_setl_slow(in, old, new);
    }
    return true;
}

static bool mmu_translate(CPUX86State *env, const TranslateParams *in,
                          TranslateResult *out, TranslateFault *err,
                          uint64_t ra)
{
    const target_ulong addr = in->addr;
    const int pg_mode = in->pg_mode;
    const bool is_user = is_mmu_index_user(in->mmu_idx);
    const MMUAccessType access_type = in->access_type;
    uint64_t ptep, pte, rsvd_mask;
    PTETranslate pte_trans = {
        .env = env,
        .err = err,
        .ptw_idx = in->ptw_idx,
    };
    hwaddr pte_addr, paddr;
    uint32_t pkr;
    int page_size;
    int error_code;
    int prot;

 restart_all:
    rsvd_mask = ~MAKE_64BIT_MASK(0, env_archcpu(env)->phys_bits);
    rsvd_mask &= PG_ADDRESS_MASK;
    if (!(pg_mode & PG_MODE_NXE)) {
        rsvd_mask |= PG_NX_MASK;
    }

    if (pg_mode & PG_MODE_PAE) {
#ifdef TARGET_X86_64
        if (pg_mode & PG_MODE_LMA) {
            if (pg_mode & PG_MODE_LA57) {
                /*
                 * Page table level 5
                 */
                pte_addr = (in->cr3 & ~0xfff) + (((addr >> 48) & 0x1ff) << 3);
                if (!ptw_translate(&pte_trans, pte_addr)) {
                    return false;
                }
            restart_5:
                pte = ptw_ldq(&pte_trans, ra);
                if (!(pte & PG_PRESENT_MASK)) {
                    goto do_fault;
                }
                if (pte & (rsvd_mask | PG_PSE_MASK)) {
                    goto do_fault_rsvd;
                }
                if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
                    goto restart_5;
                }
                ptep = pte ^ PG_NX_MASK;
            } else {
                pte = in->cr3;
                ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
            }

            /*
             * Page table level 4
             */
            pte_addr = (pte & PG_ADDRESS_MASK) + (((addr >> 39) & 0x1ff) << 3);
            if (!ptw_translate(&pte_trans, pte_addr)) {
                return false;
            }
        restart_4:
            pte = ptw_ldq(&pte_trans, ra);
            if (!(pte & PG_PRESENT_MASK)) {
                goto do_fault;
            }
            if (pte & (rsvd_mask | PG_PSE_MASK)) {
                goto do_fault_rsvd;
            }
            if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
                goto restart_4;
            }
            ptep &= pte ^ PG_NX_MASK;

            /*
             * Page table level 3
             */
            pte_addr = (pte & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3);
            if (!ptw_translate(&pte_trans, pte_addr)) {
                return false;
            }
        restart_3_lma:
            pte = ptw_ldq(&pte_trans, ra);
            if (!(pte & PG_PRESENT_MASK)) {
                goto do_fault;
            }
            if (pte & rsvd_mask) {
                goto do_fault_rsvd;
            }
            if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
                goto restart_3_lma;
            }
            ptep &= pte ^ PG_NX_MASK;
            if (pte & PG_PSE_MASK) {
                /* 1 GB page */
                page_size = 1024 * 1024 * 1024;
                goto do_check_protect;
            }
        } else
#endif
        {
            /*
             * Page table level 3
             */
            pte_addr = (in->cr3 & 0xffffffe0ULL) + ((addr >> 27) & 0x18);
            if (!ptw_translate(&pte_trans, pte_addr)) {
                return false;
            }
            rsvd_mask |= PG_HI_USER_MASK;
        restart_3_nolma:
            pte = ptw_ldq(&pte_trans, ra);
            if (!(pte & PG_PRESENT_MASK)) {
                goto do_fault;
            }
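            /*
             * In legacy PAE mode the PDPTE has no NX bit, so bit 63 is
             * reserved here even when NXE is enabled.
             */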
            if (pte & (rsvd_mask | PG_NX_MASK)) {
                goto do_fault_rsvd;
            }
            if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
                goto restart_3_nolma;
            }
            ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
        }

        /*
         * Page table level 2
         */
        pte_addr = (pte & PG_ADDRESS_MASK) + (((addr >> 21) & 0x1ff) << 3);
        if (!ptw_translate(&pte_trans, pte_addr)) {
            return false;
        }
    restart_2_pae:
        pte = ptw_ldq(&pte_trans, ra);
        if (!(pte & PG_PRESENT_MASK)) {
            goto do_fault;
        }
        if (pte & rsvd_mask) {
            goto do_fault_rsvd;
        }
        if (pte & PG_PSE_MASK) {
            /* 2 MB page */
            page_size = 2048 * 1024;
            ptep &= pte ^ PG_NX_MASK;
            goto do_check_protect;
        }
        if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
            goto restart_2_pae;
        }
        ptep &= pte ^ PG_NX_MASK;

        /*
         * Page table level 1
         */
        pte_addr = (pte & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3);
        if (!ptw_translate(&pte_trans, pte_addr)) {
            return false;
        }
        pte = ptw_ldq(&pte_trans, ra);
        if (!(pte & PG_PRESENT_MASK)) {
            goto do_fault;
        }
        if (pte & rsvd_mask) {
            goto do_fault_rsvd;
        }
        /* combine pde and pte nx, user and rw protections */
        ptep &= pte ^ PG_NX_MASK;
        page_size = 4096;
    } else if (pg_mode & PG_MODE_PG) {
        /*
         * Page table level 2
         */
        pte_addr = (in->cr3 & 0xfffff000ULL) + ((addr >> 20) & 0xffc);
        if (!ptw_translate(&pte_trans, pte_addr)) {
            return false;
        }
    restart_2_nopae:
        pte = ptw_ldl(&pte_trans, ra);
        if (!(pte & PG_PRESENT_MASK)) {
            goto do_fault;
        }
        ptep = pte | PG_NX_MASK;

        /* if PSE bit is set, then we use a 4MB page */
        if ((pte & PG_PSE_MASK) && (pg_mode & PG_MODE_PSE)) {
            page_size = 4096 * 1024;
            /*
             * Bits 20-13 provide bits 39-32 of the address, bit 21 is reserved.
             * Leave bits 20-13 in place for setting accessed/dirty bits below.
             */
            pte = (uint32_t)pte | ((pte & 0x1fe000LL) << (32 - 13));
            rsvd_mask = 0x200000;
            goto do_check_protect_pse36;
        }
        if (!ptw_setl(&pte_trans, pte, PG_ACCESSED_MASK)) {
            goto restart_2_nopae;
        }

        /*
         * Page table level 1
         */
        pte_addr = (pte & ~0xfffu) + ((addr >> 10) & 0xffc);
        if (!ptw_translate(&pte_trans, pte_addr)) {
            return false;
        }
        pte = ptw_ldl(&pte_trans, ra);
        if (!(pte & PG_PRESENT_MASK)) {
            goto do_fault;
        }
        /* combine pde and pte user and rw protections */
        ptep &= pte | PG_NX_MASK;
        page_size = 4096;
        rsvd_mask = 0;
    } else {
        /*
         * No paging (real mode), let's tentatively resolve the address as 1:1
         * here, but conditionally still perform an NPT walk on it later.
         */
        page_size = 0x40000000;
        paddr = in->addr;
        prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
        goto stage2;
    }

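    /*
     * Common exit from the walk proper: check the remaining reserved bits
     * of the final (possibly large-page) PTE, then turn the accumulated
     * NX/USER/RW bits in ptep into PAGE_* protections, honouring CR0.WP,
     * SMAP, SMEP and protection keys.
     */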
 do_check_protect:
    rsvd_mask |= (page_size - 1) & PG_ADDRESS_MASK & ~PG_PSE_PAT_MASK;
 do_check_protect_pse36:
    if (pte & rsvd_mask) {
        goto do_fault_rsvd;
    }
    ptep ^= PG_NX_MASK;

    /* can the page be put in the TLB?  prot will tell us */
    if (is_user && !(ptep & PG_USER_MASK)) {
        goto do_fault_protect;
    }

    prot = 0;
    if (!is_mmu_index_smap(in->mmu_idx) || !(ptep & PG_USER_MASK)) {
        prot |= PAGE_READ;
        if ((ptep & PG_RW_MASK) || !(is_user || (pg_mode & PG_MODE_WP))) {
            prot |= PAGE_WRITE;
        }
    }
    if (!(ptep & PG_NX_MASK) &&
        (is_user ||
         !((pg_mode & PG_MODE_SMEP) && (ptep & PG_USER_MASK)))) {
        prot |= PAGE_EXEC;
    }

    if (ptep & PG_USER_MASK) {
        pkr = pg_mode & PG_MODE_PKE ? env->pkru : 0;
    } else {
        pkr = pg_mode & PG_MODE_PKS ? env->pkrs : 0;
    }
    if (pkr) {
        uint32_t pk = (pte & PG_PKRU_MASK) >> PG_PKRU_BIT;
        uint32_t pkr_ad = (pkr >> pk * 2) & 1;
        uint32_t pkr_wd = (pkr >> pk * 2) & 2;
        uint32_t pkr_prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;

        if (pkr_ad) {
            pkr_prot &= ~(PAGE_READ | PAGE_WRITE);
        } else if (pkr_wd && (is_user || (pg_mode & PG_MODE_WP))) {
            pkr_prot &= ~PAGE_WRITE;
        }
        if ((pkr_prot & (1 << access_type)) == 0) {
            goto do_fault_pk_protect;
        }
        prot &= pkr_prot;
    }

    if ((prot & (1 << access_type)) == 0) {
        goto do_fault_protect;
    }

    /* yes, it can! */
    {
        uint32_t set = PG_ACCESSED_MASK;
        if (access_type == MMU_DATA_STORE) {
            set |= PG_DIRTY_MASK;
        } else if (!(pte & PG_DIRTY_MASK)) {
            /*
             * Only set write access if already dirty...
             * otherwise wait for dirty access.
             */
            prot &= ~PAGE_WRITE;
        }
        if (!ptw_setl(&pte_trans, pte, set)) {
            /*
             * We can arrive here from any of 3 levels and 2 formats.
             * The only safe thing is to restart the entire lookup.
             */
            goto restart_all;
        }
    }

    /* merge offset within page */
    paddr = (pte & PG_ADDRESS_MASK & ~(page_size - 1)) | (addr & (page_size - 1));
 stage2:

    /*
     * Note that NPT is walked (for both paging structures and final guest
     * addresses) using the address with the A20 bit set.
     */
    if (in->ptw_idx == MMU_NESTED_IDX) {
        CPUTLBEntryFull *full;
        int flags, nested_page_size;

        flags = probe_access_full_mmu(env, paddr, 0, access_type,
                                      MMU_NESTED_IDX, &pte_trans.haddr, &full);
        if (unlikely(flags & TLB_INVALID_MASK)) {
            *err = (TranslateFault){
                .error_code = env->error_code,
                .cr2 = paddr,
                .stage2 = S2_GPA,
            };
            return false;
        }

        /* Merge stage1 & stage2 protection bits. */
        prot &= full->prot;

        /* Re-verify resulting protection. */
        if ((prot & (1 << access_type)) == 0) {
            goto do_fault_protect;
        }

        /* Merge stage1 & stage2 addresses to final physical address. */
        nested_page_size = 1 << full->lg_page_size;
        paddr = (full->phys_addr & ~(nested_page_size - 1))
              | (paddr & (nested_page_size - 1));

        /*
         * Use the larger of stage1 & stage2 page sizes, so that
         * invalidation works.
         */
        if (nested_page_size > page_size) {
            page_size = nested_page_size;
        }
    }

    out->paddr = paddr & x86_get_a20_mask(env);
    out->prot = prot;
    out->page_size = page_size;
    return true;

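    /*
     * Fault exits: build the #PF error code from the kind of failure and
     * the access type, and report the faulting linear address in err->cr2.
     */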
 do_fault_rsvd:
    error_code = PG_ERROR_RSVD_MASK;
    goto do_fault_cont;
 do_fault_protect:
    error_code = PG_ERROR_P_MASK;
    goto do_fault_cont;
 do_fault_pk_protect:
    assert(access_type != MMU_INST_FETCH);
    error_code = PG_ERROR_PK_MASK | PG_ERROR_P_MASK;
    goto do_fault_cont;
 do_fault:
    error_code = 0;
 do_fault_cont:
    if (is_user) {
        error_code |= PG_ERROR_U_MASK;
    }
    switch (access_type) {
    case MMU_DATA_LOAD:
        break;
    case MMU_DATA_STORE:
        error_code |= PG_ERROR_W_MASK;
        break;
    case MMU_INST_FETCH:
        if (pg_mode & (PG_MODE_NXE | PG_MODE_SMEP)) {
            error_code |= PG_ERROR_I_D_MASK;
        }
        break;
    }
    *err = (TranslateFault){
        .exception_index = EXCP0E_PAGE,
        .error_code = error_code,
        .cr2 = addr,
    };
    return false;
}

static G_NORETURN void raise_stage2(CPUX86State *env, TranslateFault *err,
                                    uintptr_t retaddr)
{
    uint64_t exit_info_1 = err->error_code;

    switch (err->stage2) {
    case S2_GPT:
        exit_info_1 |= SVM_NPTEXIT_GPT;
        break;
    case S2_GPA:
        exit_info_1 |= SVM_NPTEXIT_GPA;
        break;
    default:
        g_assert_not_reached();
    }

    x86_stq_phys(env_cpu(env),
                 env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2),
                 err->cr2);
    cpu_vmexit(env, SVM_EXIT_NPF, exit_info_1, retaddr);
}

static bool get_physical_address(CPUX86State *env, vaddr addr,
                                 MMUAccessType access_type, int mmu_idx,
                                 TranslateResult *out, TranslateFault *err,
                                 uint64_t ra)
{
    TranslateParams in;
    bool use_stage2 = env->hflags2 & HF2_NPT_MASK;

    in.addr = addr;
    in.access_type = access_type;

    switch (mmu_idx) {
    case MMU_PHYS_IDX:
        break;

    case MMU_NESTED_IDX:
        if (likely(use_stage2)) {
            in.cr3 = env->nested_cr3;
            in.pg_mode = env->nested_pg_mode;
            in.mmu_idx =
                env->nested_pg_mode & PG_MODE_LMA ? MMU_USER64_IDX : MMU_USER32_IDX;
            in.ptw_idx = MMU_PHYS_IDX;

            if (!mmu_translate(env, &in, out, err, ra)) {
                err->stage2 = S2_GPA;
                return false;
            }
            return true;
        }
        break;

    default:
        if (is_mmu_index_32(mmu_idx)) {
            addr = (uint32_t)addr;
        }

        if (likely(env->cr[0] & CR0_PG_MASK || use_stage2)) {
            in.cr3 = env->cr[3];
            in.mmu_idx = mmu_idx;
            in.ptw_idx = use_stage2 ? MMU_NESTED_IDX : MMU_PHYS_IDX;
            in.pg_mode = get_pg_mode(env);

            if (in.pg_mode & PG_MODE_LMA) {
                /* test virtual address sign extension */
                int shift = in.pg_mode & PG_MODE_LA57 ? 56 : 47;
                int64_t sext = (int64_t)addr >> shift;
                if (sext != 0 && sext != -1) {
                    *err = (TranslateFault){
                        .exception_index = EXCP0D_GPF,
                        .cr2 = addr,
                    };
                    return false;
                }
            }
            return mmu_translate(env, &in, out, err, ra);
        }
        break;
    }

    /* No translation needed. */
    out->paddr = addr & x86_get_a20_mask(env);
    out->prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
    out->page_size = TARGET_PAGE_SIZE;
    return true;
}

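/*
 * TCG tlb_fill hook: translate ADDR and install the mapping in the TLB.
 * Returns true on success.  On failure it returns false only when PROBE
 * is set; otherwise it raises a page fault or a nested-paging #VMEXIT
 * and does not return.
 */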
bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
                      MMUAccessType access_type, int mmu_idx,
                      bool probe, uintptr_t retaddr)
{
    CPUX86State *env = cpu_env(cs);
    TranslateResult out;
    TranslateFault err;

    if (get_physical_address(env, addr, access_type, mmu_idx, &out, &err,
                             retaddr)) {
        /*
         * Even if 4MB pages, we map only one 4KB page in the cache to
         * avoid filling it too fast.
         */
        assert(out.prot & (1 << access_type));
        tlb_set_page_with_attrs(cs, addr & TARGET_PAGE_MASK,
                                out.paddr & TARGET_PAGE_MASK,
                                cpu_get_mem_attrs(env),
                                out.prot, mmu_idx, out.page_size);
        return true;
    }

    if (probe) {
        /* This will be used if recursing for stage2 translation. */
        env->error_code = err.error_code;
        return false;
    }

    if (err.stage2 != S2_NONE) {
        raise_stage2(env, &err, retaddr);
    }

    if (env->intercept_exceptions & (1 << err.exception_index)) {
        /* cr2 is not modified in case of exceptions */
        x86_stq_phys(cs, env->vm_vmcb +
                     offsetof(struct vmcb, control.exit_info_2),
                     err.cr2);
    } else {
        env->cr[2] = err.cr2;
    }
    raise_exception_err_ra(env, err.exception_index, err.error_code, retaddr);
}

G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
                                            MMUAccessType access_type,
                                            int mmu_idx, uintptr_t retaddr)
{
    X86CPU *cpu = X86_CPU(cs);
    handle_unaligned_access(&cpu->env, vaddr, access_type, retaddr);
}