#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

static pteval_t pte_opt_mask;

pteval_t *install_pte(pgd_t *cr3,
                      int pte_level,
                      void *virt,
                      pteval_t pte,
                      pteval_t *pt_page)
{
        int level;
        pteval_t *pt = cr3;
        unsigned offset;

        for (level = PAGE_LEVEL; level > pte_level; --level) {
                offset = PGDIR_OFFSET((uintptr_t)virt, level);
                if (!(pt[offset] & PT_PRESENT_MASK)) {
                        pteval_t *new_pt = pt_page;
                        if (!new_pt)
                                new_pt = alloc_page();
                        else
                                pt_page = 0;
                        memset(new_pt, 0, PAGE_SIZE);
                        pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
                        pt[offset] |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
                }
                pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
        }
        offset = PGDIR_OFFSET((uintptr_t)virt, level);
        pt[offset] = pte;
        return &pt[offset];
}

/*
 * Finds last PTE in the mapping of @virt that's at or above @lowest_level. The
 * returned PTE isn't necessarily present, but its parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
                                 int lowest_level)
{
        pteval_t *pt = cr3, pte;
        unsigned offset;
        unsigned shift;
        struct pte_search r;

        assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

        for (r.level = PAGE_LEVEL;; --r.level) {
                shift = (r.level - 1) * PGDIR_WIDTH + 12;
                offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
                r.pte = &pt[offset];
                pte = *r.pte;

                if (!(pte & PT_PRESENT_MASK))
                        return r;

                if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
                        return r;

                if (r.level == lowest_level)
                        return r;

                pt = phys_to_virt(pte & PT_ADDR_MASK);
        }
}
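/*
 * Example (illustrative) of consuming a struct pte_search: a walk that
 * stops early still returns a usable result, so for some virtual address
 * @va a test can distinguish a terminal (4k or huge) PTE from a
 * non-present upper-level entry via the found_leaf_pte() helper.
 * A minimal sketch:
 *
 *      struct pte_search s = find_pte_level(current_page_table(), va, 1);
 *
 *      if (found_leaf_pte(s))
 *              printf("leaf PTE at level %d\n", s.level);
 *      else
 *              printf("walk stopped at non-present level %d\n", s.level);
 */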
/*
 * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge
 * PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
        struct pte_search search;

        search = find_pte_level(cr3, virt, 1);
        return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the PT at @pte_level isn't present (i.e., the mapping at
 * @pte_level - 1 isn't present).
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
        struct pte_search search;

        search = find_pte_level(cr3, virt, pte_level);
        return search.level == pte_level ? search.pte : NULL;
}

pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask | PT_PAGE_SIZE_MASK;
#ifdef CONFIG_EFI
        flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
        return install_pte(cr3, 2, virt, phys | flags, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
        flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
        return install_pte(cr3, 1, virt, phys | flags, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
        phys_addr_t max = (u64)len + (u64)phys;
        assert(phys % PAGE_SIZE == 0);
        assert((uintptr_t) virt % PAGE_SIZE == 0);
        assert(len % PAGE_SIZE == 0);

        while (phys + PAGE_SIZE <= max) {
                install_page(cr3, phys, virt);
                phys += PAGE_SIZE;
                virt = (char *) virt + PAGE_SIZE;
        }
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
        uintptr_t max = (uintptr_t) virt + len;
        uintptr_t curr;

        for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
                pteval_t *ptep = get_pte(cr3, (void *) curr);
                if (ptep && (*ptep & PT_PRESENT_MASK))
                        return true;
        }
        return false;
}
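/*
 * Example (illustrative): installing a fresh 4k mapping at an otherwise
 * unused virtual address.  A sketch, assuming @cr3 came from setup_mmu()
 * and alloc_vpage() (from vmalloc.h) hands out addresses outside the
 * identity-mapped ranges set up below:
 *
 *      void *va = alloc_vpage();
 *
 *      assert(!any_present_pages(cr3, va, PAGE_SIZE));
 *      install_page(cr3, virt_to_phys(alloc_page()), va);
 */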
static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
        u64 max = (u64)len + (u64)start;
        u64 phys = start;

        while (phys + LARGE_PAGE_SIZE <= max) {
                install_large_page(cr3, phys, (void *)(ulong)phys);
                phys += LARGE_PAGE_SIZE;
        }
        install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
        write_cr3(info->cr3);
        write_cr4(info->cr4);
        write_cr0(info->cr0);
}

void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
        pgd_t *cr3 = alloc_page();
        struct vm_vcpu_info info;
        int i;

        if (opt_mask)
                pte_opt_mask = *(pteval_t *)opt_mask;
        else
                pte_opt_mask = PT_USER_MASK;

        memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
        if (end_of_memory < (1ul << 32))
                end_of_memory = (1ul << 32); /* map mmio 1:1 */

        setup_mmu_range(cr3, 0, end_of_memory);
#else
        setup_mmu_range(cr3, 0, (2ul << 30));
        setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
        init_alloc_vpage((void *)(3ul << 30));
#endif

        write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
        write_cr4(X86_CR4_PSE);
#endif
        write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

        printf("paging enabled\n");
        printf("cr0 = %lx\n", read_cr0());
        printf("cr3 = %lx\n", read_cr3());
        printf("cr4 = %lx\n", read_cr4());

        info.cr3 = read_cr3();
        info.cr4 = read_cr4();
        info.cr0 = read_cr0();

        for (i = 1; i < cpu_count(); i++)
                on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

        return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
        return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}
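/*
 * Example: because setup_mmu() identity-maps low memory, translating a
 * page handed out by the physical allocator is expected to be a no-op.
 * A sketch of a sanity check:
 *
 *      void *p = alloc_page();
 *
 *      assert(virt_to_pte_phys(current_page_table(), p) == virt_to_phys(p));
 */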
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 *      @ptep : large page table entry to split
 *      @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
        unsigned long *new_pt;
        unsigned long pa;
        unsigned long pte;
        unsigned long prototype;
        int i;

        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        assert(pte & PT_PAGE_SIZE_MASK);
        assert(level == 2 || level == 3);

        new_pt = alloc_page();
        assert(new_pt);

        prototype = pte & ~PT_ADDR_MASK;
        if (level == 2)
                prototype &= ~PT_PAGE_SIZE_MASK;

        pa = pte & PT_ADDR_MASK;
        for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
                new_pt[i] = prototype | pa;
                pa += 1ul << PGDIR_BITS(level - 1);
        }

        pte &= ~PT_PAGE_SIZE_MASK;
        pte &= ~PT_ADDR_MASK;
        pte |= virt_to_phys(new_pt);

        /* Modify the relevant paging-structure entry */
        *ptep = pte;

        /*
         * Flush the TLB to eradicate stale mappings.
         *
         * Note: Removing specific TLB mappings is tricky because
         * split_large_page() can be called to split the active code page
         * backing the next set of instructions to be fetched and executed.
         * Furthermore, Intel SDM volume 3 recommends clearing the present
         * bit for the page being split, before invalidating any mappings.
         *
         * But clearing the mapping from the page table and removing it from
         * the TLB (where it's not actually guaranteed to reside anyway) makes
         * it impossible to continue fetching instructions!
         */
        flush_tlb();
}
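/*
 * Worked example for split_large_page(): with PGDIR_WIDTH == 9 the new
 * table holds 512 entries.  Splitting a 2M page (level 2) clears
 * PT_PAGE_SIZE_MASK in the prototype and strides pa by
 * 1ul << PGDIR_BITS(1) == 4k; splitting a 1G page (level 3) keeps
 * PT_PAGE_SIZE_MASK set and strides pa by 1ul << PGDIR_BITS(2) == 2M,
 * producing 512 2M mappings that cover the same 1G range.
 */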
/*
 * force_4k_page: Ensures that addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, addr, translates to a 4k page.
 *
 * @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
        unsigned long *ptep;
        unsigned long pte;
        unsigned long *cr3 = current_page_table();

        ptep = get_pte_level(cr3, addr, 3);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 3);

        ptep = get_pte_level(cr3, addr, 2);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 2);
}

/*
 * Call the callback on each page from virt to virt + len.
 */
void walk_pte(void *virt, size_t len, pte_callback_t callback)
{
        pgd_t *cr3 = current_page_table();
        uintptr_t start = (uintptr_t)virt;
        uintptr_t end = (uintptr_t)virt + len;
        struct pte_search search;
        size_t page_size;
        uintptr_t curr;

        for (curr = start; curr < end; curr = ALIGN_DOWN(curr + page_size, page_size)) {
                search = find_pte_level(cr3, (void *)curr, 1);
                assert(found_leaf_pte(search));
                page_size = 1ul << PGDIR_BITS(search.level);

                callback(search, (void *)curr);
        }
}
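/*
 * Example (illustrative) walk_pte() callback that write-protects every
 * page backing a buffer.  A sketch, assuming buf..buf+len is fully
 * mapped (walk_pte() asserts a leaf PTE exists for each page); the
 * callback name is hypothetical, and invlpg() is the usual
 * kvm-unit-tests helper for flushing a single mapping:
 *
 *      static void clear_writable(struct pte_search search, void *va)
 *      {
 *              *search.pte &= ~PT_WRITABLE_MASK;
 *              invlpg(va);
 *      }
 *
 *      walk_pte(buf, len, clear_writable);
 */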