#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
	int level;
	pteval_t *pt = cr3;
	unsigned offset;

	for (level = PAGE_LEVEL; level > pte_level; --level) {
		offset = PGDIR_OFFSET((uintptr_t)virt, level);
		if (!(pt[offset] & PT_PRESENT_MASK)) {
			pteval_t *new_pt = pt_page;
			if (!new_pt)
				new_pt = alloc_page();
			else
				pt_page = 0;
			memset(new_pt, 0, PAGE_SIZE);
			pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
		}
		pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
	}
	offset = PGDIR_OFFSET((uintptr_t)virt, level);
	pt[offset] = pte;
	return &pt[offset];
}

/*
 * Finds the last PTE in the mapping of @virt that's at or above @lowest_level.
 * The returned PTE isn't necessarily present, but its parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		pt = phys_to_virt(pte & PT_ADDR_MASK);
	}
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge
 * PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}
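
/*
 * Illustrative sketch (not part of the original file): how the get_pte()
 * contract is typically consumed. pte_is_writable() is a hypothetical helper
 * that reports whether @virt is backed by a present, writable leaf mapping.
 */
bool pte_is_writable(pgd_t *cr3, void *virt)
{
	pteval_t *ptep = get_pte(cr3, virt);

	/* No leaf PTE means @virt isn't mapped at any level. */
	if (!ptep)
		return false;

	/* The returned leaf PTE isn't necessarily present; check it. */
	return (*ptep & PT_PRESENT_MASK) && (*ptep & PT_WRITABLE_MASK);
}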

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the PT at @pte_level isn't present (i.e., the mapping at
 * @pte_level - 1 isn't present).
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}

pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	return install_pte(cr3, 2, virt,
			   phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK | PT_PAGE_SIZE_MASK, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	return install_pte(cr3, 1, virt, phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;
	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);
		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}

static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	while (phys + LARGE_PAGE_SIZE <= max) {
		install_large_page(cr3, phys, (void *)(ulong)phys);
		phys += LARGE_PAGE_SIZE;
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}
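
/*
 * Illustrative sketch (not part of the original file): walking the paging
 * hierarchy top-down with get_pte_level(). dump_mapping() is a hypothetical
 * debug helper; it mirrors the early-out rules of find_pte_level() above.
 */
void dump_mapping(pgd_t *cr3, void *virt)
{
	int level;

	for (level = PAGE_LEVEL; level >= 1; level--) {
		pteval_t *ptep = get_pte_level(cr3, virt, level);

		/* NULL means the parent table at @level + 1 isn't present. */
		if (!ptep)
			break;
		printf("level %d: pte = %lx\n", level, (unsigned long)*ptep);

		/* A non-present entry or a huge-page leaf ends the walk. */
		if (!(*ptep & PT_PRESENT_MASK))
			break;
		if ((level == 2 || level == 3) && (*ptep & PT_PAGE_SIZE_MASK))
			break;
	}
}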

void *setup_mmu(phys_addr_t end_of_memory)
{
	pgd_t *cr3 = alloc_page();
	struct vm_vcpu_info info;
	int i;

	memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
	if (end_of_memory < (1ul << 32))
		end_of_memory = (1ul << 32);  /* map mmio 1:1 */

	setup_mmu_range(cr3, 0, end_of_memory);
#else
	setup_mmu_range(cr3, 0, (2ul << 30));
	setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
	init_alloc_vpage((void*)(3ul << 30));
#endif

	write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
	write_cr4(X86_CR4_PSE);
#endif
	write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

	printf("paging enabled\n");
	printf("cr0 = %lx\n", read_cr0());
	printf("cr3 = %lx\n", read_cr3());
	printf("cr4 = %lx\n", read_cr4());

	info.cr3 = read_cr3();
	info.cr4 = read_cr4();
	info.cr0 = read_cr0();

	for (i = 1; i < cpu_count(); i++)
		on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

	return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
	return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}
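
/*
 * Illustrative sketch (not part of the original file): remapping a single
 * virtual page onto a freshly allocated frame and verifying the result with
 * virt_to_pte_phys(). remap_page_example() is a hypothetical helper; it
 * assumes @virt is page-aligned and not currently covered by a huge page
 * (see force_4k_page() below).
 */
void remap_page_example(void *virt)
{
	pgd_t *cr3 = current_page_table();
	void *new_frame = alloc_page();

	assert(((uintptr_t)virt & (PAGE_SIZE - 1)) == 0);

	/* Point the 4k PTE for @virt at the new frame... */
	install_page(cr3, virt_to_phys(new_frame), virt);
	flush_tlb();

	/* ...and confirm the page walk now resolves to it. */
	assert(virt_to_pte_phys(cr3, virt) == virt_to_phys(new_frame));
}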

/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 * @ptep : large page table entry to split
 * @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present bit
	 * for the page being split, before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway) makes
	 * it impossible to continue fetching instructions!
	 */
	flush_tlb();
}
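
/*
 * Illustrative sketch (not part of the original file): splitting a 2M
 * mapping by hand with split_large_page(). split_2m_example() is a
 * hypothetical helper; it assumes @addr is currently covered by a present
 * 2M page, as installed by setup_mmu_range().
 */
void split_2m_example(void *addr)
{
	pgd_t *cr3 = current_page_table();
	pteval_t *ptep = get_pte_level(cr3, addr, 2);

	assert(ptep && (*ptep & PT_PRESENT_MASK) && (*ptep & PT_PAGE_SIZE_MASK));

	/* pteval_t is unsigned long on x86, matching split_large_page(). */
	split_large_page((unsigned long *)ptep, 2);

	/* The walk for @addr now reaches a level-1 PTE. */
	assert(get_pte_level(cr3, addr, 1));
}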

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, @addr, translates to a 4k page.
 *
 * @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}
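
/*
 * Illustrative sketch (not part of the original file): a split changes only
 * the mapping granularity, never the translation itself.
 * check_force_4k_example() is a hypothetical self-check built on the
 * functions above.
 */
void check_force_4k_example(void *addr)
{
	pgd_t *cr3 = current_page_table();
	phys_addr_t before = virt_to_pte_phys(cr3, addr);

	force_4k_page(addr);

	/* Same physical address, now reachable through a level-1 PTE. */
	assert(virt_to_pte_phys(cr3, addr) == before);
	assert(get_pte_level(cr3, addr, 1));
}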