17d36db35SAvi Kivity #include "vm.h" 27d36db35SAvi Kivity #include "libcflat.h" 3efd8e5aaSPaolo Bonzini #include "vmalloc.h" 45aca024eSPaolo Bonzini #include "alloc_page.h" 57d36db35SAvi Kivity 64363f1d9SPaolo Bonzini pteval_t *install_pte(pgd_t *cr3, 77d36db35SAvi Kivity int pte_level, 87d36db35SAvi Kivity void *virt, 94363f1d9SPaolo Bonzini pteval_t pte, 104363f1d9SPaolo Bonzini pteval_t *pt_page) 117d36db35SAvi Kivity { 127d36db35SAvi Kivity int level; 134363f1d9SPaolo Bonzini pteval_t *pt = cr3; 147d36db35SAvi Kivity unsigned offset; 157d36db35SAvi Kivity 167d36db35SAvi Kivity for (level = PAGE_LEVEL; level > pte_level; --level) { 174363f1d9SPaolo Bonzini offset = PGDIR_OFFSET((uintptr_t)virt, level); 18d10d16e1SAlexander Gordeev if (!(pt[offset] & PT_PRESENT_MASK)) { 194363f1d9SPaolo Bonzini pteval_t *new_pt = pt_page; 207d36db35SAvi Kivity if (!new_pt) 217d36db35SAvi Kivity new_pt = alloc_page(); 227d36db35SAvi Kivity else 237d36db35SAvi Kivity pt_page = 0; 247d36db35SAvi Kivity memset(new_pt, 0, PAGE_SIZE); 25d10d16e1SAlexander Gordeev pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 267d36db35SAvi Kivity } 27d10d16e1SAlexander Gordeev pt = phys_to_virt(pt[offset] & PT_ADDR_MASK); 287d36db35SAvi Kivity } 294363f1d9SPaolo Bonzini offset = PGDIR_OFFSET((uintptr_t)virt, level); 307d36db35SAvi Kivity pt[offset] = pte; 3104262816SPaolo Bonzini return &pt[offset]; 327d36db35SAvi Kivity } 337d36db35SAvi Kivity 341df80b57SPeter Feiner /* 351df80b57SPeter Feiner * Finds last PTE in the mapping of @virt that's at or above @lowest_level. The 361df80b57SPeter Feiner * returned PTE isn't necessarily present, but its parent is. 371df80b57SPeter Feiner */ 384363f1d9SPaolo Bonzini struct pte_search find_pte_level(pgd_t *cr3, void *virt, 391df80b57SPeter Feiner int lowest_level) 407d36db35SAvi Kivity { 414363f1d9SPaolo Bonzini pteval_t *pt = cr3, pte; 427d36db35SAvi Kivity unsigned offset; 434363f1d9SPaolo Bonzini unsigned shift; 441df80b57SPeter Feiner struct pte_search r; 457d36db35SAvi Kivity 461df80b57SPeter Feiner assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL); 471df80b57SPeter Feiner 481df80b57SPeter Feiner for (r.level = PAGE_LEVEL;; --r.level) { 491df80b57SPeter Feiner shift = (r.level - 1) * PGDIR_WIDTH + 12; 504363f1d9SPaolo Bonzini offset = ((uintptr_t)virt >> shift) & PGDIR_MASK; 511df80b57SPeter Feiner r.pte = &pt[offset]; 521df80b57SPeter Feiner pte = *r.pte; 531df80b57SPeter Feiner 54d10d16e1SAlexander Gordeev if (!(pte & PT_PRESENT_MASK)) 551df80b57SPeter Feiner return r; 561df80b57SPeter Feiner 571df80b57SPeter Feiner if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK)) 581df80b57SPeter Feiner return r; 591df80b57SPeter Feiner 601df80b57SPeter Feiner if (r.level == lowest_level) 611df80b57SPeter Feiner return r; 621df80b57SPeter Feiner 631df80b57SPeter Feiner pt = phys_to_virt(pte & 0xffffffffff000ull); 647d36db35SAvi Kivity } 651df80b57SPeter Feiner } 661df80b57SPeter Feiner 671df80b57SPeter Feiner /* 681df80b57SPeter Feiner * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge 691df80b57SPeter Feiner * PTE). Returns NULL if no leaf PTE exists. 701df80b57SPeter Feiner */ 714363f1d9SPaolo Bonzini pteval_t *get_pte(pgd_t *cr3, void *virt) 721df80b57SPeter Feiner { 731df80b57SPeter Feiner struct pte_search search; 741df80b57SPeter Feiner 751df80b57SPeter Feiner search = find_pte_level(cr3, virt, 1); 761df80b57SPeter Feiner return found_leaf_pte(search) ? search.pte : NULL; 771df80b57SPeter Feiner } 781df80b57SPeter Feiner 791df80b57SPeter Feiner /* 801df80b57SPeter Feiner * Returns the PTE in the mapping of @virt at the given level @pte_level. 811df80b57SPeter Feiner * Returns NULL if the PT at @pte_level isn't present (i.e., the mapping at 821df80b57SPeter Feiner * @pte_level - 1 isn't present). 831df80b57SPeter Feiner */ 844363f1d9SPaolo Bonzini pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level) 851df80b57SPeter Feiner { 861df80b57SPeter Feiner struct pte_search search; 871df80b57SPeter Feiner 881df80b57SPeter Feiner search = find_pte_level(cr3, virt, pte_level); 891df80b57SPeter Feiner return search.level == pte_level ? search.pte : NULL; 907d36db35SAvi Kivity } 917d36db35SAvi Kivity 924363f1d9SPaolo Bonzini pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt) 937d36db35SAvi Kivity { 9404262816SPaolo Bonzini return install_pte(cr3, 2, virt, 95d10d16e1SAlexander Gordeev phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK | PT_PAGE_SIZE_MASK, 0); 967d36db35SAvi Kivity } 977d36db35SAvi Kivity 984363f1d9SPaolo Bonzini pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt) 997d36db35SAvi Kivity { 100d10d16e1SAlexander Gordeev return install_pte(cr3, 1, virt, phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK, 0); 1017d36db35SAvi Kivity } 1027d36db35SAvi Kivity 1034363f1d9SPaolo Bonzini void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt) 1041df80b57SPeter Feiner { 1054363f1d9SPaolo Bonzini phys_addr_t max = (u64)len + (u64)phys; 1061df80b57SPeter Feiner assert(phys % PAGE_SIZE == 0); 1074363f1d9SPaolo Bonzini assert((uintptr_t) virt % PAGE_SIZE == 0); 1081df80b57SPeter Feiner assert(len % PAGE_SIZE == 0); 1091df80b57SPeter Feiner 1101df80b57SPeter Feiner while (phys + PAGE_SIZE <= max) { 1111df80b57SPeter Feiner install_page(cr3, phys, virt); 1121df80b57SPeter Feiner phys += PAGE_SIZE; 1131df80b57SPeter Feiner virt = (char *) virt + PAGE_SIZE; 1141df80b57SPeter Feiner } 1151df80b57SPeter Feiner } 1161df80b57SPeter Feiner 1174363f1d9SPaolo Bonzini bool any_present_pages(pgd_t *cr3, void *virt, size_t len) 1181df80b57SPeter Feiner { 1194363f1d9SPaolo Bonzini uintptr_t max = (uintptr_t) virt + len; 1204363f1d9SPaolo Bonzini uintptr_t curr; 1211df80b57SPeter Feiner 1224363f1d9SPaolo Bonzini for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) { 1234363f1d9SPaolo Bonzini pteval_t *ptep = get_pte(cr3, (void *) curr); 1241df80b57SPeter Feiner if (ptep && (*ptep & PT_PRESENT_MASK)) 1251df80b57SPeter Feiner return true; 1261df80b57SPeter Feiner } 1271df80b57SPeter Feiner return false; 1281df80b57SPeter Feiner } 1297d36db35SAvi Kivity 1304363f1d9SPaolo Bonzini static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len) 13163254428SGleb Natapov { 13263254428SGleb Natapov u64 max = (u64)len + (u64)start; 13363254428SGleb Natapov u64 phys = start; 13463254428SGleb Natapov 13563254428SGleb Natapov while (phys + LARGE_PAGE_SIZE <= max) { 13663254428SGleb Natapov install_large_page(cr3, phys, (void *)(ulong)phys); 13763254428SGleb Natapov phys += LARGE_PAGE_SIZE; 13863254428SGleb Natapov } 1391df80b57SPeter Feiner install_pages(cr3, phys, max - phys, (void *)(ulong)phys); 14063254428SGleb Natapov } 14163254428SGleb Natapov 142937e2392SPaolo Bonzini void *setup_mmu(phys_addr_t end_of_memory) 1437d36db35SAvi Kivity { 1444363f1d9SPaolo Bonzini pgd_t *cr3 = alloc_page(); 1457d36db35SAvi Kivity 1467d36db35SAvi Kivity memset(cr3, 0, PAGE_SIZE); 14763254428SGleb Natapov 14863254428SGleb Natapov #ifdef __x86_64__ 149937e2392SPaolo Bonzini if (end_of_memory < (1ul << 32)) 150937e2392SPaolo Bonzini end_of_memory = (1ul << 32); /* map mmio 1:1 */ 15163254428SGleb Natapov 152937e2392SPaolo Bonzini setup_mmu_range(cr3, 0, end_of_memory); 15363254428SGleb Natapov #else 154937e2392SPaolo Bonzini if (end_of_memory > (1ul << 31)) 155937e2392SPaolo Bonzini end_of_memory = (1ul << 31); 15663254428SGleb Natapov 15763254428SGleb Natapov /* 0 - 2G memory, 2G-3G valloc area, 3G-4G mmio */ 158937e2392SPaolo Bonzini setup_mmu_range(cr3, 0, end_of_memory); 15963254428SGleb Natapov setup_mmu_range(cr3, 3ul << 30, (1ul << 30)); 160efd8e5aaSPaolo Bonzini init_alloc_vpage((void*)(3ul << 30)); 16163254428SGleb Natapov #endif 16263254428SGleb Natapov 1637d36db35SAvi Kivity write_cr3(virt_to_phys(cr3)); 1647d36db35SAvi Kivity #ifndef __x86_64__ 1657d36db35SAvi Kivity write_cr4(X86_CR4_PSE); 1667d36db35SAvi Kivity #endif 16797011120SGleb Natapov write_cr0(X86_CR0_PG |X86_CR0_PE | X86_CR0_WP); 1687d36db35SAvi Kivity 1697d36db35SAvi Kivity printf("paging enabled\n"); 170b006d7ebSAndrew Jones printf("cr0 = %lx\n", read_cr0()); 171b006d7ebSAndrew Jones printf("cr3 = %lx\n", read_cr3()); 172b006d7ebSAndrew Jones printf("cr4 = %lx\n", read_cr4()); 173937e2392SPaolo Bonzini return cr3; 1747d36db35SAvi Kivity } 1757d36db35SAvi Kivity 176c41e032aSPaolo Bonzini phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem) 177334cd2bfSGleb Natapov { 178c41e032aSPaolo Bonzini return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1)); 179334cd2bfSGleb Natapov } 180*5868743aSMarc Orr 181*5868743aSMarc Orr /* 182*5868743aSMarc Orr * split_large_page: Split a 2M/1G large page into 512 smaller PTEs. 183*5868743aSMarc Orr * @ptep : large page table entry to split 184*5868743aSMarc Orr * @level : level of ptep (2 or 3) 185*5868743aSMarc Orr */ 186*5868743aSMarc Orr void split_large_page(unsigned long *ptep, int level) 187*5868743aSMarc Orr { 188*5868743aSMarc Orr unsigned long *new_pt; 189*5868743aSMarc Orr unsigned long pa; 190*5868743aSMarc Orr unsigned long pte; 191*5868743aSMarc Orr unsigned long prototype; 192*5868743aSMarc Orr int i; 193*5868743aSMarc Orr 194*5868743aSMarc Orr pte = *ptep; 195*5868743aSMarc Orr assert(pte & PT_PRESENT_MASK); 196*5868743aSMarc Orr assert(pte & PT_PAGE_SIZE_MASK); 197*5868743aSMarc Orr assert(level == 2 || level == 3); 198*5868743aSMarc Orr 199*5868743aSMarc Orr new_pt = alloc_page(); 200*5868743aSMarc Orr assert(new_pt); 201*5868743aSMarc Orr 202*5868743aSMarc Orr prototype = pte & ~PT_ADDR_MASK; 203*5868743aSMarc Orr if (level == 2) 204*5868743aSMarc Orr prototype &= ~PT_PAGE_SIZE_MASK; 205*5868743aSMarc Orr 206*5868743aSMarc Orr pa = pte & PT_ADDR_MASK; 207*5868743aSMarc Orr for (i = 0; i < (1 << PGDIR_WIDTH); i++) { 208*5868743aSMarc Orr new_pt[i] = prototype | pa; 209*5868743aSMarc Orr pa += 1ul << PGDIR_BITS(level - 1); 210*5868743aSMarc Orr } 211*5868743aSMarc Orr 212*5868743aSMarc Orr pte &= ~PT_PAGE_SIZE_MASK; 213*5868743aSMarc Orr pte &= ~PT_ADDR_MASK; 214*5868743aSMarc Orr pte |= virt_to_phys(new_pt); 215*5868743aSMarc Orr 216*5868743aSMarc Orr /* Modify the relevant paging-structure entry */ 217*5868743aSMarc Orr *ptep = pte; 218*5868743aSMarc Orr 219*5868743aSMarc Orr /* 220*5868743aSMarc Orr * Flush the TLB to eradicate stale mappings. 221*5868743aSMarc Orr * 222*5868743aSMarc Orr * Note: Removing specific TLB mappings is tricky because 223*5868743aSMarc Orr * split_large_page() can be called to split the active code page 224*5868743aSMarc Orr * backing the next set of instructions to be fetched and executed. 225*5868743aSMarc Orr * Furthermore, Intel SDM volume 3 recommends to clear the present bit 226*5868743aSMarc Orr * for the page being split, before invalidating any mappings. 227*5868743aSMarc Orr * 228*5868743aSMarc Orr * But clearing the mapping from the page table and removing it from the 229*5868743aSMarc Orr * TLB (where it's not actually guaranteed to reside anyway) makes it 230*5868743aSMarc Orr * impossible to continue fetching instructions! 231*5868743aSMarc Orr */ 232*5868743aSMarc Orr flush_tlb(); 233*5868743aSMarc Orr } 234*5868743aSMarc Orr 235*5868743aSMarc Orr /* 236*5868743aSMarc Orr * force_4k_page: Ensures that addr translate to a 4k page. 237*5868743aSMarc Orr * 238*5868743aSMarc Orr * This function uses split_large_page(), as needed, to ensure that target 239*5868743aSMarc Orr * address, addr, translates to a 4k page. 240*5868743aSMarc Orr * 241*5868743aSMarc Orr * @addr: target address that should be mapped to a 4k page 242*5868743aSMarc Orr */ 243*5868743aSMarc Orr void force_4k_page(void *addr) 244*5868743aSMarc Orr { 245*5868743aSMarc Orr unsigned long *ptep; 246*5868743aSMarc Orr unsigned long pte; 247*5868743aSMarc Orr unsigned long *cr3 = current_page_table(); 248*5868743aSMarc Orr 249*5868743aSMarc Orr ptep = get_pte_level(cr3, addr, 3); 250*5868743aSMarc Orr assert(ptep); 251*5868743aSMarc Orr pte = *ptep; 252*5868743aSMarc Orr assert(pte & PT_PRESENT_MASK); 253*5868743aSMarc Orr if (pte & PT_PAGE_SIZE_MASK) 254*5868743aSMarc Orr split_large_page(ptep, 3); 255*5868743aSMarc Orr 256*5868743aSMarc Orr ptep = get_pte_level(cr3, addr, 2); 257*5868743aSMarc Orr assert(ptep); 258*5868743aSMarc Orr pte = *ptep; 259*5868743aSMarc Orr assert(pte & PT_PRESENT_MASK); 260*5868743aSMarc Orr if (pte & PT_PAGE_SIZE_MASK) 261*5868743aSMarc Orr split_large_page(ptep, 2); 262*5868743aSMarc Orr } 263