#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

static pteval_t pte_opt_mask;

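/*
 * Installs @pte at @pte_level in the page table hierarchy rooted at @cr3,
 * walking down from the top level and creating any missing intermediate
 * tables along the way. A non-NULL @pt_page is consumed as the first newly
 * installed table; any further tables come from alloc_page(). Returns a
 * pointer to the installed entry.
 */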
pteval_t *install_pte(pgd_t *cr3,
                      int pte_level,
                      void *virt,
                      pteval_t pte,
                      pteval_t *pt_page)
{
        int level;
        pteval_t *pt = cr3;
        unsigned offset;

        for (level = PAGE_LEVEL; level > pte_level; --level) {
                offset = PGDIR_OFFSET((uintptr_t)virt, level);
                if (!(pt[offset] & PT_PRESENT_MASK)) {
                        pteval_t *new_pt = pt_page;

                        if (!new_pt)
                                new_pt = alloc_page();
                        else
                                pt_page = 0;
                        memset(new_pt, 0, PAGE_SIZE);
                        pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
                        pt[offset] |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
                }
                pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
        }
        offset = PGDIR_OFFSET((uintptr_t)virt, level);
        pt[offset] = pte;
        return &pt[offset];
}

/*
 * Finds the last PTE in the mapping of @virt that's at or above
 * @lowest_level. The returned PTE isn't necessarily present, but its
 * parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
                                 int lowest_level)
{
        pteval_t *pt = cr3, pte;
        unsigned offset;
        unsigned shift;
        struct pte_search r;

        assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

        for (r.level = PAGE_LEVEL;; --r.level) {
                shift = (r.level - 1) * PGDIR_WIDTH + 12;
                offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
                r.pte = &pt[offset];
                pte = *r.pte;

                if (!(pte & PT_PRESENT_MASK))
                        return r;

                if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
                        return r;

                if (r.level == lowest_level)
                        return r;

                pt = phys_to_virt(pte & PT_ADDR_MASK);
        }
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., a 4K PTE or a present
 * huge PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
        struct pte_search search;

        search = find_pte_level(cr3, virt, 1);
        return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the PT at @pte_level isn't present (i.e., the mapping at
 * @pte_level + 1 isn't present).
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
        struct pte_search search;

        search = find_pte_level(cr3, virt, pte_level);
        return search.level == pte_level ? search.pte : NULL;
}

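/*
 * Installs a large (2M, level 2) page mapping @virt to @phys, with the
 * present, writable, and current pte_opt_mask attributes (plus the AMD SEV
 * C-bit on EFI builds).
 */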
pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask | PT_PAGE_SIZE_MASK;
#ifdef CONFIG_EFI
        flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
        return install_pte(cr3, 2, virt, phys | flags, 0);
}

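/*
 * Installs a 4K (level 1) page mapping @virt to @phys with the same
 * attributes as install_large_page(), minus the page-size bit.
 */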
pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
        flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
        return install_pte(cr3, 1, virt, phys | flags, 0);
}

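/*
 * Maps the page-aligned range [@phys, @phys + @len) at @virt, one 4K page
 * at a time.
 */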
void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
        phys_addr_t max = (u64)len + (u64)phys;

        assert(phys % PAGE_SIZE == 0);
        assert((uintptr_t)virt % PAGE_SIZE == 0);
        assert(len % PAGE_SIZE == 0);

        while (phys + PAGE_SIZE <= max) {
                install_page(cr3, phys, virt);
                phys += PAGE_SIZE;
                virt = (char *)virt + PAGE_SIZE;
        }
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
        uintptr_t max = (uintptr_t)virt + len;
        uintptr_t curr;

        for (curr = (uintptr_t)virt; curr < max; curr += PAGE_SIZE) {
                pteval_t *ptep = get_pte(cr3, (void *)curr);

                if (ptep && (*ptep & PT_PRESENT_MASK))
                        return true;
        }
        return false;
}

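/*
 * Identity-maps the physical range [@start, @start + @len) into @cr3.
 * X86_MMU_MAP_USER temporarily adds PT_USER_MASK to the attribute mask;
 * X86_MMU_MAP_HUGE covers as much of the range as possible with 2M large
 * pages and falls back to 4K pages for the remainder.
 */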
void __setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len,
                       enum x86_mmu_flags mmu_flags)
{
        u64 orig_opt_mask = pte_opt_mask;
        u64 max = (u64)len + (u64)start;
        u64 phys = start;

        if (mmu_flags & X86_MMU_MAP_USER)
                pte_opt_mask |= PT_USER_MASK;

        if (mmu_flags & X86_MMU_MAP_HUGE) {
                while (phys + LARGE_PAGE_SIZE <= max) {
                        install_large_page(cr3, phys, (void *)(ulong)phys);
                        phys += LARGE_PAGE_SIZE;
                }
        }
        install_pages(cr3, phys, max - phys, (void *)(ulong)phys);

        pte_opt_mask = orig_opt_mask;
}

static inline void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
        __setup_mmu_range(cr3, start, len, X86_MMU_MAP_HUGE);
}

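/*
 * Propagates the boot CPU's paging state to a secondary vCPU. CR3 and CR4
 * are written before CR0 so that paging is fully configured when CR0.PG
 * takes effect.
 */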
static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
        write_cr3(info->cr3);
        write_cr4(info->cr4);
        write_cr0(info->cr0);
}

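/*
 * Builds identity page tables covering physical memory (plus the 32-bit MMIO
 * hole on x86_64), enables paging on the boot CPU, and mirrors the resulting
 * control-register state on all other CPUs. A non-NULL @opt_mask overrides
 * the default PT_USER_MASK attribute applied to every mapping.
 */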
void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
        pgd_t *cr3 = alloc_page();
        struct vm_vcpu_info info;
        int i;

        if (opt_mask)
                pte_opt_mask = *(pteval_t *)opt_mask;
        else
                pte_opt_mask = PT_USER_MASK;

        memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
        if (end_of_memory < (1ul << 32))
                end_of_memory = (1ul << 32); /* map mmio 1:1 */

        setup_mmu_range(cr3, 0, end_of_memory);
#else
        setup_mmu_range(cr3, 0, (2ul << 30));
        setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
        init_alloc_vpage((void *)(3ul << 30));
#endif

        write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
        write_cr4(X86_CR4_PSE);
#endif
        write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

        printf("paging enabled\n");
        printf("cr0 = %lx\n", read_cr0());
        printf("cr3 = %lx\n", read_cr3());
        printf("cr4 = %lx\n", read_cr4());

        info.cr3 = read_cr3();
        info.cr4 = read_cr4();
        info.cr0 = read_cr0();

        for (i = 1; i < cpu_count(); i++)
                on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

        return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
        return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}

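/*
 * Example (a sketch): translating a pointer through the active page tables,
 * e.g. to hand the physical address of a buffer to a device:
 *
 *      void *buf = alloc_page();
 *      phys_addr_t pa = virt_to_pte_phys(current_page_table(), buf);
 */
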
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 * @ptep : large page table entry to split
 * @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
        unsigned long *new_pt;
        unsigned long pa;
        unsigned long pte;
        unsigned long prototype;
        int i;

        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        assert(pte & PT_PAGE_SIZE_MASK);
        assert(level == 2 || level == 3);

        new_pt = alloc_page();
        assert(new_pt);

        prototype = pte & ~PT_ADDR_MASK;
        if (level == 2)
                prototype &= ~PT_PAGE_SIZE_MASK;

        pa = pte & PT_ADDR_MASK;
        for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
                new_pt[i] = prototype | pa;
                pa += 1ul << PGDIR_BITS(level - 1);
        }

        pte &= ~PT_PAGE_SIZE_MASK;
        pte &= ~PT_ADDR_MASK;
        pte |= virt_to_phys(new_pt);

        /* Modify the relevant paging-structure entry */
        *ptep = pte;

        /*
         * Flush the TLB to eradicate stale mappings.
         *
         * Note: Removing specific TLB mappings is tricky because
         * split_large_page() can be called to split the active code page
         * backing the next set of instructions to be fetched and executed.
         * Furthermore, Intel SDM volume 3 recommends clearing the present
         * bit for the page being split, before invalidating any mappings.
         *
         * But clearing the mapping from the page table and removing it from
         * the TLB (where it's not actually guaranteed to reside anyway) makes
         * it impossible to continue fetching instructions!
         */
        flush_tlb();
}

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, @addr, translates to a 4k page.
 *
 * @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
        unsigned long *ptep;
        unsigned long pte;
        unsigned long *cr3 = current_page_table();

        ptep = get_pte_level(cr3, addr, 3);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 3);

        ptep = get_pte_level(cr3, addr, 2);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 2);
}

/*
 * Calls @callback on each page in the range [@virt, @virt + @len).
 */
void walk_pte(void *virt, size_t len, pte_callback_t callback)
{
        pgd_t *cr3 = current_page_table();
        uintptr_t start = (uintptr_t)virt;
        uintptr_t end = (uintptr_t)virt + len;
        struct pte_search search;
        size_t page_size;
        uintptr_t curr;

        for (curr = start; curr < end; curr = ALIGN_DOWN(curr + page_size, page_size)) {
                search = find_pte_level(cr3, (void *)curr, 1);
                assert(found_leaf_pte(search));
                page_size = 1ul << PGDIR_BITS(search.level);

                callback(search, (void *)curr);
        }
}
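
/*
 * Example usage (a sketch; clear_dirty_cb is hypothetical and PT_DIRTY_MASK
 * is assumed to be the dirty-bit mask from processor.h):
 *
 *      static void clear_dirty_cb(struct pte_search search, void *va)
 *      {
 *              *search.pte &= ~(pteval_t)PT_DIRTY_MASK;
 *      }
 *
 *      walk_pte(buf, len, clear_dirty_cb);
 *      flush_tlb();
 */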