/* kvm-unit-tests: lib/x86/vm.c (revision c85124d2) */
#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

static pteval_t pte_opt_mask;

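/*
 * Installs the PTE @pte for @virt at level @pte_level, allocating and
 * zeroing intermediate page tables as needed.  If @pt_page is non-NULL, it
 * is used in place of the first allocation.  Returns a pointer to the
 * installed PTE.
 */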
pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
    int level;
    pteval_t *pt = cr3;
    unsigned offset;

    for (level = PAGE_LEVEL; level > pte_level; --level) {
	offset = PGDIR_OFFSET((uintptr_t)virt, level);
	if (!(pt[offset] & PT_PRESENT_MASK)) {
	    pteval_t *new_pt = pt_page;
	    if (!new_pt)
		new_pt = alloc_page();
	    else
		pt_page = 0;
	    memset(new_pt, 0, PAGE_SIZE);
	    pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
	    pt[offset] |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
	}
	pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
    }
    offset = PGDIR_OFFSET((uintptr_t)virt, level);
    pt[offset] = pte;
    return &pt[offset];
}

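/*
 * Illustrative sketch (not in the original file): installing a read-only
 * 4K leaf PTE by hand, e.g. to provoke a #PF on write.  @va and @page are
 * hypothetical; the page table root comes from current_page_table().
 *
 *	pteval_t *ptep = install_pte(current_page_table(), 1, va,
 *				     virt_to_phys(page) | PT_PRESENT_MASK, 0);
 */
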
/*
 * Finds last PTE in the mapping of @virt that's at or above @lowest_level. The
 * returned PTE isn't necessarily present, but its parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		pt = phys_to_virt(pte & PT_ADDR_MASK);
	}
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge
 * PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the PT at @pte_level isn't present (i.e., the mapping at
 * @pte_level - 1 isn't present).
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}

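/*
 * Illustrative sketch (not in the original file): checking whether a
 * hypothetical @va is currently backed by a 2M huge page.
 *
 *	pteval_t *pdep = get_pte_level(current_page_table(), va, 2);
 *	bool huge = pdep && (*pdep & PT_PRESENT_MASK) &&
 *		    (*pdep & PT_PAGE_SIZE_MASK);
 */
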
pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask | PT_PAGE_SIZE_MASK;
#ifdef CONFIG_EFI
    flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
    return install_pte(cr3, 2, virt, phys | flags, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
    flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
    return install_pte(cr3, 1, virt, phys | flags, 0);
}

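/*
 * Illustrative sketch (not in the original file): mapping a freshly
 * allocated page at a hypothetical virtual address @va.
 *
 *	void *page = alloc_page();
 *	install_page(current_page_table(), virt_to_phys(page), va);
 */
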
void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;
	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);
		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}

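/*
 * Identity maps the physical range [@start, @start + @len).  @mmu_flags can
 * request user-accessible PTEs (X86_MMU_MAP_USER) and/or large pages for the
 * bulk of the range (X86_MMU_MAP_HUGE); the PTE option mask is restored
 * before returning.
 */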
void __setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len,
		       enum x86_mmu_flags mmu_flags)
{
	u64 orig_opt_mask = pte_opt_mask;
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	if (mmu_flags & X86_MMU_MAP_USER)
		pte_opt_mask |= PT_USER_MASK;

	if (mmu_flags & X86_MMU_MAP_HUGE) {
		while (phys + LARGE_PAGE_SIZE <= max) {
			install_large_page(cr3, phys, (void *)(ulong)phys);
			phys += LARGE_PAGE_SIZE;
		}
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);

	pte_opt_mask = orig_opt_mask;
}

static inline void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	__setup_mmu_range(cr3, start, len, X86_MMU_MAP_HUGE);
}

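/*
 * Illustrative sketch (not in the original file): identity mapping a range
 * with user-accessible 4K PTEs, e.g. ahead of a CPL3 test.  @start and @len
 * are hypothetical.
 *
 *	__setup_mmu_range(current_page_table(), start, len, X86_MMU_MAP_USER);
 */
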
static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}

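/*
 * Builds identity-mapped page tables (all of memory plus 1:1 MMIO up to 4G
 * on x86-64; the low 2G and the 3G-4G range on 32-bit), enables paging on
 * the boot CPU, and propagates CR0/CR3/CR4 to all other CPUs.  @opt_mask,
 * if non-NULL, overrides the default PTE option mask (PT_USER_MASK).
 * Returns the new root page table.
 */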
void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
    pgd_t *cr3 = alloc_page();
    struct vm_vcpu_info info;
    int i;

    if (opt_mask)
	pte_opt_mask = *(pteval_t *)opt_mask;
    else
	pte_opt_mask = PT_USER_MASK;

    memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
    if (end_of_memory < (1ul << 32))
        end_of_memory = (1ul << 32);  /* map mmio 1:1 */

    setup_mmu_range(cr3, 0, end_of_memory);
#else
    setup_mmu_range(cr3, 0, (2ul << 30));
    setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
    init_alloc_vpage((void*)(3ul << 30));
#endif

    write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
    write_cr4(X86_CR4_PSE);
#endif
    write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

    printf("paging enabled\n");
    printf("cr0 = %lx\n", read_cr0());
    printf("cr3 = %lx\n", read_cr3());
    printf("cr4 = %lx\n", read_cr4());

    info.cr3 = read_cr3();
    info.cr4 = read_cr4();
    info.cr0 = read_cr0();

    for (i = 1; i < cpu_count(); i++)
        on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

    return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
    return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}

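/*
 * Illustrative sketch (not in the original file): resolving the physical
 * address backing a hypothetical virtual address @va through the active
 * page tables.
 *
 *	phys_addr_t pa = virt_to_pte_phys(current_page_table(), va);
 */
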
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 *   @ptep : large page table entry to split
 *   @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present bit
	 * for the page being split before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway) makes
	 * it impossible to continue fetching instructions!
	 */
	flush_tlb();
}

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, @addr, translates to a 4k page.
 *
 *   @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}

/*
 * Call the callback on each page from virt to virt + len.
 */
void walk_pte(void *virt, size_t len, pte_callback_t callback)
{
    pgd_t *cr3 = current_page_table();
    uintptr_t start = (uintptr_t)virt;
    uintptr_t end = (uintptr_t)virt + len;
    struct pte_search search;
    size_t page_size;
    uintptr_t curr;

    for (curr = start; curr < end; curr = ALIGN_DOWN(curr + page_size, page_size)) {
        search = find_pte_level(cr3, (void *)curr, 1);
        assert(found_leaf_pte(search));
        page_size = 1ul << PGDIR_BITS(search.level);

        callback(search, (void *)curr);
    }
}
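
/*
 * Illustrative sketch (not in the original file): a pte_callback_t that
 * clears the dirty bit on every leaf PTE in a range, assuming the usual
 * PT_DIRTY_MASK definition from processor.h.
 *
 *	static void clear_dirty_cb(struct pte_search search, void *va)
 *	{
 *		*search.pte &= ~PT_DIRTY_MASK;
 *	}
 *
 *	walk_pte(buf, len, clear_dirty_cb);
 *	flush_tlb();
 */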