/* kvm-unit-tests: lib/x86/vm.c (revision c98ce6e0f823e2aaccdf6af60103a71853ad6f92) */
#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

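/*
 * Optional attribute bits OR'ed into the page-table entries created by
 * install_pte(), install_page() and install_large_page(). Set by
 * setup_mmu(); defaults to PT_USER_MASK when no mask is supplied.
 */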
static pteval_t pte_opt_mask;

pteval_t *install_pte(pgd_t *cr3,
                      int pte_level,
                      void *virt,
                      pteval_t pte,
                      pteval_t *pt_page)
{
    int level;
    pteval_t *pt = cr3;
    unsigned offset;

    for (level = PAGE_LEVEL; level > pte_level; --level) {
        offset = PGDIR_OFFSET((uintptr_t)virt, level);
        if (!(pt[offset] & PT_PRESENT_MASK)) {
            pteval_t *new_pt = pt_page;

            if (!new_pt)
                new_pt = alloc_page();
            else
                pt_page = NULL;
            memset(new_pt, 0, PAGE_SIZE);
            pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK |
                         PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
            pt[offset] |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
        }
        pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
    }
    offset = PGDIR_OFFSET((uintptr_t)virt, level);
    pt[offset] = pte;
    return &pt[offset];
}

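/*
 * Illustrative sketch (not part of the library): install a single 4K
 * mapping by hand. Level 1 is the 4K PTE level; install_pte() allocates
 * any missing intermediate tables. The virtual address is an arbitrary
 * example value.
 */
#if 0
static void example_install_pte(pgd_t *cr3)
{
    void *va = (void *)0x1234000ul;
    phys_addr_t pa = virt_to_phys(alloc_page());

    install_pte(cr3, 1, va, pa | PT_PRESENT_MASK | PT_WRITABLE_MASK, NULL);
}
#endif
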
/*
 * Finds the last PTE in the mapping of @virt that's at or above
 * @lowest_level. The returned PTE isn't necessarily present, but its parent
 * is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		pt = phys_to_virt(pte & PT_ADDR_MASK);
	}
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., a 4K PTE or a present
 * huge PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}

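/*
 * Illustrative sketch (not part of the library): probe whether an address
 * is currently mapped, using only functions defined in this file.
 */
#if 0
static bool example_is_mapped(pgd_t *cr3, void *va)
{
    pteval_t *ptep = get_pte(cr3, va);

    return ptep && (*ptep & PT_PRESENT_MASK);
}
#endif
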
/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the PT at @pte_level isn't present (i.e., the mapping at
 * @pte_level - 1 isn't present).
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}

pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask |
                        PT_PAGE_SIZE_MASK;
#ifdef CONFIG_EFI
    flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
    return install_pte(cr3, 2, virt, phys | flags, NULL);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
    flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
    return install_pte(cr3, 1, virt, phys | flags, NULL);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;

	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);

		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}

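/*
 * Illustrative sketch (not part of the library): map a three-page physical
 * range and verify that the range became present. The addresses are
 * arbitrary example values.
 */
#if 0
static void example_map_range(pgd_t *cr3)
{
    void *va = (void *)0x2000000ul;
    phys_addr_t pa = 0x2000000ul;

    install_pages(cr3, pa, 3 * PAGE_SIZE, va);
    assert(any_present_pages(cr3, va, 3 * PAGE_SIZE));
}
#endif

/*
 * Identity map [start, start + len): large pages first, then 4K pages for
 * the remainder.
 */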
static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	while (phys + LARGE_PAGE_SIZE <= max) {
		install_large_page(cr3, phys, (void *)(ulong)phys);
		phys += LARGE_PAGE_SIZE;
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

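/*
 * Mirror the boot CPU's paging-related control registers on a secondary
 * vCPU; setup_mmu() runs this on each AP via on_cpu().
 */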
static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}

void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
    pgd_t *cr3 = alloc_page();
    struct vm_vcpu_info info;
    int i;

    if (opt_mask)
        pte_opt_mask = *(pteval_t *)opt_mask;
    else
        pte_opt_mask = PT_USER_MASK;

    memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
    if (end_of_memory < (1ul << 32))
        end_of_memory = (1ul << 32);  /* map mmio 1:1 */

    setup_mmu_range(cr3, 0, end_of_memory);
#else
    setup_mmu_range(cr3, 0, (2ul << 30));
    setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
    init_alloc_vpage((void *)(3ul << 30));
#endif

    write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
    write_cr4(X86_CR4_PSE);
#endif
    write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

    printf("paging enabled\n");
    printf("cr0 = %lx\n", read_cr0());
    printf("cr3 = %lx\n", read_cr3());
    printf("cr4 = %lx\n", read_cr4());

    info.cr3 = read_cr3();
    info.cr4 = read_cr4();
    info.cr0 = read_cr0();

    for (i = 1; i < cpu_count(); i++)
        on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

    return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
    return (*get_pte(cr3, mem) & PT_ADDR_MASK) +
           ((ulong)mem & (PAGE_SIZE - 1));
}

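/*
 * Illustrative sketch (not part of the library): translate a virtual
 * address through the active page tables. Under the identity map built by
 * setup_mmu(), the result equals the address itself for normally-mapped
 * memory.
 */
#if 0
static void example_translate(void *va)
{
    phys_addr_t pa = virt_to_pte_phys(current_page_table(), va);

    printf("va %p -> pa %lx\n", va, (ulong)pa);
}
#endif
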
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 *   @ptep : large page table entry to split
 *   @level : level of @ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present bit
	 * for the page being split before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway) makes
	 * it impossible to continue fetching instructions!
	 */
	flush_tlb();
}

/*
 * force_4k_page: Ensures that @addr translates to a 4K page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, @addr, translates to a 4K page.
 *
 *   @addr: target address that should be mapped to a 4K page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}

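/*
 * Illustrative usage (not part of the library): demote the mapping covering
 * an address to 4K, then write-protect just that one page.
 */
#if 0
static void example_write_protect(void *addr)
{
    pteval_t *ptep;

    force_4k_page(addr);
    ptep = get_pte(current_page_table(), addr);
    *ptep &= ~PT_WRITABLE_MASK;
    flush_tlb();
}
#endif
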
/*
 * Call @callback on each page in the mapping of [@virt, @virt + @len).
 */
void walk_pte(void *virt, size_t len, pte_callback_t callback)
{
    pgd_t *cr3 = current_page_table();
    uintptr_t start = (uintptr_t)virt;
    uintptr_t end = (uintptr_t)virt + len;
    struct pte_search search;
    size_t page_size;
    uintptr_t curr;

    for (curr = start; curr < end; curr = ALIGN_DOWN(curr + page_size, page_size)) {
        search = find_pte_level(cr3, (void *)curr, 1);
        assert(found_leaf_pte(search));
        page_size = 1ul << PGDIR_BITS(search.level);

        callback(search, (void *)curr);
    }
}
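
/*
 * Illustrative sketch (not part of the library): a walk_pte() callback that
 * clears the accessed bit on every leaf PTE in a range. Assumes
 * PT_ACCESSED_MASK from processor.h.
 */
#if 0
static void clear_accessed_cb(struct pte_search search, void *va)
{
    *search.pte &= ~PT_ACCESSED_MASK;
}

static void example_clear_accessed(void *virt, size_t len)
{
    walk_pte(virt, len, clear_accessed_cb);
    flush_tlb();
}
#endif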