/* kvm-unit-tests: lib/x86/vm.c (revision 48f6791013dafc10b5ce039f8c7d31efcadd2a64) */
#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

/*
 * Installs @pte at @pte_level in the page table rooted at @cr3 for the
 * mapping of @virt. Missing intermediate page tables are allocated, or taken
 * from the optional caller-supplied @pt_page, and are installed present,
 * writable and user-accessible. Returns a pointer to the installed entry.
 */
pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
    int level;
    pteval_t *pt = cr3;
    unsigned offset;

    for (level = PAGE_LEVEL; level > pte_level; --level) {
	offset = PGDIR_OFFSET((uintptr_t)virt, level);
	if (!(pt[offset] & PT_PRESENT_MASK)) {
	    pteval_t *new_pt = pt_page;
	    if (!new_pt)
		new_pt = alloc_page();
	    else
		pt_page = NULL;	/* the preallocated table is now consumed */
	    memset(new_pt, 0, PAGE_SIZE);
	    pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
	}
	pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
    }
    offset = PGDIR_OFFSET((uintptr_t)virt, level);
    pt[offset] = pte;
    return &pt[offset];
}

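/*
 * Illustrative sketch (not part of the original library): a hypothetical
 * helper that installs a read-only, user-accessible 4K mapping by handing
 * install_pte() a custom PTE without PT_WRITABLE_MASK.
 */
static inline pteval_t *example_install_ro_page(pgd_t *cr3, void *virt)
{
	void *page = alloc_page();

	/* pte_level 1 requests a 4K PTE; no preallocated table is passed. */
	return install_pte(cr3, 1, virt,
			   virt_to_phys(page) | PT_PRESENT_MASK | PT_USER_MASK,
			   NULL);
}
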
/*
 * Finds the last PTE in the mapping of @virt that's at or above
 * @lowest_level. The returned PTE isn't necessarily present, but its parent
 * is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		/* A present 2M or 1G leaf ends the walk early. */
		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		/* Bits 51:12 of the entry give the next table's physical address. */
		pt = phys_to_virt(pte & 0xffffffffff000ull);
	}
}

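/*
 * Illustrative sketch: a hypothetical helper built on find_pte_level() and
 * found_leaf_pte() (declared alongside struct pte_search in the headers
 * included above) that reports whether @virt is backed by a huge page.
 */
static inline bool example_is_huge_mapping(pgd_t *cr3, void *virt)
{
	struct pte_search search = find_pte_level(cr3, virt, 1);

	/* A leaf found above level 1 can only be a present huge page. */
	return found_leaf_pte(search) && search.level > 1;
}
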
/*
 * Returns the leaf PTE in the mapping of @virt (i.e., a 4K PTE or a present
 * huge PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the walk stops above @pte_level, i.e. a higher-level
 * entry is not present or is a huge-page leaf.
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}

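/*
 * Illustrative sketch: a hypothetical check built on get_pte_level(),
 * inspecting the level-2 entry (PDE) of @virt for a present large-page
 * (PS=1) mapping.
 */
static inline bool example_has_large_pde(pgd_t *cr3, void *virt)
{
	pteval_t *pde = get_pte_level(cr3, virt, 2);

	return pde && (*pde & PT_PRESENT_MASK) && (*pde & PT_PAGE_SIZE_MASK);
}
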
pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    return install_pte(cr3, 2, virt,
		       phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK | PT_PAGE_SIZE_MASK, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    return install_pte(cr3, 1, virt, phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;
	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

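/*
 * Illustrative sketch: mapping a physically contiguous buffer of @npages
 * pages at a caller-chosen virtual address with install_pages(). All
 * addresses must be page-aligned, as the asserts above enforce.
 */
static inline void example_map_buffer(pgd_t *cr3, phys_addr_t buf_phys,
				      void *buf_virt, size_t npages)
{
	install_pages(cr3, buf_phys, npages * PAGE_SIZE, buf_virt);
}
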
bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);
		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}

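/*
 * Illustrative sketch: a hypothetical guard that asserts a virtual range is
 * entirely unmapped before a test claims it for its own mappings.
 */
static inline void example_assert_unmapped(pgd_t *cr3, void *virt, size_t len)
{
	assert(!any_present_pages(cr3, virt, len));
}
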
static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	/* Cover as much of the range as possible with large pages ... */
	while (phys + LARGE_PAGE_SIZE <= max) {
		install_large_page(cr3, phys, (void *)(ulong)phys);
		phys += LARGE_PAGE_SIZE;
	}
	/* ... and map the remainder with 4K pages. */
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

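/*
 * Illustrative sketch: identity-mapping a 5M range. Assuming x86_64's 2M
 * LARGE_PAGE_SIZE, the loop above installs two large pages for [0, 4M) and
 * install_pages() finishes with 256 4K pages for [4M, 5M).
 */
static inline void example_identity_map_5m(pgd_t *cr3)
{
	setup_mmu_range(cr3, 0, 5ul << 20);
}
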
/* Propagate the boot CPU's paging configuration to a secondary vCPU. */
static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}

void *setup_mmu(phys_addr_t end_of_memory)
{
    pgd_t *cr3 = alloc_page();
    struct vm_vcpu_info info;
    int i;

    memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
    if (end_of_memory < (1ul << 32))
        end_of_memory = (1ul << 32);  /* map mmio 1:1 */

    setup_mmu_range(cr3, 0, end_of_memory);
#else
    setup_mmu_range(cr3, 0, (2ul << 30));
    setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
    init_alloc_vpage((void*)(3ul << 30));
#endif

    write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
    write_cr4(X86_CR4_PSE);
#endif
    write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

    printf("paging enabled\n");
    printf("cr0 = %lx\n", read_cr0());
    printf("cr3 = %lx\n", read_cr3());
    printf("cr4 = %lx\n", read_cr4());

    info.cr3 = read_cr3();
    info.cr4 = read_cr4();
    info.cr0 = read_cr0();

    for (i = 1; i < cpu_count(); i++)
        on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

    return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
    return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}

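/*
 * Illustrative sketch: for the identity map established by setup_mmu(), the
 * table walk performed by virt_to_pte_phys() should agree with the direct
 * virt_to_phys() translation.
 */
static inline void example_check_identity_map(pgd_t *cr3, void *mem)
{
	assert(virt_to_pte_phys(cr3, mem) == virt_to_phys(mem));
}
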
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 *   @ptep : large page table entry to split
 *   @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	/*
	 * Keep the flag bits of the large entry. PS must be dropped for a
	 * level-2 split: in the resulting 4K PTEs, bit 7 is PAT, not PS. A
	 * level-3 split produces 2M entries that keep PS=1.
	 */
	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present
	 * bit for the page being split, before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway) makes
	 * it impossible to continue fetching instructions!
	 */
	flush_tlb();
}

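/*
 * Illustrative sketch: splitting only the level-3 (1G) entry covering
 * @addr, leaving the resulting 2M entries intact. force_4k_page() below
 * performs both this step and the subsequent level-2 split.
 */
static inline void example_split_1g_only(void *addr)
{
	unsigned long *ptep = get_pte_level(current_page_table(), addr, 3);

	assert(ptep && (*ptep & PT_PRESENT_MASK));
	if (*ptep & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);
}
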
/*
 * force_4k_page: Ensures that addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, addr, translates to a 4k page.
 *
 *   @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}
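
/*
 * Illustrative sketch: making a single 4K page read-only. force_4k_page()
 * guarantees the address is no longer covered by a huge page, so clearing
 * the writable bit affects exactly one page.
 */
static inline void example_make_page_readonly(void *addr)
{
	pteval_t *ptep;

	force_4k_page(addr);
	ptep = get_pte(current_page_table(), addr);
	assert(ptep && (*ptep & PT_PRESENT_MASK));
	*ptep &= ~(pteval_t)PT_WRITABLE_MASK;
	flush_tlb();
}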