xref: /kvm-unit-tests/lib/x86/vm.c (revision d74708246bd9a593e03ecca476a5f1ed36e47288)
#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

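/*
 * Installs @pte for @virt at @pte_level in the page tables rooted at @cr3,
 * allocating any missing intermediate page-table pages along the way. If
 * @pt_page is non-NULL it is consumed for the first page table that has to be
 * created; otherwise new tables come from alloc_page(). Returns a pointer to
 * the installed PTE.
 */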
pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
    int level;
    pteval_t *pt = cr3;
    unsigned offset;

    for (level = PAGE_LEVEL; level > pte_level; --level) {
	offset = PGDIR_OFFSET((uintptr_t)virt, level);
	if (!(pt[offset] & PT_PRESENT_MASK)) {
	    pteval_t *new_pt = pt_page;
	    if (!new_pt)
		new_pt = alloc_page();
	    else
		pt_page = NULL;
	    memset(new_pt, 0, PAGE_SIZE);
	    pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
	}
	pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
    }
    offset = PGDIR_OFFSET((uintptr_t)virt, level);
    pt[offset] = pte;
    return &pt[offset];
}

/*
 * Finds the last PTE in the mapping of @virt that is at or above @lowest_level.
 * The walk also stops early at a present large-page PTE. The returned PTE isn't
 * necessarily present, but its parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		pt = phys_to_virt(pte & PT_ADDR_MASK);
	}
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., a 4k PTE or a present
 * huge PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the PT at @pte_level isn't present, i.e., the entry at
 * @pte_level + 1 isn't present or maps a large page.
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}

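/*
 * Convenience wrappers around install_pte(): map @phys at @virt with a single
 * present, writable, user PTE, either as a large page (level 2) or as a
 * regular 4k page (level 1).
 */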
pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    return install_pte(cr3, 2, virt,
		       phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK | PT_PAGE_SIZE_MASK, NULL);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    return install_pte(cr3, 1, virt, phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK, NULL);
}

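/*
 * Maps the physical range [@phys, @phys + @len) at @virt, one 4k page at a
 * time. @phys, @virt and @len must all be page aligned.
 */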
void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;
	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

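/*
 * Returns true if any 4k page in [@virt, @virt + @len) is mapped by a present
 * leaf PTE, false otherwise.
 */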
bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);
		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}

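/*
 * Identity maps the physical range [@start, @start + @len) into @cr3, using
 * large pages where the range allows it and 4k pages for the remainder.
 */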
static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	while (phys + LARGE_PAGE_SIZE <= max) {
		install_large_page(cr3, phys, (void *)(ulong)phys);
		phys += LARGE_PAGE_SIZE;
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

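/* Propagates the boot CPU's paging-related control registers to another CPU. */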
static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}

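/*
 * Builds identity-mapped page tables covering guest memory (and the MMIO
 * region below 4G), enables paging on the current CPU, and then mirrors
 * CR0/CR3/CR4 onto every other CPU. Returns the new root page table.
 */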
void *setup_mmu(phys_addr_t end_of_memory)
{
    pgd_t *cr3 = alloc_page();
    struct vm_vcpu_info info;
    int i;

    memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
    if (end_of_memory < (1ul << 32))
        end_of_memory = (1ul << 32);  /* map mmio 1:1 */

    setup_mmu_range(cr3, 0, end_of_memory);
#else
    setup_mmu_range(cr3, 0, (2ul << 30));
    setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
    init_alloc_vpage((void*)(3ul << 30));
#endif

    write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
    write_cr4(X86_CR4_PSE);
#endif
    write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

    printf("paging enabled\n");
    printf("cr0 = %lx\n", read_cr0());
    printf("cr3 = %lx\n", read_cr3());
    printf("cr4 = %lx\n", read_cr4());

    info.cr3 = read_cr3();
    info.cr4 = read_cr4();
    info.cr0 = read_cr0();

    for (i = 1; i < cpu_count(); i++)
        on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

    return cr3;
}

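/*
 * Translates @mem through the page tables rooted at @cr3 and returns the
 * backing physical address (assuming @mem is mapped by a 4k leaf PTE).
 */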
phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
    return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}

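/*
 * Illustrative sketch only (not part of the original file): how the helpers
 * above typically compose in a test. The helper name and the physical address
 * used here are made up for illustration; a real test would use memory it owns.
 */
static inline void example_identity_map_one_page(pgd_t *cr3)
{
	phys_addr_t phys = 0x1000;		/* hypothetical test page */
	void *virt = (void *)(ulong)phys;

	/* Map one writable, user-accessible 4k page and check the walk. */
	install_page(cr3, phys, virt);
	assert(virt_to_pte_phys(cr3, virt) == phys);
}
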
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 *   @ptep : large page table entry to split
 *   @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present bit
	 * for the page being split, before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway) makes
	 * it impossible to continue fetching instructions!
	 */
	flush_tlb();
}

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the target
 * address, @addr, translates to a 4k page.
 *
 *   @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}
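
/*
 * Illustrative sketch only (not part of the original file): a test typically
 * calls force_4k_page() when it wants a dedicated 4k PTE whose bits it can
 * then flip, e.g. to make a single page read-only. The helper name below is
 * made up for illustration.
 */
static inline void example_make_page_readonly(void *addr)
{
	unsigned long *cr3 = current_page_table();
	pteval_t *ptep;

	force_4k_page(addr);
	ptep = get_pte(cr3, addr);
	assert(ptep && (*ptep & PT_PRESENT_MASK));

	/* Clear the writable bit on just this 4k page and flush the TLB. */
	*ptep &= ~PT_WRITABLE_MASK;
	flush_tlb();
}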
277