#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

static pteval_t pte_opt_mask;

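/*
 * Installs @pte for @virt at @pte_level in the page table rooted at @cr3,
 * allocating any missing intermediate tables. If @pt_page is non-NULL it is
 * consumed for the first missing table instead of calling alloc_page().
 * Returns a pointer to the installed entry.
 */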
pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
	int level;
	pteval_t *pt = cr3;
	unsigned offset;

	for (level = PAGE_LEVEL; level > pte_level; --level) {
		offset = PGDIR_OFFSET((uintptr_t)virt, level);
		if (!(pt[offset] & PT_PRESENT_MASK)) {
			pteval_t *new_pt = pt_page;

			if (!new_pt)
				new_pt = alloc_page();
			else
				pt_page = NULL;
			memset(new_pt, 0, PAGE_SIZE);
			pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK |
				     PT_WRITABLE_MASK | pte_opt_mask;
		}
		pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
	}
	offset = PGDIR_OFFSET((uintptr_t)virt, pte_level);
	pt[offset] = pte;
	return &pt[offset];
}
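
/*
 * Illustrative sketch only (not used in this file): installing a present,
 * read-only 4K mapping of a freshly allocated page at some virtual address
 * @va, given a root table @cr3:
 *
 *	pteval_t *pte = install_pte(cr3, 1, va,
 *				    virt_to_phys(alloc_page()) |
 *				    PT_PRESENT_MASK, NULL);
 */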

/*
 * Finds the last PTE in the mapping of @virt that's at or above
 * @lowest_level. The returned PTE isn't necessarily present, but its
 * parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		offset = PGDIR_OFFSET((uintptr_t)virt, r.level);
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		pt = phys_to_virt(pte & PT_ADDR_MASK);
	}
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge
 * PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the walk terminates before reaching @pte_level, i.e., if
 * a higher-level table entry isn't present or maps a huge page.
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}

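/* Installs a present, writable large page (level 2) mapping @virt to @phys. */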
pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	return install_pte(cr3, 2, virt,
			   phys | PT_PRESENT_MASK | PT_WRITABLE_MASK |
			   pte_opt_mask | PT_PAGE_SIZE_MASK, NULL);
}

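/* Installs a present, writable 4K page mapping @virt to @phys. */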
pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	return install_pte(cr3, 1, virt,
			   phys | PT_PRESENT_MASK | PT_WRITABLE_MASK |
			   pte_opt_mask, NULL);
}

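/*
 * Maps the @len-byte physical range starting at @phys to @virt with 4K
 * pages. @phys, @virt, and @len must all be page aligned.
 */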
void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;

	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

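/*
 * Returns true if any page in the @len-byte virtual range starting at @virt
 * is mapped by a present leaf PTE.
 */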
bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);

		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}

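/*
 * Identity-maps the physical range [@start, @start + @len), using large
 * pages where possible and 4K pages for the remainder.
 */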
static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	while (phys + LARGE_PAGE_SIZE <= max) {
		install_large_page(cr3, phys, (void *)(ulong)phys);
		phys += LARGE_PAGE_SIZE;
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

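/*
 * Applies the boot CPU's paging state to the current CPU; run on each
 * additional vCPU via on_cpu() so all CPUs share the same CR0/CR3/CR4.
 */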
static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}

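/*
 * Allocates a root page table, identity-maps physical memory (plus the
 * 32-bit MMIO range on x86_64), enables paging on the calling CPU, and
 * mirrors the resulting CR0/CR3/CR4 onto all other vCPUs. @opt_mask, if
 * non-NULL, points to the option bits OR'ed into new PTEs; otherwise
 * PT_USER_MASK is used. Returns a pointer to the root page table.
 */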
void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
	pgd_t *cr3 = alloc_page();
	struct vm_vcpu_info info;
	int i;

	if (opt_mask)
		pte_opt_mask = *(pteval_t *)opt_mask;
	else
		pte_opt_mask = PT_USER_MASK;

	memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
	if (end_of_memory < (1ul << 32))
		end_of_memory = (1ul << 32);  /* map mmio 1:1 */

	setup_mmu_range(cr3, 0, end_of_memory);
#else
	setup_mmu_range(cr3, 0, (2ul << 30));
	setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
	init_alloc_vpage((void *)(3ul << 30));
#endif

	write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
	write_cr4(X86_CR4_PSE);
#endif
	write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

	printf("paging enabled\n");
	printf("cr0 = %lx\n", read_cr0());
	printf("cr3 = %lx\n", read_cr3());
	printf("cr4 = %lx\n", read_cr4());

	info.cr3 = read_cr3();
	info.cr4 = read_cr4();
	info.cr0 = read_cr0();

	for (i = 1; i < cpu_count(); i++)
		on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

	return cr3;
}

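/*
 * Translates @mem to its physical address via @cr3's page tables. Note the
 * offset math assumes @mem is mapped with a 4K page.
 */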
phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
	return (*get_pte(cr3, mem) & PT_ADDR_MASK) +
	       ((ulong)mem & (PAGE_SIZE - 1));
}

/*
 * split_large_page: Splits a 2M/1G large page into 512 smaller PTEs.
 *   @ptep : large page table entry to split
 *   @level : level of @ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	/*
	 * Carry the large page's attributes over to the new entries. When
	 * splitting a 1G page the children are 2M pages and keep PS set;
	 * when splitting a 2M page the children are 4K PTEs, where bit 7
	 * is PAT rather than PS and must be cleared.
	 */
	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present
	 * bit for the page being split before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway) makes
	 * it impossible to continue fetching instructions!
	 */
	flush_tlb();
}

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, @addr, translates to a 4k page.
 *
 *   @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}