#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

static pteval_t pte_opt_mask;

/*
 * Walk the page tables rooted at @cr3 and install @pte at @pte_level for
 * @virt, allocating intermediate page tables as needed.  If @pt_page is
 * non-NULL, it is consumed for the first missing intermediate table instead
 * of calling alloc_page().  Returns a pointer to the installed PTE.
 */
pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
	int level;
	pteval_t *pt = cr3;
	unsigned offset;

	for (level = PAGE_LEVEL; level > pte_level; --level) {
		offset = PGDIR_OFFSET((uintptr_t)virt, level);
		if (!(pt[offset] & PT_PRESENT_MASK)) {
			pteval_t *new_pt = pt_page;
			if (!new_pt)
				new_pt = alloc_page();
			else
				pt_page = 0;
			memset(new_pt, 0, PAGE_SIZE);
			pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
			pt[offset] |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
		}
		pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
	}
	offset = PGDIR_OFFSET((uintptr_t)virt, level);
	pt[offset] = pte;
	return &pt[offset];
}
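
/*
 * Illustrative sketch (not part of this file): installing a single
 * read-only 4K mapping by hand; install_page() below is the usual wrapper,
 * and the addresses here are made up for the example.
 *
 *	pteval_t *ptep = install_pte(cr3, 1, (void *)0x1000,
 *				     0x1000 | PT_PRESENT_MASK, NULL);
 *	assert(*ptep & PT_PRESENT_MASK);
 */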

/*
 * Finds the last PTE in the mapping of @virt that's at or above
 * @lowest_level.  The returned PTE isn't necessarily present, but its
 * parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		pt = phys_to_virt(pte & PT_ADDR_MASK);
	}
}
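
/*
 * Illustrative sketch (not part of this file): probing how far a walk
 * gets.  For an address backed by a present 2M page, the walk stops at
 * level 2 even when level 1 is requested; @addr is a placeholder.
 *
 *	struct pte_search s = find_pte_level(cr3, addr, 1);
 *	assert(found_leaf_pte(s) && s.level == 2);
 */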

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., a 4K PTE or a present
 * huge PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the walk terminates above @pte_level, i.e. if the page
 * table holding the level-@pte_level PTE isn't present or @virt is covered
 * by a huge page at a higher level.
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}
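
/*
 * Illustrative sketch (not part of this file): for an address backed by a
 * 2M page, the leaf PTE lives at level 2, so a level-1 query fails; @addr
 * is a placeholder.
 *
 *	assert(get_pte(cr3, addr) == get_pte_level(cr3, addr, 2));
 *	assert(get_pte_level(cr3, addr, 1) == NULL);
 */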

pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask | PT_PAGE_SIZE_MASK;
#ifdef CONFIG_EFI
	flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
	return install_pte(cr3, 2, virt, phys | flags, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
	flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
	return install_pte(cr3, 1, virt, phys | flags, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;
	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}
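
/*
 * Illustrative sketch (not part of this file): identity-mapping a 64K
 * region one 4K page at a time; the base address is made up.
 *
 *	install_pages(cr3, 0x200000, 16 * PAGE_SIZE, (void *)0x200000);
 */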

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);
		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}
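
/*
 * Illustrative sketch (not part of this file): checking that a candidate
 * virtual range is unused before mapping it; @start and @size are
 * placeholders.
 *
 *	assert(!any_present_pages(cr3, start, size));
 */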

void __setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len,
		       enum x86_mmu_flags mmu_flags)
{
	u64 orig_opt_mask = pte_opt_mask;
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	if (mmu_flags & X86_MMU_MAP_USER)
		pte_opt_mask |= PT_USER_MASK;

	if (mmu_flags & X86_MMU_MAP_HUGE) {
		while (phys + LARGE_PAGE_SIZE <= max) {
			install_large_page(cr3, phys, (void *)(ulong)phys);
			phys += LARGE_PAGE_SIZE;
		}
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);

	pte_opt_mask = orig_opt_mask;
}
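
/*
 * Illustrative sketch (not part of this file): identity-mapping a range
 * with user-accessible 2M pages where the range allows it; @start and
 * @len are placeholders.
 *
 *	__setup_mmu_range(cr3, start, len,
 *			  X86_MMU_MAP_HUGE | X86_MMU_MAP_USER);
 */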

static inline void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	__setup_mmu_range(cr3, start, len, X86_MMU_MAP_HUGE);
}

static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}

void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
	pgd_t *cr3 = alloc_page();
	struct vm_vcpu_info info;
	int i;

	if (opt_mask)
		pte_opt_mask = *(pteval_t *)opt_mask;
	else
		pte_opt_mask = PT_USER_MASK;

	memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
	if (end_of_memory < (1ul << 32))
		end_of_memory = (1ul << 32); /* map mmio 1:1 */

	setup_mmu_range(cr3, 0, end_of_memory);
#else
	setup_mmu_range(cr3, 0, (2ul << 30));
	setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
	init_alloc_vpage((void*)(3ul << 30));
#endif

	write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
	write_cr4(X86_CR4_PSE);
#endif
	write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

	printf("paging enabled\n");
	printf("cr0 = %lx\n", read_cr0());
	printf("cr3 = %lx\n", read_cr3());
	printf("cr4 = %lx\n", read_cr4());

	info.cr3 = read_cr3();
	info.cr4 = read_cr4();
	info.cr0 = read_cr0();

	for (i = 1; i < cpu_count(); i++)
		on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

	return cr3;
}
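
/*
 * Illustrative sketch (not part of this file): a caller that wants
 * supervisor-only mappings can pass its own option mask instead of the
 * default PT_USER_MASK; @end_of_memory is a placeholder.
 *
 *	pteval_t opt = 0;
 *	pgd_t *cr3 = setup_mmu(end_of_memory, &opt);
 */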

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
	return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}
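
/*
 * Illustrative sketch (not part of this file): under the identity map
 * built by setup_mmu(), a virtual address translates to itself.
 *
 *	void *p = alloc_page();
 *	assert(virt_to_pte_phys(current_page_table(), p) == virt_to_phys(p));
 */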

/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 * @ptep : large page table entry to split
 * @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present
	 * bit for the page being split, before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway)
	 * makes it impossible to continue fetching instructions!
	 */
	flush_tlb();
}
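
/*
 * Illustrative sketch (not part of this file): demoting a 2M mapping of
 * @addr (a placeholder) to 4K pages by hand; force_4k_page() below wraps
 * this pattern.
 *
 *	unsigned long *pdep = get_pte_level(current_page_table(), addr, 2);
 *	if (*pdep & PT_PAGE_SIZE_MASK)
 *		split_large_page(pdep, 2);
 */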

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, @addr, translates to a 4k page.
 *
 * @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}
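
/*
 * Illustrative sketch (not part of this file): a test that wants to unmap
 * a single 4K page first demotes any covering huge mapping; @addr is a
 * placeholder.
 *
 *	force_4k_page(addr);
 *	*get_pte_level(current_page_table(), addr, 1) &= ~PT_PRESENT_MASK;
 *	flush_tlb();
 */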

/*
 * Call @callback on each page from @virt to @virt + @len.
 */
void walk_pte(void *virt, size_t len, pte_callback_t callback)
{
	pgd_t *cr3 = current_page_table();
	uintptr_t start = (uintptr_t)virt;
	uintptr_t end = (uintptr_t)virt + len;
	struct pte_search search;
	size_t page_size;
	uintptr_t curr;

	for (curr = start; curr < end; curr = ALIGN_DOWN(curr + page_size, page_size)) {
		search = find_pte_level(cr3, (void *)curr, 1);
		assert(found_leaf_pte(search));
		page_size = 1ul << PGDIR_BITS(search.level);

		callback(search, (void *)curr);
	}
}
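
/*
 * Illustrative sketch (not part of this file): counting writable pages in
 * a range with walk_pte(); @virt, @len, and the nr_writable counter are
 * placeholders.
 *
 *	static unsigned long nr_writable;
 *
 *	static void count_writable(struct pte_search search, void *va)
 *	{
 *		if (*search.pte & PT_WRITABLE_MASK)
 *			nr_writable++;
 *	}
 *
 *	walk_pte(virt, len, count_writable);
 */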