1 #include "vm.h" 2 #include "libcflat.h" 3 #include "vmalloc.h" 4 #include "alloc_page.h" 5 #include "smp.h" 6 7 static pteval_t pte_opt_mask; 8 9 pteval_t *install_pte(pgd_t *cr3, 10 int pte_level, 11 void *virt, 12 pteval_t pte, 13 pteval_t *pt_page) 14 { 15 int level; 16 pteval_t *pt = cr3; 17 unsigned offset; 18 19 for (level = PAGE_LEVEL; level > pte_level; --level) { 20 offset = PGDIR_OFFSET((uintptr_t)virt, level); 21 if (!(pt[offset] & PT_PRESENT_MASK)) { 22 pteval_t *new_pt = pt_page; 23 if (!new_pt) 24 new_pt = alloc_page(); 25 else 26 pt_page = 0; 27 memset(new_pt, 0, PAGE_SIZE); 28 pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask; 29 #ifdef CONFIG_EFI 30 pt[offset] |= get_amd_sev_c_bit_mask(); 31 #endif /* CONFIG_EFI */ 32 } 33 pt = phys_to_virt(pt[offset] & PT_ADDR_MASK); 34 } 35 offset = PGDIR_OFFSET((uintptr_t)virt, level); 36 pt[offset] = pte; 37 return &pt[offset]; 38 } 39 40 /* 41 * Finds last PTE in the mapping of @virt that's at or above @lowest_level. The 42 * returned PTE isn't necessarily present, but its parent is. 43 */ 44 struct pte_search find_pte_level(pgd_t *cr3, void *virt, 45 int lowest_level) 46 { 47 pteval_t *pt = cr3, pte; 48 unsigned offset; 49 unsigned shift; 50 struct pte_search r; 51 52 assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL); 53 54 for (r.level = PAGE_LEVEL;; --r.level) { 55 shift = (r.level - 1) * PGDIR_WIDTH + 12; 56 offset = ((uintptr_t)virt >> shift) & PGDIR_MASK; 57 r.pte = &pt[offset]; 58 pte = *r.pte; 59 60 if (!(pte & PT_PRESENT_MASK)) 61 return r; 62 63 if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK)) 64 return r; 65 66 if (r.level == lowest_level) 67 return r; 68 69 pt = phys_to_virt(pte & PT_ADDR_MASK); 70 } 71 } 72 73 /* 74 * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge 75 * PTE). Returns NULL if no leaf PTE exists. 76 */ 77 pteval_t *get_pte(pgd_t *cr3, void *virt) 78 { 79 struct pte_search search; 80 81 search = find_pte_level(cr3, virt, 1); 82 return found_leaf_pte(search) ? search.pte : NULL; 83 } 84 85 /* 86 * Returns the PTE in the mapping of @virt at the given level @pte_level. 87 * Returns NULL if the PT at @pte_level isn't present (i.e., the mapping at 88 * @pte_level - 1 isn't present). 89 */ 90 pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level) 91 { 92 struct pte_search search; 93 94 search = find_pte_level(cr3, virt, pte_level); 95 return search.level == pte_level ? 

pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask | PT_PAGE_SIZE_MASK;
#ifdef CONFIG_EFI
        flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
        return install_pte(cr3, 2, virt, phys | flags, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        phys_addr_t flags = PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
#ifdef CONFIG_EFI
        flags |= get_amd_sev_c_bit_mask();
#endif /* CONFIG_EFI */
        return install_pte(cr3, 1, virt, phys | flags, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
        phys_addr_t max = (u64)len + (u64)phys;

        assert(phys % PAGE_SIZE == 0);
        assert((uintptr_t)virt % PAGE_SIZE == 0);
        assert(len % PAGE_SIZE == 0);

        while (phys + PAGE_SIZE <= max) {
                install_page(cr3, phys, virt);
                phys += PAGE_SIZE;
                virt = (char *)virt + PAGE_SIZE;
        }
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
        uintptr_t max = (uintptr_t)virt + len;
        uintptr_t curr;

        for (curr = (uintptr_t)virt; curr < max; curr += PAGE_SIZE) {
                pteval_t *ptep = get_pte(cr3, (void *)curr);

                if (ptep && (*ptep & PT_PRESENT_MASK))
                        return true;
        }
        return false;
}

void __setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len,
                       enum x86_mmu_flags mmu_flags)
{
        u64 orig_opt_mask = pte_opt_mask;
        u64 max = (u64)len + (u64)start;
        u64 phys = start;

        if (mmu_flags & X86_MMU_MAP_USER)
                pte_opt_mask |= PT_USER_MASK;

        if (mmu_flags & X86_MMU_MAP_HUGE) {
                while (phys + LARGE_PAGE_SIZE <= max) {
                        install_large_page(cr3, phys, (void *)(ulong)phys);
                        phys += LARGE_PAGE_SIZE;
                }
        }
        install_pages(cr3, phys, max - phys, (void *)(ulong)phys);

        pte_opt_mask = orig_opt_mask;
}

static inline void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
        __setup_mmu_range(cr3, start, len, X86_MMU_MAP_HUGE);
}

static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
        write_cr3(info->cr3);
        write_cr4(info->cr4);
        write_cr0(info->cr0);
}

void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
        pgd_t *cr3 = alloc_page();
        struct vm_vcpu_info info;
        int i;

        if (opt_mask)
                pte_opt_mask = *(pteval_t *)opt_mask;
        else
                pte_opt_mask = PT_USER_MASK;

        memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
        if (end_of_memory < (1ul << 32))
                end_of_memory = (1ul << 32);  /* map mmio 1:1 */

        setup_mmu_range(cr3, 0, end_of_memory);
#else
        setup_mmu_range(cr3, 0, (2ul << 30));
        setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
        init_alloc_vpage((void *)(3ul << 30));
#endif

        write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
        write_cr4(X86_CR4_PSE);
#endif
        write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

        printf("paging enabled\n");
        printf("cr0 = %lx\n", read_cr0());
        printf("cr3 = %lx\n", read_cr3());
        printf("cr4 = %lx\n", read_cr4());

        info.cr3 = read_cr3();
        info.cr4 = read_cr4();
        info.cr0 = read_cr0();

        for (i = 1; i < cpu_count(); i++)
                on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

        return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
        return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}
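
/*
 * Illustrative sketch, not part of the original file: mapping an additional
 * identity-mapped, user-accessible range with 4k pages on top of the page
 * tables built by setup_mmu(). The helper name and the example range are
 * hypothetical.
 */
static inline void example_map_user_range(pgd_t *cr3)
{
        /* Identity-map [256 MiB, 260 MiB) with PT_USER_MASK set, no huge pages. */
        __setup_mmu_range(cr3, 256ul << 20, 4ul << 20, X86_MMU_MAP_USER);

        /* With an identity map, the translated physical address equals virt. */
        assert(virt_to_pte_phys(cr3, (void *)(256ul << 20)) == (256ul << 20));
}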

/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 * @ptep : large page table entry to split
 * @level : level of @ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
        unsigned long *new_pt;
        unsigned long pa;
        unsigned long pte;
        unsigned long prototype;
        int i;

        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        assert(pte & PT_PAGE_SIZE_MASK);
        assert(level == 2 || level == 3);

        new_pt = alloc_page();
        assert(new_pt);

        prototype = pte & ~PT_ADDR_MASK;
        if (level == 2)
                prototype &= ~PT_PAGE_SIZE_MASK;

        pa = pte & PT_ADDR_MASK;
        for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
                new_pt[i] = prototype | pa;
                pa += 1ul << PGDIR_BITS(level - 1);
        }

        pte &= ~PT_PAGE_SIZE_MASK;
        pte &= ~PT_ADDR_MASK;
        pte |= virt_to_phys(new_pt);

        /* Modify the relevant paging-structure entry. */
        *ptep = pte;

        /*
         * Flush the TLB to eradicate stale mappings.
         *
         * Note: Removing specific TLB mappings is tricky because
         * split_large_page() can be called to split the active code page
         * backing the next set of instructions to be fetched and executed.
         * Furthermore, Intel SDM volume 3 recommends clearing the present bit
         * for the page being split, before invalidating any mappings.
         *
         * But clearing the mapping from the page table and removing it from
         * the TLB (where it's not actually guaranteed to reside anyway) makes
         * it impossible to continue fetching instructions!
         */
        flush_tlb();
}

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the target
 * address, @addr, translates to a 4k page.
 *
 * @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
        unsigned long *ptep;
        unsigned long pte;
        unsigned long *cr3 = current_page_table();

        ptep = get_pte_level(cr3, addr, 3);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 3);

        ptep = get_pte_level(cr3, addr, 2);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 2);
}

/*
 * Call the callback on each page from virt to virt + len.
 */
void walk_pte(void *virt, size_t len, pte_callback_t callback)
{
        pgd_t *cr3 = current_page_table();
        uintptr_t start = (uintptr_t)virt;
        uintptr_t end = (uintptr_t)virt + len;
        struct pte_search search;
        size_t page_size;
        uintptr_t curr;

        for (curr = start; curr < end; curr = ALIGN_DOWN(curr + page_size, page_size)) {
                search = find_pte_level(cr3, (void *)curr, 1);
                assert(found_leaf_pte(search));
                page_size = 1ul << PGDIR_BITS(search.level);

                callback(search, (void *)curr);
        }
}
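
/*
 * Illustrative sketch, not part of the original file: a walk_pte() callback
 * that write-protects every leaf PTE covering a range. The helper names
 * write_protect_cb and example_write_protect are hypothetical.
 */
static void write_protect_cb(struct pte_search search, void *va)
{
        *search.pte &= ~PT_WRITABLE_MASK;
}

static inline void example_write_protect(void *virt, size_t len)
{
        walk_pte(virt, len, write_protect_cb);

        /* Stale writable translations may still be cached in the TLB. */
        flush_tlb();
}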