#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

static pteval_t pte_opt_mask;

pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
	int level;
	pteval_t *pt = cr3;
	unsigned offset;

	for (level = PAGE_LEVEL; level > pte_level; --level) {
		offset = PGDIR_OFFSET((uintptr_t)virt, level);
		if (!(pt[offset] & PT_PRESENT_MASK)) {
			/* Allocate the missing intermediate page table. */
			pteval_t *new_pt = pt_page;
			if (!new_pt)
				new_pt = alloc_page();
			else
				pt_page = 0;
			memset(new_pt, 0, PAGE_SIZE);
			pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask;
		}
		pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
	}
	offset = PGDIR_OFFSET((uintptr_t)virt, level);
	pt[offset] = pte;
	return &pt[offset];
}

/*
 * Finds the last PTE in the mapping of @virt that's at or above @lowest_level.
 * The returned PTE isn't necessarily present, but its parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		if (!(pte & PT_PRESENT_MASK))
			return r;

		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		/* Bits 51:12 hold the physical address of the next level's table. */
		pt = phys_to_virt(pte & 0xffffffffff000ull);
	}
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge
 * PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}
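/*
 * Usage sketch (illustrative, not part of the library itself): a test that
 * has just installed a mapping can sanity-check it via get_pte().  The
 * variables cr3 and virt below are assumptions for illustration only.
 *
 *	pteval_t *pte = get_pte(cr3, virt);
 *
 *	assert(pte && (*pte & PT_PRESENT_MASK));
 *	assert(*pte & PT_WRITABLE_MASK);
 */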
/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the PT at @pte_level isn't present (i.e., the entry at
 * @pte_level + 1 isn't present or maps a large page).
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}

pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	return install_pte(cr3, 2, virt,
			   phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask | PT_PAGE_SIZE_MASK, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
	return install_pte(cr3, 1, virt, phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | pte_opt_mask, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;
	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);
		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}

static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	/* Map with large pages where possible, then 4k pages for the rest. */
	while (phys + LARGE_PAGE_SIZE <= max) {
		install_large_page(cr3, phys, (void *)(ulong)phys);
		phys += LARGE_PAGE_SIZE;
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
	write_cr3(info->cr3);
	write_cr4(info->cr4);
	write_cr0(info->cr0);
}

void *setup_mmu(phys_addr_t end_of_memory, void *opt_mask)
{
	pgd_t *cr3 = alloc_page();
	struct vm_vcpu_info info;
	int i;

	if (opt_mask)
		pte_opt_mask = *(pteval_t *)opt_mask;
	else
		pte_opt_mask = PT_USER_MASK;

	memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
	if (end_of_memory < (1ul << 32))
		end_of_memory = (1ul << 32);  /* map mmio 1:1 */

	setup_mmu_range(cr3, 0, end_of_memory);
#else
	setup_mmu_range(cr3, 0, (2ul << 30));
	setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
	init_alloc_vpage((void*)(3ul << 30));
#endif

	write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
	write_cr4(X86_CR4_PSE);
#endif
	write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

	printf("paging enabled\n");
	printf("cr0 = %lx\n", read_cr0());
	printf("cr3 = %lx\n", read_cr3());
	printf("cr4 = %lx\n", read_cr4());

	info.cr3 = read_cr3();
	info.cr4 = read_cr4();
	info.cr0 = read_cr0();

	for (i = 1; i < cpu_count(); i++)
		on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

	return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
	return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}
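/*
 * Usage sketch (illustrative assumption, not taken from the library): map a
 * scratch physical page at an arbitrary virtual address and read the
 * translation back.  scratch_phys and scratch_virt are hypothetical names.
 *
 *	install_page(cr3, scratch_phys, scratch_virt);
 *	assert(virt_to_pte_phys(cr3, scratch_virt) == scratch_phys);
 */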
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 * @ptep : large page table entry to split
 * @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present bit
	 * for the page being split, before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from
	 * the TLB (where it's not actually guaranteed to reside anyway) makes
	 * it impossible to continue fetching instructions!
	 */
	flush_tlb();
}

/*
 * force_4k_page: Ensures that addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the target
 * address, addr, translates to a 4k page.
 *
 * @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	/* Split a 1G mapping at level 3, if any. */
	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	/* Then split a 2M mapping at level 2, if any. */
	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}
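/*
 * Usage sketch (illustrative, hedged): a test that wants to change the
 * permissions of a single 4k page inside an identity-mapped large page
 * might first force a 4k mapping, then edit the leaf PTE directly.  addr
 * below is a hypothetical target address.
 *
 *	force_4k_page(addr);
 *	ptep = get_pte_level(current_page_table(), addr, 1);
 *	*ptep &= ~PT_WRITABLE_MASK;
 *	flush_tlb();
 */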