#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"
#include "smp.h"

/*
 * Installs @pte at @pte_level in the page tables rooted at @cr3 for the
 * virtual address @virt.  Missing intermediate page tables are allocated;
 * if @pt_page is non-NULL it is used for the first such allocation.
 * Returns a pointer to the installed PTE.
 */
pteval_t *install_pte(pgd_t *cr3,
                      int pte_level,
                      void *virt,
                      pteval_t pte,
                      pteval_t *pt_page)
{
        int level;
        pteval_t *pt = cr3;
        unsigned offset;

        for (level = PAGE_LEVEL; level > pte_level; --level) {
                offset = PGDIR_OFFSET((uintptr_t)virt, level);
                if (!(pt[offset] & PT_PRESENT_MASK)) {
                        pteval_t *new_pt = pt_page;
                        if (!new_pt)
                                new_pt = alloc_page();
                        else
                                pt_page = 0;
                        memset(new_pt, 0, PAGE_SIZE);
                        pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK |
                                     PT_WRITABLE_MASK | PT_USER_MASK;
                }
                pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
        }
        offset = PGDIR_OFFSET((uintptr_t)virt, level);
        pt[offset] = pte;
        return &pt[offset];
}

/*
 * Finds the last PTE in the mapping of @virt that's at or above @lowest_level.
 * The returned PTE isn't necessarily present, but its parent is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
                                 int lowest_level)
{
        pteval_t *pt = cr3, pte;
        unsigned offset;
        unsigned shift;
        struct pte_search r;

        assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

        for (r.level = PAGE_LEVEL;; --r.level) {
                shift = (r.level - 1) * PGDIR_WIDTH + 12;
                offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
                r.pte = &pt[offset];
                pte = *r.pte;

                if (!(pte & PT_PRESENT_MASK))
                        return r;

                if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
                        return r;

                if (r.level == lowest_level)
                        return r;

                /* Bits 51:12 of the entry hold the next table's physical address. */
                pt = phys_to_virt(pte & 0xffffffffff000ull);
        }
}

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., a 4K PTE or a present
 * huge PTE).  Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
        struct pte_search search;

        search = find_pte_level(cr3, virt, 1);
        return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if there is no PT at @pte_level, i.e., the walk stopped at a
 * higher level because an entry wasn't present or mapped a huge page.
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
        struct pte_search search;

        search = find_pte_level(cr3, virt, pte_level);
        return search.level == pte_level ? search.pte : NULL;
}
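/*
 * Illustrative sketch only (not used elsewhere in this file): the lookup
 * helpers above compose naturally.  For example, a caller could check
 * whether @virt is backed by a present 2M page in the current page tables
 * with:
 *
 *      pteval_t *ptep = get_pte_level(current_page_table(), virt, 2);
 *      bool huge_2m = ptep && (*ptep & PT_PRESENT_MASK) &&
 *                     (*ptep & PT_PAGE_SIZE_MASK);
 */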
pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        return install_pte(cr3, 2, virt,
                           phys | PT_PRESENT_MASK | PT_WRITABLE_MASK |
                           PT_USER_MASK | PT_PAGE_SIZE_MASK, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
        return install_pte(cr3, 1, virt,
                           phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
        phys_addr_t max = (u64)len + (u64)phys;

        assert(phys % PAGE_SIZE == 0);
        assert((uintptr_t) virt % PAGE_SIZE == 0);
        assert(len % PAGE_SIZE == 0);

        while (phys + PAGE_SIZE <= max) {
                install_page(cr3, phys, virt);
                phys += PAGE_SIZE;
                virt = (char *) virt + PAGE_SIZE;
        }
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
        uintptr_t max = (uintptr_t) virt + len;
        uintptr_t curr;

        for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
                pteval_t *ptep = get_pte(cr3, (void *) curr);

                if (ptep && (*ptep & PT_PRESENT_MASK))
                        return true;
        }
        return false;
}

static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
        u64 max = (u64)len + (u64)start;
        u64 phys = start;

        while (phys + LARGE_PAGE_SIZE <= max) {
                install_large_page(cr3, phys, (void *)(ulong)phys);
                phys += LARGE_PAGE_SIZE;
        }
        install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

static void set_additional_vcpu_vmregs(struct vm_vcpu_info *info)
{
        write_cr3(info->cr3);
        write_cr4(info->cr4);
        write_cr0(info->cr0);
}

void *setup_mmu(phys_addr_t end_of_memory)
{
        pgd_t *cr3 = alloc_page();
        struct vm_vcpu_info info;
        int i;

        memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
        if (end_of_memory < (1ul << 32))
                end_of_memory = (1ul << 32);  /* map mmio 1:1 */

        setup_mmu_range(cr3, 0, end_of_memory);
#else
        setup_mmu_range(cr3, 0, (2ul << 30));
        setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
        init_alloc_vpage((void*)(3ul << 30));
#endif

        write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
        write_cr4(X86_CR4_PSE);
#endif
        write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

        printf("paging enabled\n");
        printf("cr0 = %lx\n", read_cr0());
        printf("cr3 = %lx\n", read_cr3());
        printf("cr4 = %lx\n", read_cr4());

        info.cr3 = read_cr3();
        info.cr4 = read_cr4();
        info.cr0 = read_cr0();

        for (i = 1; i < cpu_count(); i++)
                on_cpu(i, (void *)set_additional_vcpu_vmregs, &info);

        return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
        return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}
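/*
 * Illustrative sketch only, using a hypothetical virtual address: given a
 * page-table root @cr3 (e.g. the pointer returned by setup_mmu()), a test
 * could map a freshly allocated page and verify the resulting translation
 * like so:
 *
 *      void *page = alloc_page();
 *      install_page(cr3, virt_to_phys(page), (void *)0x40000000ul);
 *      assert(virt_to_pte_phys(cr3, (void *)0x40000000ul) == virt_to_phys(page));
 */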
/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 * @ptep  : large page table entry to split
 * @level : level of @ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
        unsigned long *new_pt;
        unsigned long pa;
        unsigned long pte;
        unsigned long prototype;
        int i;

        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        assert(pte & PT_PAGE_SIZE_MASK);
        assert(level == 2 || level == 3);

        new_pt = alloc_page();
        assert(new_pt);

        /* Propagate the large page's attribute bits to each new entry. */
        prototype = pte & ~PT_ADDR_MASK;
        if (level == 2)
                prototype &= ~PT_PAGE_SIZE_MASK;

        pa = pte & PT_ADDR_MASK;
        for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
                new_pt[i] = prototype | pa;
                pa += 1ul << PGDIR_BITS(level - 1);
        }

        pte &= ~PT_PAGE_SIZE_MASK;
        pte &= ~PT_ADDR_MASK;
        pte |= virt_to_phys(new_pt);

        /* Modify the relevant paging-structure entry. */
        *ptep = pte;

        /*
         * Flush the TLB to eradicate stale mappings.
         *
         * Note: removing specific TLB mappings is tricky because
         * split_large_page() can be called to split the active code page
         * backing the next set of instructions to be fetched and executed.
         * Furthermore, Intel SDM volume 3 recommends clearing the present
         * bit for the page being split, before invalidating any mappings.
         *
         * But clearing the mapping from the page table and removing it from
         * the TLB (where it's not actually guaranteed to reside anyway) makes
         * it impossible to continue fetching instructions!
         */
        flush_tlb();
}

/*
 * force_4k_page: Ensures that @addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the
 * target address, @addr, translates to a 4k page.
 *
 * @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
        unsigned long *ptep;
        unsigned long pte;
        unsigned long *cr3 = current_page_table();

        /* Split a 1G mapping at level 3 first, if there is one... */
        ptep = get_pte_level(cr3, addr, 3);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 3);

        /* ...then split the resulting (or pre-existing) 2M mapping. */
        ptep = get_pte_level(cr3, addr, 2);
        assert(ptep);
        pte = *ptep;
        assert(pte & PT_PRESENT_MASK);
        if (pte & PT_PAGE_SIZE_MASK)
                split_large_page(ptep, 2);
}
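/*
 * Illustrative sketch only, using a hypothetical buffer @buf: a test that
 * wants to change the protections of a single 4k page in the identity map
 * might do:
 *
 *      force_4k_page(buf);
 *      *get_pte(current_page_table(), buf) &= ~PT_WRITABLE_MASK;
 *      flush_tlb();
 */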