xref: /kvm-unit-tests/lib/x86/vm.c (revision a41100276a89128703db61361f6be878c6473005)
#include "vm.h"
#include "libcflat.h"
#include "vmalloc.h"
#include "alloc_page.h"

/*
 * Installs @pte at @pte_level in the mapping of @virt, allocating and linking
 * any missing intermediate page tables on the way down. If @pt_page is
 * non-NULL it is consumed as the first new page table instead of calling
 * alloc_page(). Returns a pointer to the installed PTE.
 */
pteval_t *install_pte(pgd_t *cr3,
		      int pte_level,
		      void *virt,
		      pteval_t pte,
		      pteval_t *pt_page)
{
	int level;
	pteval_t *pt = cr3;
	unsigned offset;

	for (level = PAGE_LEVEL; level > pte_level; --level) {
		offset = PGDIR_OFFSET((uintptr_t)virt, level);
		if (!(pt[offset] & PT_PRESENT_MASK)) {
			pteval_t *new_pt = pt_page;
			if (!new_pt)
				new_pt = alloc_page();
			else
				pt_page = NULL;
			memset(new_pt, 0, PAGE_SIZE);
			pt[offset] = virt_to_phys(new_pt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
		}
		pt = phys_to_virt(pt[offset] & PT_ADDR_MASK);
	}
	offset = PGDIR_OFFSET((uintptr_t)virt, level);
	pt[offset] = pte;
	return &pt[offset];
}
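
/*
 * Usage sketch: install_page() and install_large_page() below always set the
 * user and writable bits, so a test that wants different attributes calls
 * install_pte() directly. The helper name and the read-only flag choice here
 * are illustrative assumptions, not part of this file's API.
 */
static inline pteval_t *install_readonly_page_example(pgd_t *cr3,
						      phys_addr_t phys,
						      void *virt)
{
	/* Level 1 PTE: present and user-readable, but not writable. */
	return install_pte(cr3, 1, virt, phys | PT_PRESENT_MASK | PT_USER_MASK, 0);
}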

/*
 * Finds the last PTE in the mapping of @virt that's at or above @lowest_level,
 * i.e. the lowest-level entry the page walk reaches. The returned PTE isn't
 * necessarily present, but its parent table is.
 */
struct pte_search find_pte_level(pgd_t *cr3, void *virt,
				 int lowest_level)
{
	pteval_t *pt = cr3, pte;
	unsigned offset;
	unsigned shift;
	struct pte_search r;

	assert(lowest_level >= 1 && lowest_level <= PAGE_LEVEL);

	for (r.level = PAGE_LEVEL;; --r.level) {
		/* 12 bits of page offset, plus PGDIR_WIDTH bits per level. */
		shift = (r.level - 1) * PGDIR_WIDTH + 12;
		offset = ((uintptr_t)virt >> shift) & PGDIR_MASK;
		r.pte = &pt[offset];
		pte = *r.pte;

		/* Non-present entry: the walk cannot continue. */
		if (!(pte & PT_PRESENT_MASK))
			return r;

		/* Present 2M/1G page: this entry is the leaf. */
		if ((r.level == 2 || r.level == 3) && (pte & PT_PAGE_SIZE_MASK))
			return r;

		if (r.level == lowest_level)
			return r;

		/* Descend into the next-level table; the mask strips the flag bits. */
		pt = phys_to_virt(pte & 0xffffffffff000ull);
	}
}
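
/*
 * Usage sketch: walk down to the 4k level and report how @virt is mapped.
 * Purely illustrative; it only reads the returned pte_search and assumes the
 * caller passes a valid page table root.
 */
static inline void report_mapping_example(pgd_t *cr3, void *virt)
{
	struct pte_search search = find_pte_level(cr3, virt, 1);

	if (!(*search.pte & PT_PRESENT_MASK))
		printf("%lx: not present at level %d\n", (ulong)virt, search.level);
	else if (search.level > 1)
		printf("%lx: mapped by a huge page at level %d\n", (ulong)virt, search.level);
	else
		printf("%lx: mapped by a 4k PTE\n", (ulong)virt);
}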

/*
 * Returns the leaf PTE in the mapping of @virt (i.e., 4K PTE or a present huge
 * PTE). Returns NULL if no leaf PTE exists.
 */
pteval_t *get_pte(pgd_t *cr3, void *virt)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, 1);
	return found_leaf_pte(search) ? search.pte : NULL;
}

/*
 * Returns the PTE in the mapping of @virt at the given level @pte_level.
 * Returns NULL if the walk doesn't reach @pte_level, i.e., if a higher-level
 * entry is either not present or maps a huge page.
 */
pteval_t *get_pte_level(pgd_t *cr3, void *virt, int pte_level)
{
	struct pte_search search;

	search = find_pte_level(cr3, virt, pte_level);
	return search.level == pte_level ? search.pte : NULL;
}
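
/*
 * Usage sketch: check whether @virt is currently backed by a 2M page by
 * inspecting its level-2 entry directly. The helper name is illustrative.
 */
static inline bool is_2m_mapped_example(pgd_t *cr3, void *virt)
{
	pteval_t *pde = get_pte_level(cr3, virt, 2);

	return pde && (*pde & PT_PRESENT_MASK) && (*pde & PT_PAGE_SIZE_MASK);
}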

pteval_t *install_large_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    return install_pte(cr3, 2, virt,
		       phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK | PT_PAGE_SIZE_MASK, 0);
}

pteval_t *install_page(pgd_t *cr3, phys_addr_t phys, void *virt)
{
    return install_pte(cr3, 1, virt, phys | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK, 0);
}

void install_pages(pgd_t *cr3, phys_addr_t phys, size_t len, void *virt)
{
	phys_addr_t max = (u64)len + (u64)phys;
	assert(phys % PAGE_SIZE == 0);
	assert((uintptr_t) virt % PAGE_SIZE == 0);
	assert(len % PAGE_SIZE == 0);

	while (phys + PAGE_SIZE <= max) {
		install_page(cr3, phys, virt);
		phys += PAGE_SIZE;
		virt = (char *) virt + PAGE_SIZE;
	}
}

bool any_present_pages(pgd_t *cr3, void *virt, size_t len)
{
	uintptr_t max = (uintptr_t) virt + len;
	uintptr_t curr;

	for (curr = (uintptr_t) virt; curr < max; curr += PAGE_SIZE) {
		pteval_t *ptep = get_pte(cr3, (void *) curr);
		if (ptep && (*ptep & PT_PRESENT_MASK))
			return true;
	}
	return false;
}
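
/*
 * Usage sketch: a test that wants to claim a fresh virtual range typically
 * checks that nothing is mapped there yet and then installs 4k pages over it.
 * @phys, @virt and @len are caller-chosen, page-aligned example values.
 */
static inline void claim_range_example(pgd_t *cr3, phys_addr_t phys,
				       void *virt, size_t len)
{
	assert(!any_present_pages(cr3, virt, len));
	install_pages(cr3, phys, len, virt);
}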

static void setup_mmu_range(pgd_t *cr3, phys_addr_t start, size_t len)
{
	u64 max = (u64)len + (u64)start;
	u64 phys = start;

	while (phys + LARGE_PAGE_SIZE <= max) {
		install_large_page(cr3, phys, (void *)(ulong)phys);
		phys += LARGE_PAGE_SIZE;
	}
	install_pages(cr3, phys, max - phys, (void *)(ulong)phys);
}

void *setup_mmu(phys_addr_t end_of_memory)
{
    pgd_t *cr3 = alloc_page();

    memset(cr3, 0, PAGE_SIZE);

#ifdef __x86_64__
    if (end_of_memory < (1ul << 32))
        end_of_memory = (1ul << 32);  /* map mmio 1:1 */

    setup_mmu_range(cr3, 0, end_of_memory);
#else
    setup_mmu_range(cr3, 0, (2ul << 30));
    setup_mmu_range(cr3, 3ul << 30, (1ul << 30));
    init_alloc_vpage((void*)(3ul << 30));
#endif

    write_cr3(virt_to_phys(cr3));
#ifndef __x86_64__
    write_cr4(X86_CR4_PSE);
#endif
    write_cr0(X86_CR0_PG | X86_CR0_PE | X86_CR0_WP);

    printf("paging enabled\n");
    printf("cr0 = %lx\n", read_cr0());
    printf("cr3 = %lx\n", read_cr3());
    printf("cr4 = %lx\n", read_cr4());
    return cr3;
}

phys_addr_t virt_to_pte_phys(pgd_t *cr3, void *mem)
{
    return (*get_pte(cr3, mem) & PT_ADDR_MASK) + ((ulong)mem & (PAGE_SIZE - 1));
}
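
/*
 * Usage sketch: after installing a 4k mapping, virt_to_pte_phys() recovers the
 * physical address that was mapped. @phys and @virt are caller-chosen,
 * page-aligned example values.
 */
static inline void check_translation_example(pgd_t *cr3, phys_addr_t phys,
					     void *virt)
{
	install_page(cr3, phys, virt);
	assert(virt_to_pte_phys(cr3, virt) == phys);
}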

/*
 * split_large_page: Split a 2M/1G large page into 512 smaller PTEs.
 *   @ptep : large page table entry to split
 *   @level : level of ptep (2 or 3)
 */
void split_large_page(unsigned long *ptep, int level)
{
	unsigned long *new_pt;
	unsigned long pa;
	unsigned long pte;
	unsigned long prototype;
	int i;

	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	assert(pte & PT_PAGE_SIZE_MASK);
	assert(level == 2 || level == 3);

	new_pt = alloc_page();
	assert(new_pt);

	/*
	 * Copy the attribute bits into every new entry. Bit 7 is PAT, not
	 * PAGE_SIZE, in a 4k PTE, so it must be cleared when a 2M page is
	 * split; 2M entries produced by splitting a 1G page keep it set.
	 */
	prototype = pte & ~PT_ADDR_MASK;
	if (level == 2)
		prototype &= ~PT_PAGE_SIZE_MASK;

	pa = pte & PT_ADDR_MASK;
	for (i = 0; i < (1 << PGDIR_WIDTH); i++) {
		new_pt[i] = prototype | pa;
		pa += 1ul << PGDIR_BITS(level - 1);
	}

	pte &= ~PT_PAGE_SIZE_MASK;
	pte &= ~PT_ADDR_MASK;
	pte |= virt_to_phys(new_pt);

	/* Modify the relevant paging-structure entry */
	*ptep = pte;

	/*
	 * Flush the TLB to eradicate stale mappings.
	 *
	 * Note: Removing specific TLB mappings is tricky because
	 * split_large_page() can be called to split the active code page
	 * backing the next set of instructions to be fetched and executed.
	 * Furthermore, Intel SDM volume 3 recommends clearing the present bit
	 * for the page being split, before invalidating any mappings.
	 *
	 * But clearing the mapping from the page table and removing it from the
	 * TLB (where it's not actually guaranteed to reside anyway) makes it
	 * impossible to continue fetching instructions!
	 */
	flush_tlb();
}
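
/*
 * Usage sketch: split the 2M mapping that currently backs @addr and verify
 * that a present 4k leaf PTE now exists for it. Assumes @addr is covered by a
 * present huge level-2 entry when called; the helper name is illustrative.
 */
static inline void split_2m_example(pgd_t *cr3, void *addr)
{
	pteval_t *pde = get_pte_level(cr3, addr, 2);
	pteval_t *pte;

	assert(pde && (*pde & PT_PRESENT_MASK) && (*pde & PT_PAGE_SIZE_MASK));
	split_large_page(pde, 2);

	pte = get_pte_level(cr3, addr, 1);
	assert(pte && (*pte & PT_PRESENT_MASK));
}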

/*
 * force_4k_page: Ensures that addr translates to a 4k page.
 *
 * This function uses split_large_page(), as needed, to ensure that the target
 * address, addr, translates to a 4k page.
 *
 *   @addr: target address that should be mapped to a 4k page
 */
void force_4k_page(void *addr)
{
	unsigned long *ptep;
	unsigned long pte;
	unsigned long *cr3 = current_page_table();

	ptep = get_pte_level(cr3, addr, 3);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 3);

	ptep = get_pte_level(cr3, addr, 2);
	assert(ptep);
	pte = *ptep;
	assert(pte & PT_PRESENT_MASK);
	if (pte & PT_PAGE_SIZE_MASK)
		split_large_page(ptep, 2);
}
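
/*
 * Usage sketch: write-protect a single 4k page in the active page tables.
 * force_4k_page() guarantees @addr is no longer covered by a huge mapping, so
 * only this one page is affected; the flush_tlb() is needed because a live
 * PTE was changed. The helper name is illustrative.
 */
static inline void write_protect_page_example(void *addr)
{
	pteval_t *ptep;

	force_4k_page(addr);
	ptep = get_pte(current_page_table(), addr);
	assert(ptep && (*ptep & PT_PRESENT_MASK));
	*ptep &= ~PT_WRITABLE_MASK;
	flush_tlb();
}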
259