// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
 * Copyright (c) 2025 Ventana Micro Systems Inc.
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/pgtable.h>
#include <asm/kvm_gstage.h>

#ifdef CONFIG_64BIT
unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
#else
unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
#endif

#define gstage_pte_leaf(__ptep)	\
	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))

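/*
 * Return the index into the page table at @level for guest physical
 * address @addr. The root table of the HGATP *x4 modes is four times
 * larger, so the top level gets kvm_riscv_gstage_pgd_xbits extra
 * index bits.
 */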
static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
{
	unsigned long mask;
	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);

	if (level == (kvm_riscv_gstage_pgd_levels - 1))
		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
	else
		mask = PTRS_PER_PTE - 1;

	return (addr >> shift) & mask;
}

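/* Kernel virtual address of the next-level page table pointed to by @pte. */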
static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
{
	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
}

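/* Convert a mapping size to the page table level that provides it. */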
static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
{
	u32 i;
	unsigned long psz = 1UL << 12;

	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
			*out_level = i;
			return 0;
		}
	}

	return -EINVAL;
}

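/* Convert a page table level to the page order (log2 of the mapping size). */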
static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
{
	if (kvm_riscv_gstage_pgd_levels < level)
		return -EINVAL;

	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
	return 0;
}

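/* Convert a page table level to the mapping size in bytes. */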
static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
{
	int rc;
	unsigned long page_order = PAGE_SHIFT;

	rc = gstage_level_to_page_order(level, &page_order);
	if (rc)
		return rc;

	*out_pgsize = BIT(page_order);
	return 0;
}

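/*
 * Walk the G-stage page table and return the leaf PTE mapping @addr
 * (and its level) if one exists, otherwise return false.
 */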
bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
			       pte_t **ptepp, u32 *ptep_level)
{
	pte_t *ptep;
	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;

	*ptep_level = current_level;
	ptep = (pte_t *)gstage->pgd;
	ptep = &ptep[gstage_pte_index(addr, current_level)];
	while (ptep && pte_val(ptep_get(ptep))) {
		if (gstage_pte_leaf(ptep)) {
			*ptep_level = current_level;
			*ptepp = ptep;
			return true;
		}

		if (current_level) {
			current_level--;
			*ptep_level = current_level;
			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
			ptep = &ptep[gstage_pte_index(addr, current_level)];
		} else {
			ptep = NULL;
		}
	}

	return false;
}

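/*
 * Flush the G-stage TLB entries covering the mapping at @addr and @level,
 * either on the local hart or, via a VMID-scoped HFENCE.GVMA, across the VM.
 */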
static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
{
	unsigned long order = PAGE_SHIFT;

	if (gstage_level_to_page_order(level, &order))
		return;
	addr &= ~(BIT(order) - 1);

	if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL)
		kvm_riscv_local_hfence_gvma_vmid_gpa(gstage->vmid, addr, BIT(order), order);
	else
		kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order,
					       gstage->vmid);
}

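/*
 * Install @map->pte at @map->level, allocating intermediate page table
 * pages from @pcache as needed. Fails with -EEXIST if a larger leaf
 * mapping is already in the way.
 */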
int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
			     struct kvm_mmu_memory_cache *pcache,
			     const struct kvm_gstage_mapping *map)
{
	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
	pte_t *next_ptep = (pte_t *)gstage->pgd;
	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];

	if (current_level < map->level)
		return -EINVAL;

	while (current_level != map->level) {
		if (gstage_pte_leaf(ptep))
			return -EEXIST;

		if (!pte_val(ptep_get(ptep))) {
			if (!pcache)
				return -ENOMEM;
			next_ptep = kvm_mmu_memory_cache_alloc(pcache);
			if (!next_ptep)
				return -ENOMEM;
			set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
					      __pgprot(_PAGE_TABLE)));
		} else {
			if (gstage_pte_leaf(ptep))
				return -EEXIST;
			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		}

		current_level--;
		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
	}

	if (pte_val(*ptep) != pte_val(map->pte)) {
		set_pte(ptep, map->pte);
		if (gstage_pte_leaf(ptep))
			gstage_tlb_flush(gstage, current_level, map->addr);
	}

	return 0;
}

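/*
 * Create a leaf G-stage mapping of @page_size bytes from guest physical
 * address @gpa to host physical address @hpa with the requested
 * permissions, and return the resulting mapping in @out_map.
 */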
int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
			      struct kvm_mmu_memory_cache *pcache,
			      gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
			      bool page_rdonly, bool page_exec,
			      struct kvm_gstage_mapping *out_map)
{
	pgprot_t prot;
	int ret;

	out_map->addr = gpa;
	out_map->level = 0;

	ret = gstage_page_size_to_level(page_size, &out_map->level);
	if (ret)
		return ret;

	/*
	 * A RISC-V implementation can choose to either:
	 * 1) Update the 'A' and 'D' PTE bits in hardware
	 * 2) Generate a page fault when the 'A' and/or 'D' bits are not
	 *    set in the PTE so that software can update these bits.
	 *
	 * We support both options mentioned above. To achieve this, we
	 * always set the 'A' and 'D' PTE bits at the time of creating a
	 * G-stage mapping. To support KVM dirty page logging with both
	 * options mentioned above, we write-protect G-stage PTEs to
	 * track dirty pages.
	 */

	if (page_exec) {
		if (page_rdonly)
			prot = PAGE_READ_EXEC;
		else
			prot = PAGE_WRITE_EXEC;
	} else {
		if (page_rdonly)
			prot = PAGE_READ;
		else
			prot = PAGE_WRITE;
	}
	out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
	out_map->pte = pte_mkdirty(out_map->pte);

	return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
}

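/*
 * Apply @op (clear or write-protect) to the PTE for @addr at @ptep_level,
 * recursing into child tables for non-leaf entries and flushing the TLB
 * for any leaf PTE that changes.
 */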
void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
			     pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
{
	int i, ret;
	pte_t old_pte, *next_ptep;
	u32 next_ptep_level;
	unsigned long next_page_size, page_size;

	ret = gstage_level_to_page_size(ptep_level, &page_size);
	if (ret)
		return;

	WARN_ON(addr & (page_size - 1));

	if (!pte_val(ptep_get(ptep)))
		return;

	if (ptep_level && !gstage_pte_leaf(ptep)) {
		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		next_ptep_level = ptep_level - 1;
		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
		if (ret)
			return;

		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		for (i = 0; i < PTRS_PER_PTE; i++)
			kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
						&next_ptep[i], next_ptep_level, op);
		if (op == GSTAGE_OP_CLEAR)
			put_page(virt_to_page(next_ptep));
	} else {
		old_pte = *ptep;
		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		else if (op == GSTAGE_OP_WP)
			set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
		if (pte_val(*ptep) != pte_val(old_pte))
			gstage_tlb_flush(gstage, ptep_level, addr);
	}
}

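/*
 * Unmap the guest physical range [@start, @start + @size), dropping
 * kvm->mmu_lock periodically when @may_block is set to avoid starving
 * other lock waiters on large ranges.
 */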
void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
				  gpa_t start, gpa_t size, bool may_block)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	unsigned long page_size;
	gpa_t addr = start, end = start + size;

	while (addr < end) {
		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			kvm_riscv_gstage_op_pte(gstage, addr, ptep,
						ptep_level, GSTAGE_OP_CLEAR);

next:
		addr += page_size;

		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end)
			cond_resched_lock(&gstage->kvm->mmu_lock);
	}
}

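/* Write-protect all leaf mappings in the guest physical range [@start, @end). */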
void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	gpa_t addr = start;
	unsigned long page_size;

	while (addr < end) {
		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			kvm_riscv_gstage_op_pte(gstage, addr, ptep,
						ptep_level, GSTAGE_OP_WP);

next:
		addr += page_size;
	}
}

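/*
 * Probe HGATP to select the widest G-stage mode supported by the hart
 * (on 64-bit: Sv57x4, then Sv48x4, falling back to the Sv39x4 default)
 * and set the number of page table levels accordingly.
 */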
void __init kvm_riscv_gstage_mode_detect(void)
{
#ifdef CONFIG_64BIT
	/* Try Sv57x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
		kvm_riscv_gstage_pgd_levels = 5;
		goto skip_sv48x4_test;
	}

	/* Try Sv48x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
		kvm_riscv_gstage_pgd_levels = 4;
	}
skip_sv48x4_test:

	csr_write(CSR_HGATP, 0);
	kvm_riscv_local_hfence_gvma_all();
#endif
}
339