// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
 * Copyright (c) 2025 Ventana Micro Systems Inc.
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/pgtable.h>
#include <asm/kvm_gstage.h>

#ifdef CONFIG_64BIT
unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
#else
unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
#endif

#define gstage_pte_leaf(__ptep)	\
	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))

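/*
 * Compute the index into the page table at the given level. The root
 * level uses the widened HGATP "x4" format, so its index mask is
 * extended by kvm_riscv_gstage_pgd_xbits bits.
 */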
static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
{
	unsigned long mask;
	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);

	if (level == (kvm_riscv_gstage_pgd_levels - 1))
		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
	else
		mask = PTRS_PER_PTE - 1;

	return (addr >> shift) & mask;
}

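/* Return the kernel virtual address of the page table referenced by @pte. */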
static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
{
	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
}

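/* Convert a mapping size in bytes to the G-stage page table level that provides it. */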
static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
{
	u32 i;
	unsigned long psz = 1UL << 12;

	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
			*out_level = i;
			return 0;
		}
	}

	return -EINVAL;
}

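/* Convert a page table level to the page order (log2 of the mapping size). */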
static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
{
	if (kvm_riscv_gstage_pgd_levels < level)
		return -EINVAL;

	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
	return 0;
}

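/* Convert a page table level to the mapping size in bytes. */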
static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
{
	int rc;
	unsigned long page_order = PAGE_SHIFT;

	rc = gstage_level_to_page_order(level, &page_order);
	if (rc)
		return rc;

	*out_pgsize = BIT(page_order);
	return 0;
}

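/*
 * Walk the G-stage page table and find the leaf PTE mapping @addr.
 * On success, return true and set *ptepp and *ptep_level; return
 * false if no leaf mapping exists.
 */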
bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
			       pte_t **ptepp, u32 *ptep_level)
{
	pte_t *ptep;
	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;

	*ptep_level = current_level;
	ptep = (pte_t *)gstage->pgd;
	ptep = &ptep[gstage_pte_index(addr, current_level)];
	while (ptep && pte_val(ptep_get(ptep))) {
		if (gstage_pte_leaf(ptep)) {
			*ptep_level = current_level;
			*ptepp = ptep;
			return true;
		}

		if (current_level) {
			current_level--;
			*ptep_level = current_level;
			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
			ptep = &ptep[gstage_pte_index(addr, current_level)];
		} else {
			ptep = NULL;
		}
	}

	return false;
}

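/*
 * Flush G-stage TLB entries for the mapping containing @addr, either on
 * the local hart or via a remote HFENCE request, depending on whether
 * the gstage is marked KVM_GSTAGE_FLAGS_LOCAL.
 */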
static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
{
	unsigned long order = PAGE_SHIFT;

	if (gstage_level_to_page_order(level, &order))
		return;
	addr &= ~(BIT(order) - 1);

	if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL)
		kvm_riscv_local_hfence_gvma_vmid_gpa(gstage->vmid, addr, BIT(order), order);
	else
		kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order,
					       gstage->vmid);
}

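/*
 * Install @map->pte at @map->level, allocating intermediate page tables
 * from @pcache as needed. Fails with -EEXIST if a leaf entry already
 * covers the address at a higher level.
 */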
int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
			     struct kvm_mmu_memory_cache *pcache,
			     const struct kvm_gstage_mapping *map)
{
	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
	pte_t *next_ptep = (pte_t *)gstage->pgd;
	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];

	if (current_level < map->level)
		return -EINVAL;

	while (current_level != map->level) {
		if (gstage_pte_leaf(ptep))
			return -EEXIST;

		if (!pte_val(ptep_get(ptep))) {
			if (!pcache)
				return -ENOMEM;
			next_ptep = kvm_mmu_memory_cache_alloc(pcache);
			if (!next_ptep)
				return -ENOMEM;
			set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
					      __pgprot(_PAGE_TABLE)));
		} else {
			if (gstage_pte_leaf(ptep))
				return -EEXIST;
			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		}

		current_level--;
		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
	}

	if (pte_val(*ptep) != pte_val(map->pte)) {
		set_pte(ptep, map->pte);
		if (gstage_pte_leaf(ptep))
			gstage_tlb_flush(gstage, current_level, map->addr);
	}

	return 0;
}

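/*
 * Create a G-stage mapping of @page_size bytes from guest physical
 * address @gpa to host physical address @hpa and describe the result
 * in @out_map.
 */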
int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
			      struct kvm_mmu_memory_cache *pcache,
			      gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
			      bool page_rdonly, bool page_exec,
			      struct kvm_gstage_mapping *out_map)
{
	pgprot_t prot;
	int ret;

	out_map->addr = gpa;
	out_map->level = 0;

	ret = gstage_page_size_to_level(page_size, &out_map->level);
	if (ret)
		return ret;

	/*
	 * A RISC-V implementation can choose to either:
	 * 1) Update 'A' and 'D' PTE bits in hardware
	 * 2) Generate a page fault when the 'A' and/or 'D' bits are not
	 *    set in the PTE, so that software can update these bits
	 *
	 * We support both options mentioned above. To achieve this, we
	 * always set the 'A' and 'D' PTE bits at the time of creating a
	 * G-stage mapping. To support KVM dirty page logging with both
	 * options mentioned above, we will write-protect G-stage PTEs
	 * to track dirty pages.
	 */

	if (page_exec) {
		if (page_rdonly)
			prot = PAGE_READ_EXEC;
		else
			prot = PAGE_WRITE_EXEC;
	} else {
		if (page_rdonly)
			prot = PAGE_READ;
		else
			prot = PAGE_WRITE;
	}
	out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
	out_map->pte = pte_mkdirty(out_map->pte);

	return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
}

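/*
 * Recursively apply @op (clear or write-protect) to the PTE mapping @addr
 * at @ptep_level and, for non-leaf entries, to the whole page table below
 * it, flushing the G-stage TLB for any leaf entry that changes.
 */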
void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
			     pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
{
	int i, ret;
	pte_t old_pte, *next_ptep;
	u32 next_ptep_level;
	unsigned long next_page_size, page_size;

	ret = gstage_level_to_page_size(ptep_level, &page_size);
	if (ret)
		return;

	WARN_ON(addr & (page_size - 1));

	if (!pte_val(ptep_get(ptep)))
		return;

	if (ptep_level && !gstage_pte_leaf(ptep)) {
		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		next_ptep_level = ptep_level - 1;
		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
		if (ret)
			return;

		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		for (i = 0; i < PTRS_PER_PTE; i++)
			kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
						&next_ptep[i], next_ptep_level, op);
		if (op == GSTAGE_OP_CLEAR)
			put_page(virt_to_page(next_ptep));
	} else {
		old_pte = *ptep;
		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		else if (op == GSTAGE_OP_WP)
			set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
		if (pte_val(*ptep) != pte_val(old_pte))
			gstage_tlb_flush(gstage, ptep_level, addr);
	}
}

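/*
 * Unmap the guest physical address range [@start, @start + @size). When
 * @may_block is true and the gstage is not local, kvm->mmu_lock may be
 * temporarily released between mappings to avoid lockups on large ranges.
 */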
void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
				  gpa_t start, gpa_t size, bool may_block)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	unsigned long page_size;
	gpa_t addr = start, end = start + size;

	while (addr < end) {
		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			kvm_riscv_gstage_op_pte(gstage, addr, ptep,
						ptep_level, GSTAGE_OP_CLEAR);

next:
		addr += page_size;

		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end)
			cond_resched_lock(&gstage->kvm->mmu_lock);
	}
}

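/* Write-protect all leaf mappings in the guest physical range [@start, @end). */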
void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	gpa_t addr = start;
	unsigned long page_size;

	while (addr < end) {
		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			kvm_riscv_gstage_op_pte(gstage, addr, ptep,
						ptep_level, GSTAGE_OP_WP);

next:
		addr += page_size;
	}
}

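/*
 * Probe the widest G-stage translation mode supported by the hart by
 * writing candidate modes to HGATP and reading the value back. If no
 * probe succeeds, the Sv39x4 (64-bit) or Sv32x4 (32-bit) defaults set
 * above remain in effect.
 */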
void __init kvm_riscv_gstage_mode_detect(void)
{
#ifdef CONFIG_64BIT
	/* Try Sv57x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
		kvm_riscv_gstage_pgd_levels = 5;
		goto skip_sv48x4_test;
	}

	/* Try Sv48x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
		kvm_riscv_gstage_pgd_levels = 4;
	}
skip_sv48x4_test:

	csr_write(CSR_HGATP, 0);
	kvm_riscv_local_hfence_gvma_all();
#endif
}