1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2008-2015 Nathan Whitehorn
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 /*
31  * Manages physical address maps.
32  *
33  * Since the information managed by this module is also stored by the
34  * logical address mapping module, this module may throw away valid virtual
35  * to physical mappings at almost any time.  However, invalidations of
36  * mappings must be done as requested.
37  *
38  * In order to cope with hardware architectures which make virtual to
39  * physical map invalidates expensive, this module may delay invalidate
40  * or reduced protection operations until such time as they are actually
41  * necessary.  This module is given full information as to which processors
42  * are currently using which maps, and to when physical maps must be made
43  * correct.
44  */
45 
46 #include "opt_kstack_pages.h"
47 
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/conf.h>
51 #include <sys/queue.h>
52 #include <sys/cpuset.h>
53 #include <sys/kerneldump.h>
54 #include <sys/ktr.h>
55 #include <sys/lock.h>
56 #include <sys/msgbuf.h>
57 #include <sys/malloc.h>
58 #include <sys/mman.h>
59 #include <sys/mutex.h>
60 #include <sys/proc.h>
61 #include <sys/rwlock.h>
62 #include <sys/sched.h>
63 #include <sys/sysctl.h>
64 #include <sys/systm.h>
65 #include <sys/vmmeter.h>
66 #include <sys/smp.h>
67 #include <sys/reboot.h>
68 
69 #include <sys/kdb.h>
70 
71 #include <dev/ofw/openfirm.h>
72 
73 #include <vm/vm.h>
74 #include <vm/pmap.h>
75 #include <vm/vm_param.h>
76 #include <vm/vm_kern.h>
77 #include <vm/vm_page.h>
78 #include <vm/vm_phys.h>
79 #include <vm/vm_map.h>
80 #include <vm/vm_object.h>
81 #include <vm/vm_extern.h>
82 #include <vm/vm_pageout.h>
83 #include <vm/vm_dumpset.h>
84 #include <vm/vm_radix.h>
85 #include <vm/vm_reserv.h>
86 #include <vm/uma.h>
87 
88 #include <machine/_inttypes.h>
89 #include <machine/cpu.h>
90 #include <machine/ifunc.h>
91 #include <machine/platform.h>
92 #include <machine/frame.h>
93 #include <machine/md_var.h>
94 #include <machine/psl.h>
95 #include <machine/bat.h>
96 #include <machine/hid.h>
97 #include <machine/pte.h>
98 #include <machine/sr.h>
99 #include <machine/trap.h>
100 #include <machine/mmuvar.h>
101 
102 #include "mmu_oea64.h"
103 
104 void moea64_release_vsid(uint64_t vsid);
105 uintptr_t moea64_get_unique_vsid(void);
106 
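/*
 * Descriptive note: DISABLE_TRANS() saves the current MSR and clears PSL_DR,
 * turning off data address translation so the code that follows can touch
 * physical addresses directly; ENABLE_TRANS() restores the saved MSR.
 * A minimal usage sketch (as in moea64_setup_direct_map() below):
 *
 *	register_t msr;
 *	DISABLE_TRANS(msr);
 *	... accesses using real (physical) addresses ...
 *	ENABLE_TRANS(msr);
 */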
107 #define DISABLE_TRANS(msr)	msr = mfmsr(); mtmsr(msr & ~PSL_DR)
108 #define ENABLE_TRANS(msr)	mtmsr(msr)
109 
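/*
 * Descriptive note: VSID_MAKE() composes a VSID from a segment register
 * index and a 20-bit hash placed in bits 4..23; VSID_TO_HASH() recovers
 * that hash.  VSID_HASH_MASK selects the VSID bits that take part in the
 * PTEG hash computed in init_pvo_entry() below.
 */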
110 #define	VSID_MAKE(sr, hash)	((sr) | (((hash) & 0xfffff) << 4))
111 #define	VSID_TO_HASH(vsid)	(((vsid) >> 4) & 0xfffff)
112 #define	VSID_HASH_MASK		0x0000007fffffffffULL
113 
114 /*
115  * Locking semantics:
116  *
117  * There are two locks of interest: the page locks and the pmap locks, which
118  * protect their individual PVO lists and are locked in that order. The contents
119  * of all PVO entries are protected by the locks of their respective pmaps.
120  * The pmap of any PVO is guaranteed not to change so long as the PVO is linked
121  * into any list.
122  *
123  */
124 
125 #define PV_LOCK_COUNT	MAXCPU
126 static struct rwlock __exclusive_cache_line pv_lock[PV_LOCK_COUNT];
127 
128 #define	PV_LOCK_SHIFT	HPT_SP_SHIFT
129 #define	pa_index(pa)	((pa) >> PV_LOCK_SHIFT)
130 
131 /*
132  * Cheap NUMA-izing of the pv locks, to reduce contention across domains.
133  * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the
134  * index at (N << 45).
135  */
136 #ifdef __powerpc64__
137 #define PV_LOCK_IDX(pa)	((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT)
138 #else
139 #define PV_LOCK_IDX(pa)	(pa_index(pa) % PV_LOCK_COUNT)
140 #endif
141 #define PV_LOCKPTR(pa)	((struct rwlock *)(&pv_lock[PV_LOCK_IDX(pa)]))
142 
143 #define PV_WR_LOCK(pa)		rw_wlock(PV_LOCKPTR(pa))
144 #define PV_RD_LOCK(pa)		rw_rlock(PV_LOCKPTR(pa))
145 #define PV_UNLOCK(pa)		rw_unlock(PV_LOCKPTR(pa))
146 #define PV_LOCKASSERT(pa) 	rw_assert(PV_LOCKPTR(pa), RA_LOCKED)
147 #define PV_LOCK_RD_ASSERT(pa) 	rw_assert(PV_LOCKPTR(pa), RA_RLOCKED)
148 #define PV_LOCK_WR_ASSERT(pa) 	rw_assert(PV_LOCKPTR(pa), RA_WLOCKED)
149 
150 #define PV_PAGE_WR_LOCK(m)	PV_WR_LOCK(VM_PAGE_TO_PHYS(m))
151 #define PV_PAGE_RD_LOCK(m)	PV_RD_LOCK(VM_PAGE_TO_PHYS(m))
152 #define PV_PAGE_UNLOCK(m)	PV_UNLOCK(VM_PAGE_TO_PHYS(m))
153 #define PV_PAGE_LOCKASSERT(m)	PV_LOCKASSERT(VM_PAGE_TO_PHYS(m))
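/*
 * A minimal sketch of the expected acquisition order (page lock before
 * pmap lock), matching what moea64_enter() does further below:
 *
 *	PV_WR_LOCK(pa);
 *	PMAP_LOCK(pmap);
 *	... insert or remove PVOs for this page ...
 *	PMAP_UNLOCK(pmap);
 *	PV_UNLOCK(pa);
 */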
154 
155 struct ofw_map {
156 	cell_t	om_va;
157 	cell_t	om_len;
158 	uint64_t om_pa;
159 	cell_t	om_mode;
160 };
161 
162 extern unsigned char _etext[];
163 extern unsigned char _end[];
164 
165 extern void *slbtrap, *slbtrapend;
166 
167 /*
168  * Map of physical memory regions.
169  */
170 static struct	mem_region *regions;
171 static struct	mem_region *pregions;
172 static struct	numa_mem_region *numa_pregions;
173 static int	regions_sz, pregions_sz, numapregions_sz;
174 
175 u_int	phys_avail_count;
176 
177 extern void bs_remap_earlyboot(void);
178 
179 /*
180  * Lock for the SLB tables.
181  */
182 struct mtx	moea64_slb_mutex;
183 
184 /*
185  * PTEG data.
186  */
187 u_long		moea64_pteg_count;
188 u_long		moea64_pteg_mask;
189 
190 /*
191  * PVO data.
192  */
193 
194 uma_zone_t	moea64_pvo_zone; /* zone for pvo entries */
195 
196 static struct	pvo_entry *moea64_bpvo_pool;
197 static int	moea64_bpvo_pool_index = 0;
198 static int	moea64_bpvo_pool_size = 0;
199 SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD,
200     &moea64_bpvo_pool_index, 0, "");
201 
202 #define	BPVO_POOL_SIZE	327680 /* Sensible historical default value */
203 #define	BPVO_POOL_EXPANSION_FACTOR	3
204 #define	VSID_NBPW	(sizeof(u_int32_t) * 8)
205 #ifdef __powerpc64__
206 #define	NVSIDS		(NPMAPS * 16)
207 #define VSID_HASHMASK	0xffffffffUL
208 #else
209 #define NVSIDS		NPMAPS
210 #define VSID_HASHMASK	0xfffffUL
211 #endif
212 static u_int	moea64_vsid_bitmap[NVSIDS / VSID_NBPW];
213 
214 static bool	moea64_initialized = false;
215 
216 #ifdef MOEA64_STATS
217 /*
218  * Statistics.
219  */
220 u_int	moea64_pte_valid = 0;
221 u_int	moea64_pte_overflow = 0;
222 u_int	moea64_pvo_entries = 0;
223 u_int	moea64_pvo_enter_calls = 0;
224 u_int	moea64_pvo_remove_calls = 0;
225 SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
226     &moea64_pte_valid, 0, "");
227 SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
228     &moea64_pte_overflow, 0, "");
229 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
230     &moea64_pvo_entries, 0, "");
231 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
232     &moea64_pvo_enter_calls, 0, "");
233 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
234     &moea64_pvo_remove_calls, 0, "");
235 #endif
236 
237 vm_offset_t	moea64_scratchpage_va[2];
238 struct pvo_entry *moea64_scratchpage_pvo[2];
239 struct	mtx	moea64_scratchpage_mtx;
240 
241 uint64_t 	moea64_large_page_mask = 0;
242 uint64_t	moea64_large_page_size = 0;
243 int		moea64_large_page_shift = 0;
244 bool		moea64_has_lp_4k_16m = false;
245 
246 /*
247  * PVO calls.
248  */
249 static int	moea64_pvo_enter(struct pvo_entry *pvo,
250 		    struct pvo_head *pvo_head, struct pvo_entry **oldpvo);
251 static void	moea64_pvo_remove_from_pmap(struct pvo_entry *pvo);
252 static void	moea64_pvo_remove_from_page(struct pvo_entry *pvo);
253 static void	moea64_pvo_remove_from_page_locked(
254 		    struct pvo_entry *pvo, vm_page_t m);
255 static struct	pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t);
256 
257 /*
258  * Utility routines.
259  */
260 static bool		moea64_query_bit(vm_page_t, uint64_t);
261 static u_int		moea64_clear_bit(vm_page_t, uint64_t);
262 static void		moea64_kremove(vm_offset_t);
263 static void		moea64_syncicache(pmap_t pmap, vm_offset_t va,
264 			    vm_paddr_t pa, vm_size_t sz);
265 static void		moea64_pmap_init_qpages(void *);
266 static void		moea64_remove_locked(pmap_t, vm_offset_t,
267 			    vm_offset_t, struct pvo_dlist *);
268 
269 /*
270  * Superpages data and routines.
271  */
272 
273 /*
274  * PVO flags (in vaddr) that must match for promotion to succeed.
275  * Note that protection bits are checked separately, as they reside in
276  * another field.
277  */
278 #define	PVO_FLAGS_PROMOTE	(PVO_WIRED | PVO_MANAGED | PVO_PTEGIDX_VALID)
279 
280 #define	PVO_IS_SP(pvo)		(((pvo)->pvo_vaddr & PVO_LARGE) && \
281 				 (pvo)->pvo_pmap != kernel_pmap)
282 
283 /* Get physical address from PVO. */
284 #define	PVO_PADDR(pvo)		moea64_pvo_paddr(pvo)
285 
286 /* MD page flag indicating that the page is a superpage. */
287 #define	MDPG_ATTR_SP		0x40000000
288 
289 SYSCTL_DECL(_vm_pmap);
290 
291 static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0,
292     "SP page mapping counters");
293 
294 static u_long sp_demotions;
295 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD,
296     &sp_demotions, 0, "SP page demotions");
297 
298 static u_long sp_mappings;
299 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD,
300     &sp_mappings, 0, "SP page mappings");
301 
302 static u_long sp_p_failures;
303 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD,
304     &sp_p_failures, 0, "SP page promotion failures");
305 
306 static u_long sp_p_fail_pa;
307 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD,
308     &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match");
309 
310 static u_long sp_p_fail_flags;
311 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD,
312     &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match");
313 
314 static u_long sp_p_fail_prot;
315 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD,
316     &sp_p_fail_prot, 0,
317     "SP page promotion failure: page protections don't match");
318 
319 static u_long sp_p_fail_wimg;
320 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD,
321     &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match");
322 
323 static u_long sp_promotions;
324 SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD,
325     &sp_promotions, 0, "SP page promotions");
326 
327 static bool moea64_ps_enabled(pmap_t);
328 static void moea64_align_superpage(vm_object_t, vm_ooffset_t,
329     vm_offset_t *, vm_size_t);
330 
331 static int moea64_sp_enter(pmap_t pmap, vm_offset_t va,
332     vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind);
333 static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp,
334     struct pvo_dlist *tofree);
335 
336 #if VM_NRESERVLEVEL > 0
337 static void moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m);
338 #endif
339 static void moea64_sp_demote_aligned(struct pvo_entry *sp);
340 static void moea64_sp_demote(struct pvo_entry *pvo);
341 
342 static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp);
343 static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp,
344     vm_prot_t prot);
345 
346 static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit);
347 static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m,
348     uint64_t ptebit);
349 
350 static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo,
351     vm_offset_t sva, vm_offset_t eva);
352 
353 /*
354  * Kernel MMU interface
355  */
356 void moea64_clear_modify(vm_page_t);
357 void moea64_copy_page(vm_page_t, vm_page_t);
358 void moea64_copy_page_dmap(vm_page_t, vm_page_t);
359 void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
360     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
361 void moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset,
362     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
363 int moea64_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
364     u_int flags, int8_t psind);
365 void moea64_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
366     vm_prot_t);
367 void moea64_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
368 vm_paddr_t moea64_extract(pmap_t, vm_offset_t);
369 vm_page_t moea64_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
370 void moea64_init(void);
371 bool moea64_is_modified(vm_page_t);
372 bool moea64_is_prefaultable(pmap_t, vm_offset_t);
373 bool moea64_is_referenced(vm_page_t);
374 int moea64_ts_referenced(vm_page_t);
375 vm_offset_t moea64_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
376 bool moea64_page_exists_quick(pmap_t, vm_page_t);
377 void moea64_page_init(vm_page_t);
378 int moea64_page_wired_mappings(vm_page_t);
379 int moea64_pinit(pmap_t);
380 void moea64_pinit0(pmap_t);
381 void moea64_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
382 void moea64_qenter(vm_offset_t, vm_page_t *, int);
383 void moea64_qremove(vm_offset_t, int);
384 void moea64_release(pmap_t);
385 void moea64_remove(pmap_t, vm_offset_t, vm_offset_t);
386 void moea64_remove_pages(pmap_t);
387 void moea64_remove_all(vm_page_t);
388 void moea64_remove_write(vm_page_t);
389 void moea64_unwire(pmap_t, vm_offset_t, vm_offset_t);
390 void moea64_zero_page(vm_page_t);
391 void moea64_zero_page_dmap(vm_page_t);
392 void moea64_zero_page_area(vm_page_t, int, int);
393 void moea64_activate(struct thread *);
394 void moea64_deactivate(struct thread *);
395 void *moea64_mapdev(vm_paddr_t, vm_size_t);
396 void *moea64_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
397 void moea64_unmapdev(void *, vm_size_t);
398 vm_paddr_t moea64_kextract(vm_offset_t);
399 void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma);
400 void moea64_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
401 void moea64_kenter(vm_offset_t, vm_paddr_t);
402 int moea64_dev_direct_mapped(vm_paddr_t, vm_size_t);
403 static void moea64_sync_icache(pmap_t, vm_offset_t, vm_size_t);
404 void moea64_dumpsys_map(vm_paddr_t pa, size_t sz,
405     void **va);
406 void moea64_scan_init(void);
407 vm_offset_t moea64_quick_enter_page(vm_page_t m);
408 vm_offset_t moea64_quick_enter_page_dmap(vm_page_t m);
409 void moea64_quick_remove_page(vm_offset_t addr);
410 bool moea64_page_is_mapped(vm_page_t m);
411 static int moea64_map_user_ptr(pmap_t pm,
412     volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen);
413 static int moea64_decode_kernel_ptr(vm_offset_t addr,
414     int *is_user, vm_offset_t *decoded_addr);
415 static size_t moea64_scan_pmap(struct bitset *dump_bitset);
416 static void *moea64_dump_pmap_init(unsigned blkpgs);
417 #ifdef __powerpc64__
418 static void moea64_page_array_startup(long);
419 #endif
420 static int moea64_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
421 
422 static struct pmap_funcs moea64_methods = {
423 	.clear_modify = moea64_clear_modify,
424 	.copy_page = moea64_copy_page,
425 	.copy_pages = moea64_copy_pages,
426 	.enter = moea64_enter,
427 	.enter_object = moea64_enter_object,
428 	.enter_quick = moea64_enter_quick,
429 	.extract = moea64_extract,
430 	.extract_and_hold = moea64_extract_and_hold,
431 	.init = moea64_init,
432 	.is_modified = moea64_is_modified,
433 	.is_prefaultable = moea64_is_prefaultable,
434 	.is_referenced = moea64_is_referenced,
435 	.ts_referenced = moea64_ts_referenced,
436 	.map =      		moea64_map,
437 	.mincore = moea64_mincore,
438 	.page_exists_quick = moea64_page_exists_quick,
439 	.page_init = moea64_page_init,
440 	.page_wired_mappings = moea64_page_wired_mappings,
441 	.pinit = moea64_pinit,
442 	.pinit0 = moea64_pinit0,
443 	.protect = moea64_protect,
444 	.qenter = moea64_qenter,
445 	.qremove = moea64_qremove,
446 	.release = moea64_release,
447 	.remove = moea64_remove,
448 	.remove_pages = moea64_remove_pages,
449 	.remove_all =       	moea64_remove_all,
450 	.remove_write = moea64_remove_write,
451 	.sync_icache = moea64_sync_icache,
452 	.unwire = moea64_unwire,
453 	.zero_page =        	moea64_zero_page,
454 	.zero_page_area = moea64_zero_page_area,
455 	.activate = moea64_activate,
456 	.deactivate =       	moea64_deactivate,
457 	.page_set_memattr = moea64_page_set_memattr,
458 	.quick_enter_page =  moea64_quick_enter_page,
459 	.quick_remove_page =  moea64_quick_remove_page,
460 	.page_is_mapped = moea64_page_is_mapped,
461 #ifdef __powerpc64__
462 	.page_array_startup = moea64_page_array_startup,
463 #endif
464 	.ps_enabled = moea64_ps_enabled,
465 	.align_superpage = moea64_align_superpage,
466 
467 	/* Internal interfaces */
468 	.mapdev = moea64_mapdev,
469 	.mapdev_attr = moea64_mapdev_attr,
470 	.unmapdev = moea64_unmapdev,
471 	.kextract = moea64_kextract,
472 	.kenter = moea64_kenter,
473 	.kenter_attr = moea64_kenter_attr,
474 	.dev_direct_mapped = moea64_dev_direct_mapped,
475 	.dumpsys_pa_init = moea64_scan_init,
476 	.dumpsys_scan_pmap = moea64_scan_pmap,
477 	.dumpsys_dump_pmap_init =    moea64_dump_pmap_init,
478 	.dumpsys_map_chunk = moea64_dumpsys_map,
479 	.map_user_ptr = moea64_map_user_ptr,
480 	.decode_kernel_ptr =  moea64_decode_kernel_ptr,
481 };
482 
483 MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods);
484 
485 /*
486  * Get physical address from PVO.
487  *
488  * For superpages, the lower bits are not stored in pvo_pte.pa and must be
489  * obtained from VA.
490  */
491 static __always_inline vm_paddr_t
492 moea64_pvo_paddr(struct pvo_entry *pvo)
493 {
494 	vm_paddr_t pa;
495 
496 	pa = (pvo)->pvo_pte.pa & LPTE_RPGN;
497 
498 	if (PVO_IS_SP(pvo)) {
499 		pa &= ~HPT_SP_MASK; /* This is needed to clear LPTE_LP bits. */
500 		pa |= PVO_VADDR(pvo) & HPT_SP_MASK;
501 	}
502 	return (pa);
503 }
504 
505 static struct pvo_head *
506 vm_page_to_pvoh(vm_page_t m)
507 {
508 
509 	rw_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), RA_LOCKED);
510 	return (&m->md.mdpg_pvoh);
511 }
512 
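/*
 * Allocate a PVO.  Before the VM is fully initialized (or when 'bootstrap'
 * is set) entries come from the static moea64_bpvo_pool; afterwards they
 * are allocated from the UMA zone with M_NOWAIT, so callers must be
 * prepared for a NULL return in that case.
 */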
513 static struct pvo_entry *
514 alloc_pvo_entry(int bootstrap)
515 {
516 	struct pvo_entry *pvo;
517 
518 	if (!moea64_initialized || bootstrap) {
519 		if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) {
520 			panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd."
521 			    "Try setting machdep.moea64_bpvo_pool_size tunable",
522 			    __func__, moea64_bpvo_pool_index,
523 			    moea64_bpvo_pool_size,
524 			    moea64_bpvo_pool_size * sizeof(struct pvo_entry));
525 		}
526 		pvo = &moea64_bpvo_pool[
527 		    atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)];
528 		bzero(pvo, sizeof(*pvo));
529 		pvo->pvo_vaddr = PVO_BOOTSTRAP;
530 	} else
531 		pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO);
532 
533 	return (pvo);
534 }
535 
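/*
 * Fill in the pmap, virtual page number (VPN) and primary PTEG slot of a
 * freshly allocated PVO.  The slot is the hash of the VSID and page index
 * scaled by the eight PTEs per PTEG; the PTE itself is only installed
 * later, when the PVO is entered via moea64_pvo_enter().
 */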
536 static void
537 init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va)
538 {
539 	uint64_t vsid;
540 	uint64_t hash;
541 	int shift;
542 
543 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
544 
545 	pvo->pvo_pmap = pmap;
546 	va &= ~ADDR_POFF;
547 	pvo->pvo_vaddr |= va;
548 	vsid = va_to_vsid(pmap, va);
549 	pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
550 	    | (vsid << 16);
551 
552 	if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0)
553 		shift = moea64_large_page_shift;
554 	else
555 		shift = ADDR_PIDX_SHFT;
556 	hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift);
557 	pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3;
558 }
559 
560 static void
561 free_pvo_entry(struct pvo_entry *pvo)
562 {
563 
564 	if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
565 		uma_zfree(moea64_pvo_zone, pvo);
566 }
567 
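/*
 * Translate a PVO into the architected hashed page table entry (struct
 * lpte): the VPN and flag bits (valid, large, wired, HID) go in the high
 * doubleword; the physical address, WIMG bits and protection-derived
 * PP/no-execute bits go in the low doubleword.
 */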
568 void
569 moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte)
570 {
571 
572 	lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo);
573 	lpte->pte_hi |= LPTE_VALID;
574 
575 	if (pvo->pvo_vaddr & PVO_LARGE)
576 		lpte->pte_hi |= LPTE_BIG;
577 	if (pvo->pvo_vaddr & PVO_WIRED)
578 		lpte->pte_hi |= LPTE_WIRED;
579 	if (pvo->pvo_vaddr & PVO_HID)
580 		lpte->pte_hi |= LPTE_HID;
581 
582 	lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */
583 	if (pvo->pvo_pte.prot & VM_PROT_WRITE)
584 		lpte->pte_lo |= LPTE_BW;
585 	else
586 		lpte->pte_lo |= LPTE_BR;
587 
588 	if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE))
589 		lpte->pte_lo |= LPTE_NOEXEC;
590 }
591 
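/*
 * Compute the WIMG storage-attribute bits for a physical address.  An
 * explicit memory attribute wins; otherwise addresses inside a known RAM
 * region get LPTE_M (memory coherence required) and everything else is
 * treated as cache-inhibited, guarded I/O space.
 */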
592 static __inline uint64_t
593 moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
594 {
595 	uint64_t pte_lo;
596 	int i;
597 
598 	if (ma != VM_MEMATTR_DEFAULT) {
599 		switch (ma) {
600 		case VM_MEMATTR_UNCACHEABLE:
601 			return (LPTE_I | LPTE_G);
602 		case VM_MEMATTR_CACHEABLE:
603 			return (LPTE_M);
604 		case VM_MEMATTR_WRITE_COMBINING:
605 		case VM_MEMATTR_WRITE_BACK:
606 		case VM_MEMATTR_PREFETCHABLE:
607 			return (LPTE_I);
608 		case VM_MEMATTR_WRITE_THROUGH:
609 			return (LPTE_W | LPTE_M);
610 		}
611 	}
612 
613 	/*
614 	 * Assume the page is cache inhibited and access is guarded unless
615 	 * it's in our available memory array.
616 	 */
617 	pte_lo = LPTE_I | LPTE_G;
618 	for (i = 0; i < pregions_sz; i++) {
619 		if ((pa >= pregions[i].mr_start) &&
620 		    (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
621 			pte_lo &= ~(LPTE_I | LPTE_G);
622 			pte_lo |= LPTE_M;
623 			break;
624 		}
625 	}
626 
627 	return pte_lo;
628 }
629 
630 /*
631  * Quick sort callout for comparing memory regions.
632  */
633 static int	om_cmp(const void *a, const void *b);
634 
635 static int
636 om_cmp(const void *a, const void *b)
637 {
638 	const struct	ofw_map *mapa;
639 	const struct	ofw_map *mapb;
640 
641 	mapa = a;
642 	mapb = b;
643 	if (mapa->om_pa < mapb->om_pa)
644 		return (-1);
645 	else if (mapa->om_pa > mapb->om_pa)
646 		return (1);
647 	else
648 		return (0);
649 }
650 
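/*
 * Re-enter the active Open Firmware translations into the kernel pmap.
 * The "translations" property is an array of (va, len, pa, mode) records;
 * entries are sorted by physical address and entered with moea64_kenter()
 * unless a mapping already exists or the range is covered by the direct
 * map with the expected attributes.
 */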
651 static void
652 moea64_add_ofw_mappings(phandle_t mmu, size_t sz)
653 {
654 	struct ofw_map	translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */
655 	pcell_t		acells, trans_cells[sz/sizeof(cell_t)];
656 	struct pvo_entry *pvo;
657 	register_t	msr;
658 	vm_offset_t	off;
659 	vm_paddr_t	pa_base;
660 	int		i, j;
661 
662 	bzero(translations, sz);
663 	OF_getencprop(OF_finddevice("/"), "#address-cells", &acells,
664 	    sizeof(acells));
665 	if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1)
666 		panic("moea64_bootstrap: can't get ofw translations");
667 
668 	CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations");
669 	sz /= sizeof(cell_t);
670 	for (i = 0, j = 0; i < sz; j++) {
671 		translations[j].om_va = trans_cells[i++];
672 		translations[j].om_len = trans_cells[i++];
673 		translations[j].om_pa = trans_cells[i++];
674 		if (acells == 2) {
675 			translations[j].om_pa <<= 32;
676 			translations[j].om_pa |= trans_cells[i++];
677 		}
678 		translations[j].om_mode = trans_cells[i++];
679 	}
680 	KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)",
681 	    i, sz));
682 
683 	sz = j;
684 	qsort(translations, sz, sizeof (*translations), om_cmp);
685 
686 	for (i = 0; i < sz; i++) {
687 		pa_base = translations[i].om_pa;
688 	      #ifndef __powerpc64__
689 		if ((translations[i].om_pa >> 32) != 0)
690 			panic("OFW translations above 32-bit boundary!");
691 	      #endif
692 
693 		if (pa_base % PAGE_SIZE)
694 			panic("OFW translation not page-aligned (phys)!");
695 		if (translations[i].om_va % PAGE_SIZE)
696 			panic("OFW translation not page-aligned (virt)!");
697 
698 		CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x",
699 		    pa_base, translations[i].om_va, translations[i].om_len);
700 
701 		/* Now enter the pages for this mapping */
702 
703 		DISABLE_TRANS(msr);
704 		for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
705 			/* If this address is direct-mapped, skip remapping */
706 			if (hw_direct_map &&
707 			    translations[i].om_va == PHYS_TO_DMAP(pa_base) &&
708 			    moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT)
709  			    == LPTE_M)
710 				continue;
711 
712 			PMAP_LOCK(kernel_pmap);
713 			pvo = moea64_pvo_find_va(kernel_pmap,
714 			    translations[i].om_va + off);
715 			PMAP_UNLOCK(kernel_pmap);
716 			if (pvo != NULL)
717 				continue;
718 
719 			moea64_kenter(translations[i].om_va + off,
720 			    pa_base + off);
721 		}
722 		ENABLE_TRANS(msr);
723 	}
724 }
725 
726 #ifdef __powerpc64__
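/*
 * Determine the large-page geometry used for the kernel direct map.  On
 * 970-class CPUs large pages must first be enabled in HID4; if no size has
 * been established by this point, a 16 MB page size is assumed.
 */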
727 static void
728 moea64_probe_large_page(void)
729 {
730 	uint16_t pvr = mfpvr() >> 16;
731 
732 	switch (pvr) {
733 	case IBM970:
734 	case IBM970FX:
735 	case IBM970MP:
736 		powerpc_sync(); isync();
737 		mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG);
738 		powerpc_sync(); isync();
739 
740 		/* FALLTHROUGH */
741 	default:
742 		if (moea64_large_page_size == 0) {
743 			moea64_large_page_size = 0x1000000; /* 16 MB */
744 			moea64_large_page_shift = 24;
745 		}
746 	}
747 
748 	moea64_large_page_mask = moea64_large_page_size - 1;
749 }
750 
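/*
 * Pre-install a kernel SLB entry for the segment containing 'va' so that
 * early boot code never takes an SLB fault on it; 'large' requests a
 * large-page (SLBV_L) segment.
 */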
751 static void
752 moea64_bootstrap_slb_prefault(vm_offset_t va, int large)
753 {
754 	struct slb *cache;
755 	struct slb entry;
756 	uint64_t esid, slbe;
757 	uint64_t i;
758 
759 	cache = PCPU_GET(aim.slb);
760 	esid = va >> ADDR_SR_SHFT;
761 	slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
762 
763 	for (i = 0; i < 64; i++) {
764 		if (cache[i].slbe == (slbe | i))
765 			return;
766 	}
767 
768 	entry.slbe = slbe;
769 	entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT;
770 	if (large)
771 		entry.slbv |= SLBV_L;
772 
773 	slb_insert_kernel(entry.slbe, entry.slbv);
774 }
775 #endif
776 
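/*
 * Enter a wired, large-page kernel mapping (used while building the direct
 * map).  The mapping is read/write/execute with the supplied WIMG attribute
 * bits OR'd in, and any failure to insert it is fatal.
 */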
777 static int
778 moea64_kenter_large(vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap)
779 {
780 	struct pvo_entry *pvo;
781 	uint64_t pte_lo;
782 	int error;
783 
784 	pte_lo = LPTE_M;
785 	pte_lo |= attr;
786 
787 	pvo = alloc_pvo_entry(bootstrap);
788 	pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE;
789 	init_pvo_entry(pvo, kernel_pmap, va);
790 
791 	pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE |
792 	    VM_PROT_EXECUTE;
793 	pvo->pvo_pte.pa = pa | pte_lo;
794 	error = moea64_pvo_enter(pvo, NULL, NULL);
795 	if (error != 0)
796 		panic("Error %d inserting large page\n", error);
797 	return (0);
798 }
799 
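/*
 * Build the physical-memory direct map out of large pages when they are
 * available, and make sure the kernel text/data, the bootstrap PVO pool and
 * the exception vectors stay mapped on configurations without a direct map
 * (or where the kernel is not yet executing from it).
 */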
800 static void
801 moea64_setup_direct_map(vm_offset_t kernelstart,
802     vm_offset_t kernelend)
803 {
804 	register_t msr;
805 	vm_paddr_t pa, pkernelstart, pkernelend;
806 	vm_offset_t size, off;
807 	uint64_t pte_lo;
808 	int i;
809 
810 	if (moea64_large_page_size == 0)
811 		hw_direct_map = 0;
812 
813 	DISABLE_TRANS(msr);
814 	if (hw_direct_map) {
815 		PMAP_LOCK(kernel_pmap);
816 		for (i = 0; i < pregions_sz; i++) {
817 		  for (pa = pregions[i].mr_start; pa < pregions[i].mr_start +
818 		     pregions[i].mr_size; pa += moea64_large_page_size) {
819 			pte_lo = LPTE_M;
820 			if (pa & moea64_large_page_mask) {
821 				pa &= moea64_large_page_mask;
822 				pte_lo |= LPTE_G;
823 			}
824 			if (pa + moea64_large_page_size >
825 			    pregions[i].mr_start + pregions[i].mr_size)
826 				pte_lo |= LPTE_G;
827 
828 			moea64_kenter_large(PHYS_TO_DMAP(pa), pa, pte_lo, 1);
829 		  }
830 		}
831 		PMAP_UNLOCK(kernel_pmap);
832 	}
833 
834 	/*
835 	 * Make sure the kernel and BPVO pool stay mapped on systems either
836 	 * without a direct map or on which the kernel is not already executing
837 	 * out of the direct-mapped region.
838 	 */
839 	if (kernelstart < DMAP_BASE_ADDRESS) {
840 		/*
841 		 * For pre-dmap execution, we need to use identity mapping
842 		 * because we will be operating with the mmu on but in the
843 		 * wrong address configuration until we __restartkernel().
844 		 */
845 		for (pa = kernelstart & ~PAGE_MASK; pa < kernelend;
846 		    pa += PAGE_SIZE)
847 			moea64_kenter(pa, pa);
848 	} else if (!hw_direct_map) {
849 		pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS;
850 		pkernelend = kernelend & ~DMAP_BASE_ADDRESS;
851 		for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend;
852 		    pa += PAGE_SIZE)
853 			moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
854 	}
855 
856 	if (!hw_direct_map) {
857 		size = moea64_bpvo_pool_size*sizeof(struct pvo_entry);
858 		off = (vm_offset_t)(moea64_bpvo_pool);
859 		for (pa = off; pa < off + size; pa += PAGE_SIZE)
860 			moea64_kenter(pa, pa);
861 
862 		/* Map exception vectors */
863 		for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE)
864 			moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
865 	}
866 	ENABLE_TRANS(msr);
867 
868 	/*
869 	 * Allow user to override unmapped_buf_allowed for testing.
870 	 * XXXKIB Only direct map implementation was tested.
871 	 */
872 	if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
873 	    &unmapped_buf_allowed))
874 		unmapped_buf_allowed = hw_direct_map;
875 }
876 
877 /* Quick sort callout for comparing physical addresses. */
878 static int
879 pa_cmp(const void *a, const void *b)
880 {
881 	const vm_paddr_t *pa = a, *pb = b;
882 
883 	if (*pa < *pb)
884 		return (-1);
885 	else if (*pa > *pb)
886 		return (1);
887 	else
888 		return (0);
889 }
890 
891 void
892 moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
893 {
894 	int		i, j;
895 	vm_size_t	physsz, hwphyssz;
896 	vm_paddr_t	kernelphysstart, kernelphysend;
897 	int		rm_pavail;
898 
899 	/* Level 0 reservations consist of 4096 pages (16MB superpage). */
900 	vm_level_0_order = VM_LEVEL_0_ORDER_HPT;
901 
902 #ifndef __powerpc64__
903 	/* We don't have a direct map since there is no BAT */
904 	hw_direct_map = 0;
905 
906 	/* Make sure battable is zero, since we have no BAT */
907 	for (i = 0; i < 16; i++) {
908 		battable[i].batu = 0;
909 		battable[i].batl = 0;
910 	}
911 #else
912 	/* Install trap handlers for SLBs */
913 	bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap);
914 	bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap);
915 	__syncicache((void *)EXC_DSE, 0x80);
916 	__syncicache((void *)EXC_ISE, 0x80);
917 #endif
918 
919 	kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS;
920 	kernelphysend = kernelend & ~DMAP_BASE_ADDRESS;
921 
922 	/* Get physical memory regions from firmware */
923 	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
924 	CTR0(KTR_PMAP, "moea64_bootstrap: physical memory");
925 
926 	if (PHYS_AVAIL_ENTRIES < regions_sz)
927 		panic("moea64_bootstrap: phys_avail too small");
928 
929 	phys_avail_count = 0;
930 	physsz = 0;
931 	hwphyssz = 0;
932 	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
933 	for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
934 		CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)",
935 		    regions[i].mr_start, regions[i].mr_start +
936 		    regions[i].mr_size, regions[i].mr_size);
937 		if (hwphyssz != 0 &&
938 		    (physsz + regions[i].mr_size) >= hwphyssz) {
939 			if (physsz < hwphyssz) {
940 				phys_avail[j] = regions[i].mr_start;
941 				phys_avail[j + 1] = regions[i].mr_start +
942 				    hwphyssz - physsz;
943 				physsz = hwphyssz;
944 				phys_avail_count++;
945 				dump_avail[j] = phys_avail[j];
946 				dump_avail[j + 1] = phys_avail[j + 1];
947 			}
948 			break;
949 		}
950 		phys_avail[j] = regions[i].mr_start;
951 		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
952 		phys_avail_count++;
953 		physsz += regions[i].mr_size;
954 		dump_avail[j] = phys_avail[j];
955 		dump_avail[j + 1] = phys_avail[j + 1];
956 	}
957 
958 	/* Check for overlap with the kernel and exception vectors */
959 	rm_pavail = 0;
960 	for (j = 0; j < 2*phys_avail_count; j+=2) {
961 		if (phys_avail[j] < EXC_LAST)
962 			phys_avail[j] += EXC_LAST;
963 
964 		if (phys_avail[j] >= kernelphysstart &&
965 		    phys_avail[j+1] <= kernelphysend) {
966 			phys_avail[j] = phys_avail[j+1] = ~0;
967 			rm_pavail++;
968 			continue;
969 		}
970 
971 		if (kernelphysstart >= phys_avail[j] &&
972 		    kernelphysstart < phys_avail[j+1]) {
973 			if (kernelphysend < phys_avail[j+1]) {
974 				phys_avail[2*phys_avail_count] =
975 				    (kernelphysend & ~PAGE_MASK) + PAGE_SIZE;
976 				phys_avail[2*phys_avail_count + 1] =
977 				    phys_avail[j+1];
978 				phys_avail_count++;
979 			}
980 
981 			phys_avail[j+1] = kernelphysstart & ~PAGE_MASK;
982 		}
983 
984 		if (kernelphysend >= phys_avail[j] &&
985 		    kernelphysend < phys_avail[j+1]) {
986 			if (kernelphysstart > phys_avail[j]) {
987 				phys_avail[2*phys_avail_count] = phys_avail[j];
988 				phys_avail[2*phys_avail_count + 1] =
989 				    kernelphysstart & ~PAGE_MASK;
990 				phys_avail_count++;
991 			}
992 
993 			phys_avail[j] = (kernelphysend & ~PAGE_MASK) +
994 			    PAGE_SIZE;
995 		}
996 	}
997 
998 	/* Remove physical available regions marked for removal (~0) */
999 	if (rm_pavail) {
1000 		qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]),
1001 			pa_cmp);
1002 		phys_avail_count -= rm_pavail;
1003 		for (i = 2*phys_avail_count;
1004 		     i < 2*(phys_avail_count + rm_pavail); i+=2)
1005 			phys_avail[i] = phys_avail[i+1] = 0;
1006 	}
1007 
1008 	physmem = btoc(physsz);
1009 
1010 #ifdef PTEGCOUNT
1011 	moea64_pteg_count = PTEGCOUNT;
1012 #else
1013 	moea64_pteg_count = 0x1000;
1014 
1015 	while (moea64_pteg_count < physmem)
1016 		moea64_pteg_count <<= 1;
1017 
1018 	moea64_pteg_count >>= 1;
1019 #endif /* PTEGCOUNT */
1020 }
1021 
1022 void
1023 moea64_mid_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
1024 {
1025 	int		i;
1026 
1027 	/*
1028 	 * Set PTEG mask
1029 	 */
1030 	moea64_pteg_mask = moea64_pteg_count - 1;
1031 
1032 	/*
1033 	 * Initialize SLB table lock and page locks
1034 	 */
1035 	mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF);
1036 	for (i = 0; i < PV_LOCK_COUNT; i++)
1037 		rw_init(&pv_lock[i], "pv lock");
1038 
1039 	/*
1040 	 * Initialise the bootstrap pvo pool.
1041 	 */
1042 	TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size);
1043 	if (moea64_bpvo_pool_size == 0) {
1044 		if (!hw_direct_map)
1045 			moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) /
1046 			    (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR;
1047 		else
1048 			moea64_bpvo_pool_size = BPVO_POOL_SIZE;
1049 	}
1050 
1051 	if (boothowto & RB_VERBOSE) {
1052 		printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n",
1053 		    moea64_bpvo_pool_size,
1054 		    moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576);
1055 	}
1056 
1057 	moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc(
1058 		moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE);
1059 	moea64_bpvo_pool_index = 0;
1060 
1061 	/* Place at address usable through the direct map */
1062 	if (hw_direct_map)
1063 		moea64_bpvo_pool = (struct pvo_entry *)
1064 		    PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool);
1065 
1066 	/*
1067 	 * Make sure kernel vsid is allocated as well as VSID 0.
1068 	 */
1069 	#ifndef __powerpc64__
1070 	moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW]
1071 		|= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
1072 	moea64_vsid_bitmap[0] |= 1;
1073 	#endif
1074 
1075 	/*
1076 	 * Initialize the kernel pmap (which is statically allocated).
1077 	 */
1078 	#ifdef __powerpc64__
1079 	for (i = 0; i < 64; i++) {
1080 		pcpup->pc_aim.slb[i].slbv = 0;
1081 		pcpup->pc_aim.slb[i].slbe = 0;
1082 	}
1083 	#else
1084 	for (i = 0; i < 16; i++)
1085 		kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
1086 	#endif
1087 
1088 	kernel_pmap->pmap_phys = kernel_pmap;
1089 	CPU_FILL(&kernel_pmap->pm_active);
1090 	RB_INIT(&kernel_pmap->pmap_pvo);
1091 
1092 	PMAP_LOCK_INIT(kernel_pmap);
1093 
1094 	/*
1095 	 * Now map in all the other buffers we allocated earlier
1096 	 */
1097 
1098 	moea64_setup_direct_map(kernelstart, kernelend);
1099 }
1100 
1101 void
1102 moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
1103 {
1104 	ihandle_t	mmui;
1105 	phandle_t	chosen;
1106 	phandle_t	mmu;
1107 	ssize_t		sz;
1108 	int		i;
1109 	vm_offset_t	pa, va;
1110 	void		*dpcpu;
1111 
1112 	/*
1113 	 * Set up the Open Firmware pmap and add its mappings if not in real
1114 	 * mode.
1115 	 */
1116 
1117 	chosen = OF_finddevice("/chosen");
1118 	if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) {
1119 		mmu = OF_instance_to_package(mmui);
1120 		if (mmu == -1 ||
1121 		    (sz = OF_getproplen(mmu, "translations")) == -1)
1122 			sz = 0;
1123 		if (sz > 6144 /* tmpstksz - 2 KB headroom */)
1124 			panic("moea64_bootstrap: too many ofw translations");
1125 
1126 		if (sz > 0)
1127 			moea64_add_ofw_mappings(mmu, sz);
1128 	}
1129 
1130 	/*
1131 	 * Calculate the last available physical address.
1132 	 */
1133 	Maxmem = 0;
1134 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
1135 		Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
1136 
1137 	/*
1138 	 * Initialize MMU.
1139 	 */
1140 	pmap_cpu_bootstrap(0);
1141 	mtmsr(mfmsr() | PSL_DR | PSL_IR);
1142 	pmap_bootstrapped++;
1143 
1144 	/*
1145 	 * Set the start and end of kva.
1146 	 */
1147 	virtual_avail = VM_MIN_KERNEL_ADDRESS;
1148 	virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
1149 
1150 	/*
1151 	 * Map the entire KVA range into the SLB. We must not fault there.
1152 	 */
1153 	#ifdef __powerpc64__
1154 	for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH)
1155 		moea64_bootstrap_slb_prefault(va, 0);
1156 	#endif
1157 
1158 	/*
1159 	 * Remap any early IO mappings (console framebuffer, etc.)
1160 	 */
1161 	bs_remap_earlyboot();
1162 
1163 	/*
1164 	 * Figure out how far we can extend virtual_end into segment 16
1165 	 * without running into existing mappings. Segment 16 is guaranteed
1166 	 * to contain neither RAM nor devices (at least on Apple hardware),
1167 	 * but will generally contain some OFW mappings we should not
1168 	 * step on.
1169 	 */
1170 
1171 	#ifndef __powerpc64__	/* KVA is in high memory on PPC64 */
1172 	PMAP_LOCK(kernel_pmap);
1173 	while (virtual_end < VM_MAX_KERNEL_ADDRESS &&
1174 	    moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL)
1175 		virtual_end += PAGE_SIZE;
1176 	PMAP_UNLOCK(kernel_pmap);
1177 	#endif
1178 
1179 	/*
1180 	 * Allocate a kernel stack with a guard page for thread0 and map it
1181 	 * into the kernel page map.
1182 	 */
1183 	pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
1184 	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
1185 	virtual_avail = va + kstack_pages * PAGE_SIZE;
1186 	CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
1187 	thread0.td_kstack = va;
1188 	thread0.td_kstack_pages = kstack_pages;
1189 	for (i = 0; i < kstack_pages; i++) {
1190 		moea64_kenter(va, pa);
1191 		pa += PAGE_SIZE;
1192 		va += PAGE_SIZE;
1193 	}
1194 
1195 	/*
1196 	 * Allocate virtual address space for the message buffer.
1197 	 */
1198 	pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE);
1199 	msgbufp = (struct msgbuf *)virtual_avail;
1200 	va = virtual_avail;
1201 	virtual_avail += round_page(msgbufsize);
1202 	while (va < virtual_avail) {
1203 		moea64_kenter(va, pa);
1204 		pa += PAGE_SIZE;
1205 		va += PAGE_SIZE;
1206 	}
1207 
1208 	/*
1209 	 * Allocate virtual address space for the dynamic percpu area.
1210 	 */
1211 	pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
1212 	dpcpu = (void *)virtual_avail;
1213 	va = virtual_avail;
1214 	virtual_avail += DPCPU_SIZE;
1215 	while (va < virtual_avail) {
1216 		moea64_kenter(va, pa);
1217 		pa += PAGE_SIZE;
1218 		va += PAGE_SIZE;
1219 	}
1220 	dpcpu_init(dpcpu, curcpu);
1221 
1222 	crashdumpmap = (caddr_t)virtual_avail;
1223 	virtual_avail += MAXDUMPPGS * PAGE_SIZE;
1224 
1225 	/*
1226 	 * Allocate some things for page zeroing. We put this directly
1227  * in the page table and use MOEA64_PTE_REPLACE to keep the
1228  * PVO book-keeping and other parts of the VM system
1229  * from even knowing that this hack exists.
1230 	 */
1231 
1232 	if (!hw_direct_map) {
1233 		mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL,
1234 		    MTX_DEF);
1235 		for (i = 0; i < 2; i++) {
1236 			moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
1237 			virtual_end -= PAGE_SIZE;
1238 
1239 			moea64_kenter(moea64_scratchpage_va[i], 0);
1240 
1241 			PMAP_LOCK(kernel_pmap);
1242 			moea64_scratchpage_pvo[i] = moea64_pvo_find_va(
1243 			    kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]);
1244 			PMAP_UNLOCK(kernel_pmap);
1245 		}
1246 	}
1247 
1248 	numa_mem_regions(&numa_pregions, &numapregions_sz);
1249 }
1250 
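/*
 * Allocate a per-CPU KVA page and look up its PVO for use by
 * moea64_quick_enter_page()/moea64_quick_remove_page() on systems without
 * a direct map.
 */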
1251 static void
1252 moea64_pmap_init_qpages(void *dummy __unused)
1253 {
1254 	struct pcpu *pc;
1255 	int i;
1256 
1257 	if (hw_direct_map)
1258 		return;
1259 
1260 	CPU_FOREACH(i) {
1261 		pc = pcpu_find(i);
1262 		pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
1263 		if (pc->pc_qmap_addr == 0)
1264 			panic("pmap_init_qpages: unable to allocate KVA");
1265 		PMAP_LOCK(kernel_pmap);
1266 		pc->pc_aim.qmap_pvo =
1267 		    moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr);
1268 		PMAP_UNLOCK(kernel_pmap);
1269 		mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF);
1270 	}
1271 }
1272 
1273 SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL);
1274 
1275 /*
1276  * Activate a user pmap.  This mostly involves setting some non-CPU
1277  * state.
1278  */
1279 void
1280 moea64_activate(struct thread *td)
1281 {
1282 	pmap_t	pm;
1283 
1284 	pm = &td->td_proc->p_vmspace->vm_pmap;
1285 	CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
1286 
1287 	#ifdef __powerpc64__
1288 	PCPU_SET(aim.userslb, pm->pm_slb);
1289 	__asm __volatile("slbmte %0, %1; isync" ::
1290 	    "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE));
1291 	#else
1292 	PCPU_SET(curpmap, pm->pmap_phys);
1293 	mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid);
1294 	#endif
1295 }
1296 
1297 void
1298 moea64_deactivate(struct thread *td)
1299 {
1300 	pmap_t	pm;
1301 
1302 	__asm __volatile("isync; slbie %0" :: "r"(USER_ADDR));
1303 
1304 	pm = &td->td_proc->p_vmspace->vm_pmap;
1305 	CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
1306 	#ifdef __powerpc64__
1307 	PCPU_SET(aim.userslb, NULL);
1308 	#else
1309 	PCPU_SET(curpmap, NULL);
1310 	#endif
1311 }
1312 
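/*
 * Clear the wired attribute from the mappings in [sva, eva).  Superpages
 * that lie entirely inside the range are unwired as a unit; ones that only
 * partially overlap are demoted first.  For managed, writable mappings the
 * reference/change bits are folded back into the vm_page as the PTEs are
 * replaced.
 */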
1313 void
1314 moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1315 {
1316 	struct	pvo_entry key, *pvo;
1317 	vm_page_t m;
1318 	int64_t	refchg;
1319 
1320 	key.pvo_vaddr = sva;
1321 	PMAP_LOCK(pm);
1322 	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1323 	    pvo != NULL && PVO_VADDR(pvo) < eva;
1324 	    pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
1325 		if (PVO_IS_SP(pvo)) {
1326 			if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
1327 				pvo = moea64_sp_unwire(pvo);
1328 				continue;
1329 			} else {
1330 				CTR1(KTR_PMAP, "%s: demote before unwire",
1331 				    __func__);
1332 				moea64_sp_demote(pvo);
1333 			}
1334 		}
1335 
1336 		if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
1337 			panic("moea64_unwire: pvo %p is missing PVO_WIRED",
1338 			    pvo);
1339 		pvo->pvo_vaddr &= ~PVO_WIRED;
1340 		refchg = moea64_pte_replace(pvo, 0 /* No invalidation */);
1341 		if ((pvo->pvo_vaddr & PVO_MANAGED) &&
1342 		    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1343 			if (refchg < 0)
1344 				refchg = LPTE_CHG;
1345 			m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
1346 
1347 			refchg |= atomic_readandclear_32(&m->md.mdpg_attrs);
1348 			if (refchg & LPTE_CHG)
1349 				vm_page_dirty(m);
1350 			if (refchg & LPTE_REF)
1351 				vm_page_aflag_set(m, PGA_REFERENCED);
1352 		}
1353 		pm->pm_stats.wired_count--;
1354 	}
1355 	PMAP_UNLOCK(pm);
1356 }
1357 
1358 static int
1359 moea64_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
1360 {
1361 	struct pvo_entry *pvo;
1362 	vm_paddr_t pa;
1363 	vm_page_t m;
1364 	int val;
1365 	bool managed;
1366 
1367 	PMAP_LOCK(pmap);
1368 
1369 	pvo = moea64_pvo_find_va(pmap, addr);
1370 	if (pvo != NULL) {
1371 		pa = PVO_PADDR(pvo);
1372 		m = PHYS_TO_VM_PAGE(pa);
1373 		managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED;
1374 		if (PVO_IS_SP(pvo))
1375 			val = MINCORE_INCORE | MINCORE_PSIND(1);
1376 		else
1377 			val = MINCORE_INCORE;
1378 	} else {
1379 		PMAP_UNLOCK(pmap);
1380 		return (0);
1381 	}
1382 
1383 	PMAP_UNLOCK(pmap);
1384 
1385 	if (m == NULL)
1386 		return (0);
1387 
1388 	if (managed) {
1389 		if (moea64_is_modified(m))
1390 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
1391 
1392 		if (moea64_is_referenced(m))
1393 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
1394 	}
1395 
1396 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
1397 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
1398 	    managed) {
1399 		*pap = pa;
1400 	}
1401 
1402 	return (val);
1403 }
1404 
1405 /*
1406  * This goes through and sets the physical address of our
1407  * special scratch PTE to the PA we want to zero or copy. Because
1408  * of locking issues (this can get called in pvo_enter() by
1409  * the UMA allocator), we can't use most other utility functions here.
1410  */
1411 
1412 static __inline
1413 void moea64_set_scratchpage_pa(int which, vm_paddr_t pa)
1414 {
1415 	struct pvo_entry *pvo;
1416 
1417 	KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!"));
1418 	mtx_assert(&moea64_scratchpage_mtx, MA_OWNED);
1419 
1420 	pvo = moea64_scratchpage_pvo[which];
1421 	PMAP_LOCK(pvo->pvo_pmap);
1422 	pvo->pvo_pte.pa =
1423 	    moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa;
1424 	moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
1425 	PMAP_UNLOCK(pvo->pvo_pmap);
1426 	isync();
1427 }
1428 
1429 void
1430 moea64_copy_page(vm_page_t msrc, vm_page_t mdst)
1431 {
1432 	mtx_lock(&moea64_scratchpage_mtx);
1433 
1434 	moea64_set_scratchpage_pa(0, VM_PAGE_TO_PHYS(msrc));
1435 	moea64_set_scratchpage_pa(1, VM_PAGE_TO_PHYS(mdst));
1436 
1437 	bcopy((void *)moea64_scratchpage_va[0],
1438 	    (void *)moea64_scratchpage_va[1], PAGE_SIZE);
1439 
1440 	mtx_unlock(&moea64_scratchpage_mtx);
1441 }
1442 
1443 void
1444 moea64_copy_page_dmap(vm_page_t msrc, vm_page_t mdst)
1445 {
1446 	vm_offset_t	dst;
1447 	vm_offset_t	src;
1448 
1449 	dst = VM_PAGE_TO_PHYS(mdst);
1450 	src = VM_PAGE_TO_PHYS(msrc);
1451 
1452 	bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst),
1453 	    PAGE_SIZE);
1454 }
1455 
1456 inline void
1457 moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset,
1458     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1459 {
1460 	void *a_cp, *b_cp;
1461 	vm_offset_t a_pg_offset, b_pg_offset;
1462 	int cnt;
1463 
1464 	while (xfersize > 0) {
1465 		a_pg_offset = a_offset & PAGE_MASK;
1466 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1467 		a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1468 		    VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
1469 		    a_pg_offset;
1470 		b_pg_offset = b_offset & PAGE_MASK;
1471 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1472 		b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1473 		    VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
1474 		    b_pg_offset;
1475 		bcopy(a_cp, b_cp, cnt);
1476 		a_offset += cnt;
1477 		b_offset += cnt;
1478 		xfersize -= cnt;
1479 	}
1480 }
1481 
1482 void
1483 moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
1484     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1485 {
1486 	void *a_cp, *b_cp;
1487 	vm_offset_t a_pg_offset, b_pg_offset;
1488 	int cnt;
1489 
1490 	mtx_lock(&moea64_scratchpage_mtx);
1491 	while (xfersize > 0) {
1492 		a_pg_offset = a_offset & PAGE_MASK;
1493 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1494 		moea64_set_scratchpage_pa(0,
1495 		    VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]));
1496 		a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset;
1497 		b_pg_offset = b_offset & PAGE_MASK;
1498 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1499 		moea64_set_scratchpage_pa(1,
1500 		    VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]));
1501 		b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset;
1502 		bcopy(a_cp, b_cp, cnt);
1503 		a_offset += cnt;
1504 		b_offset += cnt;
1505 		xfersize -= cnt;
1506 	}
1507 	mtx_unlock(&moea64_scratchpage_mtx);
1508 }
1509 
1510 void
1511 moea64_zero_page_area(vm_page_t m, int off, int size)
1512 {
1513 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1514 
1515 	if (size + off > PAGE_SIZE)
1516 		panic("moea64_zero_page_area: size + off > PAGE_SIZE");
1517 
1518 	if (hw_direct_map) {
1519 		bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size);
1520 	} else {
1521 		mtx_lock(&moea64_scratchpage_mtx);
1522 		moea64_set_scratchpage_pa(0, pa);
1523 		bzero((caddr_t)moea64_scratchpage_va[0] + off, size);
1524 		mtx_unlock(&moea64_scratchpage_mtx);
1525 	}
1526 }
1527 
1528 /*
1529  * Zero a page of physical memory by temporarily mapping it
1530  */
1531 void
1532 moea64_zero_page(vm_page_t m)
1533 {
1534 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1535 	vm_offset_t va;
1536 
1537 	mtx_lock(&moea64_scratchpage_mtx);
1538 
1539 	moea64_set_scratchpage_pa(0, pa);
1540 	va = moea64_scratchpage_va[0];
1541 
1542 	bzero((void *)va, PAGE_SIZE);
1543 
1544 	mtx_unlock(&moea64_scratchpage_mtx);
1545 }
1546 
1547 void
1548 moea64_zero_page_dmap(vm_page_t m)
1549 {
1550 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1551 	vm_offset_t va;
1552 
1553 	va = PHYS_TO_DMAP(pa);
1554 	bzero((void *)va, PAGE_SIZE);
1555 }
1556 
1557 vm_offset_t
1558 moea64_quick_enter_page(vm_page_t m)
1559 {
1560 	struct pvo_entry *pvo;
1561 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1562 
1563 	/*
1564  	 * MOEA64_PTE_REPLACE does some locking, so we can't just grab
1565 	 * a critical section and access the PCPU data like on i386.
1566 	 * Instead, pin the thread and grab the PCPU lock to prevent
1567 	 * a preempting thread from using the same PCPU data.
1568 	 */
1569 	sched_pin();
1570 
1571 	mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED);
1572 	pvo = PCPU_GET(aim.qmap_pvo);
1573 
1574 	mtx_lock(PCPU_PTR(aim.qmap_lock));
1575 	pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) |
1576 	    (uint64_t)pa;
1577 	moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
1578 	isync();
1579 
1580 	return (PCPU_GET(qmap_addr));
1581 }
1582 
1583 vm_offset_t
1584 moea64_quick_enter_page_dmap(vm_page_t m)
1585 {
1586 
1587 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
1588 }
1589 
1590 void
1591 moea64_quick_remove_page(vm_offset_t addr)
1592 {
1593 
1594 	mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED);
1595 	KASSERT(PCPU_GET(qmap_addr) == addr,
1596 	    ("moea64_quick_remove_page: invalid address"));
1597 	mtx_unlock(PCPU_PTR(aim.qmap_lock));
1598 	sched_unpin();
1599 }
1600 
1601 bool
1602 moea64_page_is_mapped(vm_page_t m)
1603 {
1604 	return (!LIST_EMPTY(&(m)->md.mdpg_pvoh));
1605 }
1606 
1607 /*
1608  * Map the given physical page at the specified virtual address in the
1609  * target pmap with the protection requested.  If specified the page
1610  * will be wired down.
1611  */
1612 
1613 int
1614 moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
1615     vm_prot_t prot, u_int flags, int8_t psind)
1616 {
1617 	struct		pvo_entry *pvo, *oldpvo, *tpvo;
1618 	struct		pvo_head *pvo_head;
1619 	uint64_t	pte_lo;
1620 	int		error;
1621 	vm_paddr_t	pa;
1622 
1623 	if ((m->oflags & VPO_UNMANAGED) == 0) {
1624 		if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0)
1625 			VM_PAGE_OBJECT_BUSY_ASSERT(m);
1626 		else
1627 			VM_OBJECT_ASSERT_LOCKED(m->object);
1628 	}
1629 
1630 	if (psind > 0)
1631 		return (moea64_sp_enter(pmap, va, m, prot, flags, psind));
1632 
1633 	pvo = alloc_pvo_entry(0);
1634 	if (pvo == NULL)
1635 		return (KERN_RESOURCE_SHORTAGE);
1636 	pvo->pvo_pmap = NULL; /* to be filled in later */
1637 	pvo->pvo_pte.prot = prot;
1638 
1639 	pa = VM_PAGE_TO_PHYS(m);
1640 	pte_lo = moea64_calc_wimg(pa, pmap_page_get_memattr(m));
1641 	pvo->pvo_pte.pa = pa | pte_lo;
1642 
1643 	if ((flags & PMAP_ENTER_WIRED) != 0)
1644 		pvo->pvo_vaddr |= PVO_WIRED;
1645 
1646 	if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) {
1647 		pvo_head = NULL;
1648 	} else {
1649 		pvo_head = &m->md.mdpg_pvoh;
1650 		pvo->pvo_vaddr |= PVO_MANAGED;
1651 	}
1652 
1653 	PV_WR_LOCK(pa);
1654 	PMAP_LOCK(pmap);
1655 	if (pvo->pvo_pmap == NULL)
1656 		init_pvo_entry(pvo, pmap, va);
1657 
1658 	if (moea64_ps_enabled(pmap) &&
1659 	    (tpvo = moea64_pvo_find_va(pmap, va & ~HPT_SP_MASK)) != NULL &&
1660 	    PVO_IS_SP(tpvo)) {
1661 		/* Demote SP before entering a regular page */
1662 		CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx",
1663 		    __func__, (uintmax_t)va);
1664 		moea64_sp_demote_aligned(tpvo);
1665 	}
1666 
1667 	if (prot & VM_PROT_WRITE)
1668 		if (pmap_bootstrapped &&
1669 		    (m->oflags & VPO_UNMANAGED) == 0)
1670 			vm_page_aflag_set(m, PGA_WRITEABLE);
1671 
1672 	error = moea64_pvo_enter(pvo, pvo_head, &oldpvo);
1673 	if (error == EEXIST) {
1674 		if (oldpvo->pvo_vaddr == pvo->pvo_vaddr &&
1675 		    oldpvo->pvo_pte.pa == pvo->pvo_pte.pa &&
1676 		    oldpvo->pvo_pte.prot == prot) {
1677 			/* Identical mapping already exists */
1678 			error = 0;
1679 
1680 			/* If not in page table, reinsert it */
1681 			if (moea64_pte_synch(oldpvo) < 0) {
1682 				STAT_MOEA64(moea64_pte_overflow--);
1683 				moea64_pte_insert(oldpvo);
1684 			}
1685 
1686 			/* Then just clean up and go home */
1687 			PMAP_UNLOCK(pmap);
1688 			PV_UNLOCK(pa);
1689 			free_pvo_entry(pvo);
1690 			pvo = NULL;
1691 			goto out;
1692 		} else {
1693 			/* Otherwise, need to kill it first */
1694 			KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old "
1695 			    "mapping does not match new mapping"));
1696 			moea64_pvo_remove_from_pmap(oldpvo);
1697 			moea64_pvo_enter(pvo, pvo_head, NULL);
1698 		}
1699 	}
1700 	PMAP_UNLOCK(pmap);
1701 	PV_UNLOCK(pa);
1702 
1703 	/* Free any dead pages */
1704 	if (error == EEXIST) {
1705 		moea64_pvo_remove_from_page(oldpvo);
1706 		free_pvo_entry(oldpvo);
1707 	}
1708 
1709 out:
1710 	/*
1711 	 * Flush the page from the instruction cache if this page is
1712 	 * mapped executable and cacheable.
1713 	 */
1714 	if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 &&
1715 	    (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
1716 		vm_page_aflag_set(m, PGA_EXECUTABLE);
1717 		moea64_syncicache(pmap, va, pa, PAGE_SIZE);
1718 	}
1719 
1720 #if VM_NRESERVLEVEL > 0
1721 	/*
1722 	 * Try to promote pages.
1723 	 *
1724 	 * If the VA of the entered page is not aligned with its PA,
1725 	 * don't try page promotion as it is not possible.
1726 	 * This reduces the number of promotion failures dramatically.
1727 	 *
1728 	 * Ignore VM_PROT_NO_PROMOTE unless PMAP_ENTER_QUICK_LOCKED.
1729 	 */
1730 	if (moea64_ps_enabled(pmap) && pmap != kernel_pmap && pvo != NULL &&
1731 	    (pvo->pvo_vaddr & PVO_MANAGED) != 0 &&
1732 	    (va & HPT_SP_MASK) == (pa & HPT_SP_MASK) &&
1733 	    ((prot & VM_PROT_NO_PROMOTE) == 0 ||
1734 	    (flags & PMAP_ENTER_QUICK_LOCKED) == 0) &&
1735 	    (m->flags & PG_FICTITIOUS) == 0 &&
1736 	    vm_reserv_level_iffullpop(m) == 0)
1737 		moea64_sp_promote(pmap, va, m);
1738 #endif
1739 
1740 	return (KERN_SUCCESS);
1741 }
1742 
1743 static void
1744 moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1745     vm_size_t sz)
1746 {
1747 
1748 	/*
1749 	 * This is much trickier than on older systems because
1750 	 * we can't sync the icache on physical addresses directly
1751 	 * without a direct map. Instead we check a couple of cases
1752 	 * where the memory is already mapped in and, failing that,
1753 	 * use the same trick we use for page zeroing to create
1754 	 * a temporary mapping for this physical address.
1755 	 */
1756 
1757 	if (!pmap_bootstrapped) {
1758 		/*
1759 		 * If PMAP is not bootstrapped, we are likely to be
1760 		 * in real mode.
1761 		 */
1762 		__syncicache((void *)(uintptr_t)pa, sz);
1763 	} else if (pmap == kernel_pmap) {
1764 		__syncicache((void *)va, sz);
1765 	} else if (hw_direct_map) {
1766 		__syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz);
1767 	} else {
1768 		/* Use the scratch page to set up a temp mapping */
1769 
1770 		mtx_lock(&moea64_scratchpage_mtx);
1771 
1772 		moea64_set_scratchpage_pa(1, pa & ~ADDR_POFF);
1773 		__syncicache((void *)(moea64_scratchpage_va[1] +
1774 		    (va & ADDR_POFF)), sz);
1775 
1776 		mtx_unlock(&moea64_scratchpage_mtx);
1777 	}
1778 }
1779 
1780 /*
1781  * Maps a sequence of resident pages belonging to the same object.
1782  * The sequence begins with the given page m_start.  This page is
1783  * mapped at the given virtual address start.  Each subsequent page is
1784  * mapped at a virtual address that is offset from start by the same
1785  * amount as the page is offset from m_start within the object.  The
1786  * last page in the sequence is the page with the largest offset from
1787  * m_start that can be mapped at a virtual address less than the given
1788  * virtual address end.  Not every virtual page between start and end
1789  * is mapped; only those for which a resident page exists with the
1790  * corresponding offset from m_start are mapped.
1791  */
1792 void
1793 moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
1794     vm_page_t m_start, vm_prot_t prot)
1795 {
1796 	struct pctrie_iter pages;
1797 	vm_page_t m;
1798 	vm_offset_t va;
1799 	int8_t psind;
1800 
1801 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
1802 
1803 	vm_page_iter_limit_init(&pages, m_start->object,
1804 	    m_start->pindex + atop(end - start));
1805 	m = vm_radix_iter_lookup(&pages, m_start->pindex);
1806 	while (m != NULL) {
1807 		va = start + ptoa(m->pindex - m_start->pindex);
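		/*
		 * Use a superpage mapping when the VA is superpage-aligned,
		 * the whole superpage fits below 'end', superpages are
		 * enabled for this pmap, and the backing pages form a fully
		 * populated run (m->psind == 1).
		 */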
1808 		if ((va & HPT_SP_MASK) == 0 && va + HPT_SP_SIZE <= end &&
1809 		    m->psind == 1 && moea64_ps_enabled(pm))
1810 			psind = 1;
1811 		else
1812 			psind = 0;
1813 		moea64_enter(pm, va, m, prot &
1814 		    (VM_PROT_READ | VM_PROT_EXECUTE),
1815 		    PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind);
1816 		if (psind == 1)
1817 			m = vm_radix_iter_jump(&pages, HPT_SP_SIZE / PAGE_SIZE);
1818 		else
1819 			m = vm_radix_iter_step(&pages);
1820 	}
1821 }
1822 
1823 void
1824 moea64_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m,
1825     vm_prot_t prot)
1826 {
1827 
1828 	moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE |
1829 	    VM_PROT_NO_PROMOTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED,
1830 	    0);
1831 }
1832 
1833 vm_paddr_t
1834 moea64_extract(pmap_t pm, vm_offset_t va)
1835 {
1836 	struct	pvo_entry *pvo;
1837 	vm_paddr_t pa;
1838 
1839 	PMAP_LOCK(pm);
1840 	pvo = moea64_pvo_find_va(pm, va);
1841 	if (pvo == NULL)
1842 		pa = 0;
1843 	else
1844 		pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
1845 	PMAP_UNLOCK(pm);
1846 
1847 	return (pa);
1848 }
1849 
1850 /*
1851  * Atomically extract and hold the physical page with the given
1852  * pmap and virtual address pair if that mapping permits the given
1853  * protection.
1854  */
1855 vm_page_t
1856 moea64_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1857 {
1858 	struct	pvo_entry *pvo;
1859 	vm_page_t m;
1860 
1861 	m = NULL;
1862 	PMAP_LOCK(pmap);
1863 	pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1864 	if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) {
1865 		m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
1866 		if (!vm_page_wire_mapped(m))
1867 			m = NULL;
1868 	}
1869 	PMAP_UNLOCK(pmap);
1870 	return (m);
1871 }
1872 
1873 static void *
1874 moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
1875     uint8_t *flags, int wait)
1876 {
1877 	struct pvo_entry *pvo;
1878 	vm_offset_t va;
1879 	vm_page_t m;
1880 	int needed_lock;
1881 
1882 	/*
1883 	 * This entire routine is a horrible hack to avoid bothering kmem
1884 	 * for new KVA addresses. Because this can get called from inside
1885 	 * kmem allocation routines, calling kmem for a new address here
1886 	 * can lead to multiply locking non-recursive mutexes.
1887 	 */
1888 
1889 	*flags = UMA_SLAB_PRIV;
1890 	needed_lock = !PMAP_LOCKED(kernel_pmap);
1891 
1892 	m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) |
1893 	    VM_ALLOC_WIRED);
1894 	if (m == NULL)
1895 		return (NULL);
1896 
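	/*
	 * Map the page at its physical address (an identity mapping), so no
	 * new KVA has to be allocated for it.
	 */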
1897 	va = VM_PAGE_TO_PHYS(m);
1898 
1899 	pvo = alloc_pvo_entry(1 /* bootstrap */);
1900 
1901 	pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE;
1902 	pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M;
1903 
1904 	if (needed_lock)
1905 		PMAP_LOCK(kernel_pmap);
1906 
1907 	init_pvo_entry(pvo, kernel_pmap, va);
1908 	pvo->pvo_vaddr |= PVO_WIRED;
1909 
1910 	moea64_pvo_enter(pvo, NULL, NULL);
1911 
1912 	if (needed_lock)
1913 		PMAP_UNLOCK(kernel_pmap);
1914 
1915 	return (void *)va;
1916 }
1917 
1918 extern int elf32_nxstack;
1919 
1920 void
1921 moea64_init(void)
1922 {
1923 
1924 	CTR0(KTR_PMAP, "moea64_init");
1925 
1926 	moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1927 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1928 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1929 
1930 	/* Are large page mappings enabled? */
1931 	superpages_enabled = 1;
1932 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1933 	if (superpages_enabled) {
1934 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1935 		    ("moea64_init: can't assign to pagesizes[1]"));
1936 
1937 		if (moea64_large_page_size == 0) {
1938 			printf("mmu_oea64: HW does not support large pages. "
1939 					"Disabling superpages...\n");
1940 			superpages_enabled = 0;
1941 		} else if (!moea64_has_lp_4k_16m) {
1942 			printf("mmu_oea64: "
1943 			    "HW does not support mixed 4KB/16MB page sizes. "
1944 			    "Disabling superpages...\n");
1945 			superpages_enabled = 0;
1946 		} else
1947 			pagesizes[1] = HPT_SP_SIZE;
1948 	}
1949 
1950 	if (!hw_direct_map) {
1951 		uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc);
1952 	}
1953 
1954 #ifdef COMPAT_FREEBSD32
1955 	elf32_nxstack = 1;
1956 #endif
1957 
1958 	moea64_initialized = true;
1959 }
1960 
1961 bool
1962 moea64_is_referenced(vm_page_t m)
1963 {
1964 
1965 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1966 	    ("moea64_is_referenced: page %p is not managed", m));
1967 
1968 	return (moea64_query_bit(m, LPTE_REF));
1969 }
1970 
1971 bool
1972 moea64_is_modified(vm_page_t m)
1973 {
1974 
1975 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1976 	    ("moea64_is_modified: page %p is not managed", m));
1977 
1978 	/*
1979 	 * If the page is not busied then this check is racy.
1980 	 */
1981 	if (!pmap_page_is_write_mapped(m))
1982 		return (false);
1983 
1984 	return (moea64_query_bit(m, LPTE_CHG));
1985 }
1986 
1987 bool
1988 moea64_is_prefaultable(pmap_t pmap, vm_offset_t va)
1989 {
1990 	struct pvo_entry *pvo;
1991 	bool rv = true;
1992 
1993 	PMAP_LOCK(pmap);
1994 	pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1995 	if (pvo != NULL)
1996 		rv = false;
1997 	PMAP_UNLOCK(pmap);
1998 	return (rv);
1999 }
2000 
2001 void
2002 moea64_clear_modify(vm_page_t m)
2003 {
2004 
2005 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2006 	    ("moea64_clear_modify: page %p is not managed", m));
2007 	vm_page_assert_busied(m);
2008 
2009 	if (!pmap_page_is_write_mapped(m))
2010 		return;
2011 	moea64_clear_bit(m, LPTE_CHG);
2012 }
2013 
2014 /*
2015  * Clear the write and modified bits in each of the given page's mappings.
2016  */
2017 void
2018 moea64_remove_write(vm_page_t m)
2019 {
2020 	struct	pvo_entry *pvo;
2021 	int64_t	refchg, ret;
2022 	pmap_t	pmap;
2023 
2024 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2025 	    ("moea64_remove_write: page %p is not managed", m));
2026 	vm_page_assert_busied(m);
2027 
2028 	if (!pmap_page_is_write_mapped(m))
2029 		return;
2030 
2031 	powerpc_sync();
2032 	PV_PAGE_WR_LOCK(m);
2033 	refchg = 0;
2034 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2035 		pmap = pvo->pvo_pmap;
2036 		PMAP_LOCK(pmap);
2037 		if (!(pvo->pvo_vaddr & PVO_DEAD) &&
2038 		    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2039 			if (PVO_IS_SP(pvo)) {
2040 				CTR1(KTR_PMAP, "%s: demote before remwr",
2041 				    __func__);
2042 				moea64_sp_demote(pvo);
2043 			}
2044 			pvo->pvo_pte.prot &= ~VM_PROT_WRITE;
2045 			ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
2046 			if (ret < 0)
2047 				ret = LPTE_CHG;
2048 			refchg |= ret;
2049 			if (pvo->pvo_pmap == kernel_pmap)
2050 				isync();
2051 		}
2052 		PMAP_UNLOCK(pmap);
2053 	}
2054 	if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG)
2055 		vm_page_dirty(m);
2056 	vm_page_aflag_clear(m, PGA_WRITEABLE);
2057 	PV_PAGE_UNLOCK(m);
2058 }
2059 
2060 /*
2061  *	moea64_ts_referenced:
2062  *
2063  *	Return a count of reference bits for a page, clearing those bits.
2064  *	It is not necessary for every reference bit to be cleared, but it
2065  *	is necessary that 0 only be returned when there are truly no
2066  *	reference bits set.
2067  *
2068  *	XXX: The exact number of bits to check and clear is a matter that
2069  *	should be tested and standardized at some point in the future for
2070  *	optimal aging of shared pages.
2071  */
2072 int
2073 moea64_ts_referenced(vm_page_t m)
2074 {
2075 
2076 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2077 	    ("moea64_ts_referenced: page %p is not managed", m));
2078 	return (moea64_clear_bit(m, LPTE_REF));
2079 }
2080 
2081 /*
2082  * Modify the WIMG settings of all mappings for a page.
2083  */
2084 void
2085 moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
2086 {
2087 	struct	pvo_entry *pvo;
2088 	int64_t	refchg;
2089 	pmap_t	pmap;
2090 	uint64_t lo;
2091 
2092 	CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x",
2093 	    __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma);
2094 
2095 	if (m->md.mdpg_cache_attrs == ma)
2096 		return;
2097 
2098 	if ((m->oflags & VPO_UNMANAGED) != 0) {
2099 		m->md.mdpg_cache_attrs = ma;
2100 		return;
2101 	}
2102 
2103 	lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
2104 
2105 	PV_PAGE_WR_LOCK(m);
2106 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2107 		pmap = pvo->pvo_pmap;
2108 		PMAP_LOCK(pmap);
2109 		if (!(pvo->pvo_vaddr & PVO_DEAD)) {
2110 			if (PVO_IS_SP(pvo)) {
2111 				CTR1(KTR_PMAP,
2112 				    "%s: demote before set_memattr", __func__);
2113 				moea64_sp_demote(pvo);
2114 			}
2115 			pvo->pvo_pte.pa &= ~LPTE_WIMG;
2116 			pvo->pvo_pte.pa |= lo;
2117 			refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
2118 			if (refchg < 0)
2119 				refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ?
2120 				    LPTE_CHG : 0;
2121 			if ((pvo->pvo_vaddr & PVO_MANAGED) &&
2122 			    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2123 				refchg |=
2124 				    atomic_readandclear_32(&m->md.mdpg_attrs);
2125 				if (refchg & LPTE_CHG)
2126 					vm_page_dirty(m);
2127 				if (refchg & LPTE_REF)
2128 					vm_page_aflag_set(m, PGA_REFERENCED);
2129 			}
2130 			if (pvo->pvo_pmap == kernel_pmap)
2131 				isync();
2132 		}
2133 		PMAP_UNLOCK(pmap);
2134 	}
2135 	m->md.mdpg_cache_attrs = ma;
2136 	PV_PAGE_UNLOCK(m);
2137 }
2138 
2139 /*
2140  * Map a wired page into kernel virtual address space.
2141  */
2142 void
2143 moea64_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
2144 {
2145 	int		error;
2146 	struct pvo_entry *pvo, *oldpvo;
2147 
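	/* Kernel mappings may not fail: wait for memory until a PVO is available. */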
2148 	do {
2149 		pvo = alloc_pvo_entry(0);
2150 		if (pvo == NULL)
2151 			vm_wait(NULL);
2152 	} while (pvo == NULL);
2153 	pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
2154 	pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma);
2155 	pvo->pvo_vaddr |= PVO_WIRED;
2156 
2157 	PMAP_LOCK(kernel_pmap);
2158 	oldpvo = moea64_pvo_find_va(kernel_pmap, va);
2159 	if (oldpvo != NULL)
2160 		moea64_pvo_remove_from_pmap(oldpvo);
2161 	init_pvo_entry(pvo, kernel_pmap, va);
2162 	error = moea64_pvo_enter(pvo, NULL, NULL);
2163 	PMAP_UNLOCK(kernel_pmap);
2164 
2165 	/* Free any dead pages */
2166 	if (oldpvo != NULL) {
2167 		moea64_pvo_remove_from_page(oldpvo);
2168 		free_pvo_entry(oldpvo);
2169 	}
2170 
2171 	if (error != 0)
2172 		panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va,
2173 		    (uintmax_t)pa, error);
2174 }
2175 
2176 void
2177 moea64_kenter(vm_offset_t va, vm_paddr_t pa)
2178 {
2179 
2180 	moea64_kenter_attr(va, pa, VM_MEMATTR_DEFAULT);
2181 }
2182 
2183 /*
2184  * Extract the physical page address associated with the given kernel virtual
2185  * address.
2186  */
2187 vm_paddr_t
2188 moea64_kextract(vm_offset_t va)
2189 {
2190 	struct		pvo_entry *pvo;
2191 	vm_paddr_t pa;
2192 
2193 	/*
2194 	 * Shortcut the direct-mapped case when applicable.  We never put
2195 	 * anything but 1:1 (or 62-bit aliased) mappings below
2196 	 * VM_MIN_KERNEL_ADDRESS.
2197 	 */
2198 	if (va < VM_MIN_KERNEL_ADDRESS)
2199 		return (va & ~DMAP_BASE_ADDRESS);
2200 
2201 	PMAP_LOCK(kernel_pmap);
2202 	pvo = moea64_pvo_find_va(kernel_pmap, va);
2203 	KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR,
2204 	    va));
2205 	pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
2206 	PMAP_UNLOCK(kernel_pmap);
2207 	return (pa);
2208 }
2209 
2210 /*
2211  * Remove a wired page from kernel virtual address space.
2212  */
2213 void
2214 moea64_kremove(vm_offset_t va)
2215 {
2216 	moea64_remove(kernel_pmap, va, va + PAGE_SIZE);
2217 }
2218 
2219 /*
2220  * Provide a kernel pointer corresponding to a given userland pointer.
2221  * The returned pointer is valid until the next time this function is
2222  * called in this thread. This is used internally in copyin/copyout.
2223  */
2224 static int
2225 moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr,
2226     void **kaddr, size_t ulen, size_t *klen)
2227 {
2228 	size_t l;
2229 #ifdef __powerpc64__
2230 	struct slb *slb;
2231 #endif
2232 	register_t slbv;
2233 
2234 	*kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK);
2235 	l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr);
2236 	if (l > ulen)
2237 		l = ulen;
2238 	if (klen)
2239 		*klen = l;
2240 	else if (l != ulen)
2241 		return (EFAULT);
2242 
2243 #ifdef __powerpc64__
2244 	/* Try lockless look-up first */
2245 	slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr);
2246 
2247 	if (slb == NULL) {
2248 		/* If it isn't there, we need to pre-fault the VSID */
2249 		PMAP_LOCK(pm);
2250 		slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT;
2251 		PMAP_UNLOCK(pm);
2252 	} else {
2253 		slbv = slb->slbv;
2254 	}
2255 
2256 	/* Mark segment no-execute */
2257 	slbv |= SLBV_N;
2258 #else
2259 	slbv = va_to_vsid(pm, (vm_offset_t)uaddr);
2260 
2261 	/* Mark segment no-execute */
2262 	slbv |= SR_N;
2263 #endif
2264 
2265 	/* If we have already set this VSID, we can just return */
2266 	if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv)
2267 		return (0);
2268 
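	/*
	 * Record the user segment and VSID in the PCB, then rewrite the
	 * reserved user SLB/SR slot so that USER_ADDR aliases the segment
	 * containing 'uaddr'.
	 */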
2269 	__asm __volatile("isync");
2270 	curthread->td_pcb->pcb_cpu.aim.usr_segm =
2271 	    (uintptr_t)uaddr >> ADDR_SR_SHFT;
2272 	curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv;
2273 #ifdef __powerpc64__
2274 	__asm __volatile ("slbie %0; slbmte %1, %2; isync" ::
2275 	    "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE));
2276 #else
2277 	__asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv));
2278 #endif
2279 
2280 	return (0);
2281 }
2282 
2283 /*
2284  * Figure out where a given kernel pointer (usually in a fault) points
2285  * to from the VM's perspective, potentially remapping into userland's
2286  * address space.
2287  */
2288 static int
2289 moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user,
2290     vm_offset_t *decoded_addr)
2291 {
2292 	vm_offset_t user_sr;
2293 
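	/*
	 * Addresses within the reserved user segment are translated back to
	 * the original user VA recorded by moea64_map_user_ptr().
	 */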
2294 	if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) {
2295 		user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm;
2296 		addr &= ADDR_PIDX | ADDR_POFF;
2297 		addr |= user_sr << ADDR_SR_SHFT;
2298 		*decoded_addr = addr;
2299 		*is_user = 1;
2300 	} else {
2301 		*decoded_addr = addr;
2302 		*is_user = 0;
2303 	}
2304 
2305 	return (0);
2306 }
2307 
2308 /*
2309  * Map a range of physical addresses into kernel virtual address space.
2310  *
2311  * The value passed in *virt is a suggested virtual address for the mapping.
2312  * Architectures which can support a direct-mapped physical to virtual region
2313  * can return the appropriate address within that region, leaving '*virt'
2314  * unchanged.  Other architectures should map the pages starting at '*virt' and
2315  * update '*virt' with the first usable address after the mapped region.
2316  */
2317 vm_offset_t
2318 moea64_map(vm_offset_t *virt, vm_paddr_t pa_start,
2319     vm_paddr_t pa_end, int prot)
2320 {
2321 	vm_offset_t	sva, va;
2322 
2323 	if (hw_direct_map) {
2324 		/*
2325 		 * Check if every page in the region is covered by the direct
2326 		 * map. The direct map covers all of physical memory. Use
2327 		 * moea64_calc_wimg() as a shortcut to see if the page is in
2328 		 * physical memory as a way to see if the direct map covers it.
2329 		 */
2330 		for (va = pa_start; va < pa_end; va += PAGE_SIZE)
2331 			if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M)
2332 				break;
2333 		if (va == pa_end)
2334 			return (PHYS_TO_DMAP(pa_start));
2335 	}
2336 	sva = *virt;
2337 	va = sva;
2338 	/* XXX respect prot argument */
2339 	for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
2340 		moea64_kenter(va, pa_start);
2341 	*virt = va;
2342 
2343 	return (sva);
2344 }
2345 
2346 /*
2347  * Returns true if the pmap's pv is one of the first
2348  * 16 pvs linked to from this page.  This count may
2349  * be changed upwards or downwards in the future; it
2350  * is only necessary that true be returned for a small
2351  * subset of pmaps for proper page aging.
2352  */
2353 bool
2354 moea64_page_exists_quick(pmap_t pmap, vm_page_t m)
2355 {
2356 	int loops;
2357 	struct pvo_entry *pvo;
2358 	bool rv;
2359 
2360 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2361 	    ("moea64_page_exists_quick: page %p is not managed", m));
2362 	loops = 0;
2363 	rv = false;
2364 	PV_PAGE_RD_LOCK(m);
2365 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2366 		if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) {
2367 			rv = true;
2368 			break;
2369 		}
2370 		if (++loops >= 16)
2371 			break;
2372 	}
2373 	PV_PAGE_UNLOCK(m);
2374 	return (rv);
2375 }
2376 
2377 void
2378 moea64_page_init(vm_page_t m)
2379 {
2380 
2381 	m->md.mdpg_attrs = 0;
2382 	m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
2383 	LIST_INIT(&m->md.mdpg_pvoh);
2384 }
2385 
2386 /*
2387  * Return the number of managed mappings to the given physical page
2388  * that are wired.
2389  */
2390 int
2391 moea64_page_wired_mappings(vm_page_t m)
2392 {
2393 	struct pvo_entry *pvo;
2394 	int count;
2395 
2396 	count = 0;
2397 	if ((m->oflags & VPO_UNMANAGED) != 0)
2398 		return (count);
2399 	PV_PAGE_RD_LOCK(m);
2400 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
2401 		if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED)
2402 			count++;
2403 	PV_PAGE_UNLOCK(m);
2404 	return (count);
2405 }
2406 
2407 static uintptr_t	moea64_vsidcontext;
2408 
2409 uintptr_t
2410 moea64_get_unique_vsid(void) {
2411 	u_int entropy;
2412 	register_t hash;
2413 	uint32_t mask;
2414 	int i;
2415 
2416 	entropy = 0;
2417 	__asm __volatile("mftb %0" : "=r"(entropy));
2418 
2419 	mtx_lock(&moea64_slb_mutex);
2420 	for (i = 0; i < NVSIDS; i += VSID_NBPW) {
2421 		u_int	n;
2422 
2423 		/*
2424 		 * Create a new value by multiplying by a prime and adding in
2425 		 * entropy from the timebase register.  This is to make the
2426 		 * VSID more random so that the PT hash function collides
2427 		 * less often.  (Note that the prime causes gcc to do shifts
2428 		 * instead of a multiply.)
2429 		 */
2430 		moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy;
2431 		hash = moea64_vsidcontext & (NVSIDS - 1);
2432 		if (hash == 0)		/* 0 is special, avoid it */
2433 			continue;
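		/*
		 * The VSID bitmap is an array of 32-bit words: n selects the
		 * word (hence the shift by 5) and mask selects the bit within
		 * it, with VSID_NBPW candidates per word.
		 */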
2434 		n = hash >> 5;
2435 		mask = 1 << (hash & (VSID_NBPW - 1));
2436 		hash = (moea64_vsidcontext & VSID_HASHMASK);
2437 		if (moea64_vsid_bitmap[n] & mask) {	/* collision? */
2438 			/* anything free in this bucket? */
2439 			if (moea64_vsid_bitmap[n] == 0xffffffff) {
2440 				entropy = (moea64_vsidcontext >> 20);
2441 				continue;
2442 			}
2443 			i = ffs(~moea64_vsid_bitmap[n]) - 1;
2444 			mask = 1 << i;
2445 			hash &= rounddown2(VSID_HASHMASK, VSID_NBPW);
2446 			hash |= i;
2447 		}
2448 		if (hash == VSID_VRMA)	/* also special, avoid this too */
2449 			continue;
2450 		KASSERT(!(moea64_vsid_bitmap[n] & mask),
2451 		    ("Allocating in-use VSID %#zx\n", hash));
2452 		moea64_vsid_bitmap[n] |= mask;
2453 		mtx_unlock(&moea64_slb_mutex);
2454 		return (hash);
2455 	}
2456 
2457 	mtx_unlock(&moea64_slb_mutex);
2458 	panic("%s: out of segments", __func__);
2459 }
2460 
2461 #ifdef __powerpc64__
2462 int
2463 moea64_pinit(pmap_t pmap)
2464 {
2465 
2466 	RB_INIT(&pmap->pmap_pvo);
2467 
2468 	pmap->pm_slb_tree_root = slb_alloc_tree();
2469 	pmap->pm_slb = slb_alloc_user_cache();
2470 	pmap->pm_slb_len = 0;
2471 
2472 	return (1);
2473 }
2474 #else
2475 int
2476 moea64_pinit(pmap_t pmap)
2477 {
2478 	int	i;
2479 	uint32_t hash;
2480 
2481 	RB_INIT(&pmap->pmap_pvo);
2482 
2483 	if (pmap_bootstrapped)
2484 		pmap->pmap_phys = (pmap_t)moea64_kextract((vm_offset_t)pmap);
2485 	else
2486 		pmap->pmap_phys = pmap;
2487 
2488 	/*
2489 	 * Allocate some segment registers for this pmap.
2490 	 */
2491 	hash = moea64_get_unique_vsid();
2492 
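	/* Give each of the 16 segment registers a VSID derived from the hash. */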
2493 	for (i = 0; i < 16; i++)
2494 		pmap->pm_sr[i] = VSID_MAKE(i, hash);
2495 
2496 	KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0"));
2497 
2498 	return (1);
2499 }
2500 #endif
2501 
2502 /*
2503  * Initialize the pmap associated with process 0.
2504  */
2505 void
2506 moea64_pinit0(pmap_t pm)
2507 {
2508 
2509 	PMAP_LOCK_INIT(pm);
2510 	moea64_pinit(pm);
2511 	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
2512 }
2513 
2514 /*
2515  * Set the physical protection on the specified range of this map as requested.
2516  */
2517 static void
2518 moea64_pvo_protect(pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot)
2519 {
2520 	struct vm_page *pg;
2521 	vm_prot_t oldprot;
2522 	int32_t refchg;
2523 
2524 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
2525 
2526 	/*
2527 	 * Change the protection of the page.
2528 	 */
2529 	oldprot = pvo->pvo_pte.prot;
2530 	pvo->pvo_pte.prot = prot;
2531 	pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2532 
2533 	/*
2534 	 * If the PVO is in the page table, update mapping
2535 	 */
2536 	refchg = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
2537 	if (refchg < 0)
2538 		refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0;
2539 
2540 	if (pm != kernel_pmap && pg != NULL &&
2541 	    (pg->a.flags & PGA_EXECUTABLE) == 0 &&
2542 	    (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
2543 		if ((pg->oflags & VPO_UNMANAGED) == 0)
2544 			vm_page_aflag_set(pg, PGA_EXECUTABLE);
2545 		moea64_syncicache(pm, PVO_VADDR(pvo),
2546 		    PVO_PADDR(pvo), PAGE_SIZE);
2547 	}
2548 
2549 	/*
2550 	 * Update vm about the REF/CHG bits if the page is managed and we have
2551 	 * removed write access.
2552 	 */
2553 	if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) &&
2554 	    (oldprot & VM_PROT_WRITE)) {
2555 		refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2556 		if (refchg & LPTE_CHG)
2557 			vm_page_dirty(pg);
2558 		if (refchg & LPTE_REF)
2559 			vm_page_aflag_set(pg, PGA_REFERENCED);
2560 	}
2561 }
2562 
2563 void
2564 moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
2565     vm_prot_t prot)
2566 {
2567 	struct	pvo_entry *pvo, key;
2568 
2569 	CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
2570 	    sva, eva, prot);
2571 
2572 	KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
2573 	    ("moea64_protect: non current pmap"));
2574 
2575 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2576 		moea64_remove(pm, sva, eva);
2577 		return;
2578 	}
2579 
2580 	PMAP_LOCK(pm);
2581 	key.pvo_vaddr = sva;
2582 	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2583 	    pvo != NULL && PVO_VADDR(pvo) < eva;
2584 	    pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
2585 		if (PVO_IS_SP(pvo)) {
2586 			if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
2587 				pvo = moea64_sp_protect(pvo, prot);
2588 				continue;
2589 			} else {
2590 				CTR1(KTR_PMAP, "%s: demote before protect",
2591 				    __func__);
2592 				moea64_sp_demote(pvo);
2593 			}
2594 		}
2595 		moea64_pvo_protect(pm, pvo, prot);
2596 	}
2597 	PMAP_UNLOCK(pm);
2598 }
2599 
2600 /*
2601  * Map a list of wired pages into kernel virtual address space.  This is
2602  * intended for temporary mappings which do not need page modification or
2603  * references recorded.  Existing mappings in the region are overwritten.
2604  */
2605 void
2606 moea64_qenter(vm_offset_t va, vm_page_t *m, int count)
2607 {
2608 	while (count-- > 0) {
2609 		moea64_kenter(va, VM_PAGE_TO_PHYS(*m));
2610 		va += PAGE_SIZE;
2611 		m++;
2612 	}
2613 }
2614 
2615 /*
2616  * Remove page mappings from kernel virtual address space.  Intended for
2617  * temporary mappings entered by moea64_qenter.
2618  */
2619 void
2620 moea64_qremove(vm_offset_t va, int count)
2621 {
2622 	while (count-- > 0) {
2623 		moea64_kremove(va);
2624 		va += PAGE_SIZE;
2625 	}
2626 }
2627 
2628 void
2629 moea64_release_vsid(uint64_t vsid)
2630 {
2631 	int idx, mask;
2632 
2633 	mtx_lock(&moea64_slb_mutex);
2634 	idx = vsid & (NVSIDS-1);
2635 	mask = 1 << (idx % VSID_NBPW);
2636 	idx /= VSID_NBPW;
2637 	KASSERT(moea64_vsid_bitmap[idx] & mask,
2638 	    ("Freeing unallocated VSID %#jx", vsid));
2639 	moea64_vsid_bitmap[idx] &= ~mask;
2640 	mtx_unlock(&moea64_slb_mutex);
2641 }
2642 
2643 void
2644 moea64_release(pmap_t pmap)
2645 {
2646 
2647 	/*
2648 	 * Free segment registers' VSIDs
2649 	 */
2650     #ifdef __powerpc64__
2651 	slb_free_tree(pmap);
2652 	slb_free_user_cache(pmap->pm_slb);
2653     #else
2654 	KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0"));
2655 
2656 	moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0]));
2657     #endif
2658 }
2659 
2660 /*
2661  * Remove all pages mapped by the specified pmap
2662  */
2663 void
2664 moea64_remove_pages(pmap_t pm)
2665 {
2666 	struct pvo_entry *pvo, *tpvo;
2667 	struct pvo_dlist tofree;
2668 
2669 	SLIST_INIT(&tofree);
2670 
2671 	PMAP_LOCK(pm);
2672 	RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) {
2673 		if (pvo->pvo_vaddr & PVO_WIRED)
2674 			continue;
2675 
2676 		/*
2677 		 * For locking reasons, remove this from the page table and
2678 		 * pmap, but save delinking from the vm_page for a second
2679 		 * pass
2680 		 */
2681 		moea64_pvo_remove_from_pmap(pvo);
2682 		SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink);
2683 	}
2684 	PMAP_UNLOCK(pm);
2685 
2686 	while (!SLIST_EMPTY(&tofree)) {
2687 		pvo = SLIST_FIRST(&tofree);
2688 		SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
2689 		moea64_pvo_remove_from_page(pvo);
2690 		free_pvo_entry(pvo);
2691 	}
2692 }
2693 
2694 static void
2695 moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
2696     struct pvo_dlist *tofree)
2697 {
2698 	struct pvo_entry *pvo, *tpvo, key;
2699 
2700 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
2701 
2702 	key.pvo_vaddr = sva;
2703 	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2704 	    pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2705 		if (PVO_IS_SP(pvo)) {
2706 			if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
2707 				tpvo = moea64_sp_remove(pvo, tofree);
2708 				continue;
2709 			} else {
2710 				CTR1(KTR_PMAP, "%s: demote before remove",
2711 				    __func__);
2712 				moea64_sp_demote(pvo);
2713 			}
2714 		}
2715 		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2716 
2717 		/*
2718 		 * For locking reasons, remove this from the page table and
2719 		 * pmap, but save delinking from the vm_page for a second
2720 		 * pass
2721 		 */
2722 		moea64_pvo_remove_from_pmap(pvo);
2723 		SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
2724 	}
2725 }
2726 
2727 /*
2728  * Remove the given range of addresses from the specified map.
2729  */
2730 void
2731 moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
2732 {
2733 	struct pvo_entry *pvo;
2734 	struct pvo_dlist tofree;
2735 
2736 	/*
2737 	 * Perform an unsynchronized read.  This is, however, safe.
2738 	 */
2739 	if (pm->pm_stats.resident_count == 0)
2740 		return;
2741 
2742 	SLIST_INIT(&tofree);
2743 	PMAP_LOCK(pm);
2744 	moea64_remove_locked(pm, sva, eva, &tofree);
2745 	PMAP_UNLOCK(pm);
2746 
2747 	while (!SLIST_EMPTY(&tofree)) {
2748 		pvo = SLIST_FIRST(&tofree);
2749 		SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
2750 		moea64_pvo_remove_from_page(pvo);
2751 		free_pvo_entry(pvo);
2752 	}
2753 }
2754 
2755 /*
2756  * Remove physical page from all pmaps in which it resides.
2757  * moea64_pvo_remove_from_pmap() reflects PTE changes back to the vm_page.
2758  */
2759 void
2760 moea64_remove_all(vm_page_t m)
2761 {
2762 	struct	pvo_entry *pvo, *next_pvo;
2763 	struct	pvo_head freequeue;
2764 	int	wasdead;
2765 	pmap_t	pmap;
2766 
2767 	LIST_INIT(&freequeue);
2768 
2769 	PV_PAGE_WR_LOCK(m);
2770 	LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) {
2771 		pmap = pvo->pvo_pmap;
2772 		PMAP_LOCK(pmap);
2773 		wasdead = (pvo->pvo_vaddr & PVO_DEAD);
2774 		if (!wasdead) {
2775 			if (PVO_IS_SP(pvo)) {
2776 				CTR1(KTR_PMAP, "%s: demote before remove_all",
2777 				    __func__);
2778 				moea64_sp_demote(pvo);
2779 			}
2780 			moea64_pvo_remove_from_pmap(pvo);
2781 		}
2782 		moea64_pvo_remove_from_page_locked(pvo, m);
2783 		if (!wasdead)
2784 			LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink);
2785 		PMAP_UNLOCK(pmap);
2786 
2787 	}
2788 	KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings"));
2789 	KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable"));
2790 	PV_PAGE_UNLOCK(m);
2791 
2792 	/* Clean up UMA allocations */
2793 	LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo)
2794 		free_pvo_entry(pvo);
2795 }
2796 
2797 /*
2798  * Allocate a physical page of memory directly from the phys_avail map.
2799  * Can only be called from moea64_bootstrap before avail start and end are
2800  * calculated.
2801  */
2802 vm_offset_t
2803 moea64_bootstrap_alloc(vm_size_t size, vm_size_t align)
2804 {
2805 	vm_offset_t	s, e;
2806 	int		i, j;
2807 
2808 	size = round_page(size);
2809 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2810 		if (align != 0)
2811 			s = roundup2(phys_avail[i], align);
2812 		else
2813 			s = phys_avail[i];
2814 		e = s + size;
2815 
2816 		if (s < phys_avail[i] || e > phys_avail[i + 1])
2817 			continue;
2818 
2819 		if (s + size > platform_real_maxaddr())
2820 			continue;
2821 
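		/*
		 * Carve the allocation out of phys_avail[]: trim the region
		 * at its start or end when possible, otherwise split it in
		 * two and shift the later entries up to make room.
		 */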
2822 		if (s == phys_avail[i]) {
2823 			phys_avail[i] += size;
2824 		} else if (e == phys_avail[i + 1]) {
2825 			phys_avail[i + 1] -= size;
2826 		} else {
2827 			for (j = phys_avail_count * 2; j > i; j -= 2) {
2828 				phys_avail[j] = phys_avail[j - 2];
2829 				phys_avail[j + 1] = phys_avail[j - 1];
2830 			}
2831 
2832 			phys_avail[i + 3] = phys_avail[i + 1];
2833 			phys_avail[i + 1] = s;
2834 			phys_avail[i + 2] = e;
2835 			phys_avail_count++;
2836 		}
2837 
2838 		return (s);
2839 	}
2840 	panic("moea64_bootstrap_alloc: could not allocate memory");
2841 }
2842 
2843 static int
2844 moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head,
2845     struct pvo_entry **oldpvop)
2846 {
2847 	struct pvo_entry *old_pvo;
2848 	int err;
2849 
2850 	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2851 
2852 	STAT_MOEA64(moea64_pvo_enter_calls++);
2853 
2854 	/*
2855 	 * Add to pmap list
2856 	 */
2857 	old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2858 
2859 	if (old_pvo != NULL) {
2860 		if (oldpvop != NULL)
2861 			*oldpvop = old_pvo;
2862 		return (EEXIST);
2863 	}
2864 
2865 	if (pvo_head != NULL) {
2866 		LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
2867 	}
2868 
2869 	if (pvo->pvo_vaddr & PVO_WIRED)
2870 		pvo->pvo_pmap->pm_stats.wired_count++;
2871 	pvo->pvo_pmap->pm_stats.resident_count++;
2872 
2873 	/*
2874 	 * Insert it into the hardware page table
2875 	 */
2876 	err = moea64_pte_insert(pvo);
2877 	if (err != 0) {
2878 		panic("moea64_pvo_enter: overflow");
2879 	}
2880 
2881 	STAT_MOEA64(moea64_pvo_entries++);
2882 
2883 	if (pvo->pvo_pmap == kernel_pmap)
2884 		isync();
2885 
2886 #ifdef __powerpc64__
2887 	/*
2888 	 * Make sure all our bootstrap mappings are in the SLB as soon
2889 	 * as virtual memory is switched on.
2890 	 */
2891 	if (!pmap_bootstrapped)
2892 		moea64_bootstrap_slb_prefault(PVO_VADDR(pvo),
2893 		    pvo->pvo_vaddr & PVO_LARGE);
2894 #endif
2895 
2896 	return (0);
2897 }
2898 
2899 static void
2900 moea64_pvo_remove_from_pmap(struct pvo_entry *pvo)
2901 {
2902 	struct	vm_page *pg;
2903 	int32_t refchg;
2904 
2905 	KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap"));
2906 	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2907 	KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO"));
2908 
2909 	/*
2910 	 * If there is an active pte entry, we need to deactivate it
2911 	 */
2912 	refchg = moea64_pte_unset(pvo);
2913 	if (refchg < 0) {
2914 		/*
2915 		 * If it was evicted from the page table, be pessimistic and
2916 		 * dirty the page.
2917 		 */
2918 		if (pvo->pvo_pte.prot & VM_PROT_WRITE)
2919 			refchg = LPTE_CHG;
2920 		else
2921 			refchg = 0;
2922 	}
2923 
2924 	/*
2925 	 * Update our statistics.
2926 	 */
2927 	pvo->pvo_pmap->pm_stats.resident_count--;
2928 	if (pvo->pvo_vaddr & PVO_WIRED)
2929 		pvo->pvo_pmap->pm_stats.wired_count--;
2930 
2931 	/*
2932 	 * Remove this PVO from the pmap list.
2933 	 */
2934 	RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2935 
2936 	/*
2937 	 * Mark this for the next sweep
2938 	 */
2939 	pvo->pvo_vaddr |= PVO_DEAD;
2940 
2941 	/* Send RC bits to VM */
2942 	if ((pvo->pvo_vaddr & PVO_MANAGED) &&
2943 	    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2944 		pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2945 		if (pg != NULL) {
2946 			refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2947 			if (refchg & LPTE_CHG)
2948 				vm_page_dirty(pg);
2949 			if (refchg & LPTE_REF)
2950 				vm_page_aflag_set(pg, PGA_REFERENCED);
2951 		}
2952 	}
2953 }
2954 
2955 static inline void
2956 moea64_pvo_remove_from_page_locked(struct pvo_entry *pvo,
2957     vm_page_t m)
2958 {
2959 
2960 	KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page"));
2961 
2962 	/* Use NULL pmaps as a sentinel for races in page deletion */
2963 	if (pvo->pvo_pmap == NULL)
2964 		return;
2965 	pvo->pvo_pmap = NULL;
2966 
2967 	/*
2968 	 * Update vm about page writeability/executability if managed
2969 	 */
2970 	PV_LOCKASSERT(PVO_PADDR(pvo));
2971 	if (pvo->pvo_vaddr & PVO_MANAGED) {
2972 		if (m != NULL) {
2973 			LIST_REMOVE(pvo, pvo_vlink);
2974 			if (LIST_EMPTY(vm_page_to_pvoh(m)))
2975 				vm_page_aflag_clear(m,
2976 				    PGA_WRITEABLE | PGA_EXECUTABLE);
2977 		}
2978 	}
2979 
2980 	STAT_MOEA64(moea64_pvo_entries--);
2981 	STAT_MOEA64(moea64_pvo_remove_calls++);
2982 }
2983 
2984 static void
2985 moea64_pvo_remove_from_page(struct pvo_entry *pvo)
2986 {
2987 	vm_page_t pg = NULL;
2988 
2989 	if (pvo->pvo_vaddr & PVO_MANAGED)
2990 		pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2991 
2992 	PV_WR_LOCK(PVO_PADDR(pvo));
2993 	moea64_pvo_remove_from_page_locked(pvo, pg);
2994 	PV_UNLOCK(PVO_PADDR(pvo));
2995 }
2996 
2997 static struct pvo_entry *
2998 moea64_pvo_find_va(pmap_t pm, vm_offset_t va)
2999 {
3000 	struct pvo_entry key;
3001 
3002 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
3003 
3004 	key.pvo_vaddr = va & ~ADDR_POFF;
3005 	return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key));
3006 }
3007 
3008 static bool
3009 moea64_query_bit(vm_page_t m, uint64_t ptebit)
3010 {
3011 	struct	pvo_entry *pvo;
3012 	int64_t ret;
3013 	bool rv;
3014 	vm_page_t sp;
3015 
3016 	/*
3017 	 * See if this bit is stored in the page already.
3018 	 *
3019 	 * For superpages, the bit is stored in the first vm page.
3020 	 */
3021 	if ((m->md.mdpg_attrs & ptebit) != 0 ||
3022 	    ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK)) != NULL &&
3023 	     (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) ==
3024 	     (ptebit | MDPG_ATTR_SP)))
3025 		return (true);
3026 
3027 	/*
3028 	 * Examine each PTE.  Sync so that any pending REF/CHG bits are
3029 	 * flushed to the PTEs.
3030 	 */
3031 	rv = false;
3032 	powerpc_sync();
3033 	PV_PAGE_RD_LOCK(m);
3034 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
3035 		if (PVO_IS_SP(pvo)) {
3036 			ret = moea64_sp_query(pvo, ptebit);
3037 			/*
3038 			 * If SP was not demoted, check its REF/CHG bits here.
3039 			 */
3040 			if (ret != -1) {
3041 				if ((ret & ptebit) != 0) {
3042 					rv = true;
3043 					break;
3044 				}
3045 				continue;
3046 			}
3047 			/* else, fallthrough */
3048 		}
3049 
3050 		ret = 0;
3051 
3052 		/*
3053 		 * See if this pvo has a valid PTE.  if so, fetch the
3054 		 * REF/CHG bits from the valid PTE.  If the appropriate
3055 		 * ptebit is set, return success.
3056 		 */
3057 		PMAP_LOCK(pvo->pvo_pmap);
3058 		if (!(pvo->pvo_vaddr & PVO_DEAD))
3059 			ret = moea64_pte_synch(pvo);
3060 		PMAP_UNLOCK(pvo->pvo_pmap);
3061 
3062 		if (ret > 0) {
3063 			atomic_set_32(&m->md.mdpg_attrs,
3064 			    ret & (LPTE_CHG | LPTE_REF));
3065 			if (ret & ptebit) {
3066 				rv = true;
3067 				break;
3068 			}
3069 		}
3070 	}
3071 	PV_PAGE_UNLOCK(m);
3072 
3073 	return (rv);
3074 }
3075 
3076 static u_int
3077 moea64_clear_bit(vm_page_t m, u_int64_t ptebit)
3078 {
3079 	u_int	count;
3080 	struct	pvo_entry *pvo;
3081 	int64_t ret;
3082 
3083 	/*
3084 	 * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
3085 	 * we can reset the right ones).
3086 	 */
3087 	powerpc_sync();
3088 
3089 	/*
3090 	 * For each pvo entry, clear the pte's ptebit.
3091 	 */
3092 	count = 0;
3093 	PV_PAGE_WR_LOCK(m);
3094 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
3095 		if (PVO_IS_SP(pvo)) {
3096 			if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) {
3097 				count += ret;
3098 				continue;
3099 			}
3100 		}
3101 		ret = 0;
3102 
3103 		PMAP_LOCK(pvo->pvo_pmap);
3104 		if (!(pvo->pvo_vaddr & PVO_DEAD))
3105 			ret = moea64_pte_clear(pvo, ptebit);
3106 		PMAP_UNLOCK(pvo->pvo_pmap);
3107 
3108 		if (ret > 0 && (ret & ptebit))
3109 			count++;
3110 	}
3111 	atomic_clear_32(&m->md.mdpg_attrs, ptebit);
3112 	PV_PAGE_UNLOCK(m);
3113 
3114 	return (count);
3115 }
3116 
3117 int
3118 moea64_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
3119 {
3120 	struct pvo_entry *pvo, key;
3121 	vm_offset_t ppa;
3122 	int error = 0;
3123 
3124 	if (hw_direct_map && mem_valid(pa, size) == 0)
3125 		return (0);
3126 
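	/*
	 * Otherwise, walk the kernel PVOs starting at DMAP_BASE_ADDRESS + pa
	 * and require every page of the range to be mapped 1:1 there.
	 */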
3127 	PMAP_LOCK(kernel_pmap);
3128 	ppa = pa & ~ADDR_POFF;
3129 	key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa;
3130 	for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key);
3131 	    ppa < pa + size; ppa += PAGE_SIZE,
3132 	    pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) {
3133 		if (pvo == NULL || PVO_PADDR(pvo) != ppa) {
3134 			error = EFAULT;
3135 			break;
3136 		}
3137 	}
3138 	PMAP_UNLOCK(kernel_pmap);
3139 
3140 	return (error);
3141 }
3142 
3143 /*
3144  * Map a set of physical memory pages into the kernel virtual
3145  * address space. Return a pointer to where it is mapped. This
3146  * routine is intended to be used for mapping device memory,
3147  * NOT real memory.
3148  */
3149 void *
3150 moea64_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
3151 {
3152 	vm_offset_t va, tmpva, ppa, offset;
3153 
3154 	ppa = trunc_page(pa);
3155 	offset = pa & PAGE_MASK;
3156 	size = roundup2(offset + size, PAGE_SIZE);
3157 
3158 	va = kva_alloc(size);
3159 
3160 	if (!va)
3161 		panic("moea64_mapdev: Couldn't alloc kernel virtual memory");
3162 
3163 	for (tmpva = va; size > 0;) {
3164 		moea64_kenter_attr(tmpva, ppa, ma);
3165 		size -= PAGE_SIZE;
3166 		tmpva += PAGE_SIZE;
3167 		ppa += PAGE_SIZE;
3168 	}
3169 
3170 	return ((void *)(va + offset));
3171 }
3172 
3173 void *
3174 moea64_mapdev(vm_paddr_t pa, vm_size_t size)
3175 {
3176 
3177 	return moea64_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT);
3178 }
3179 
3180 void
3181 moea64_unmapdev(void *p, vm_size_t size)
3182 {
3183 	vm_offset_t base, offset, va;
3184 
3185 	va = (vm_offset_t)p;
3186 	base = trunc_page(va);
3187 	offset = va & PAGE_MASK;
3188 	size = roundup2(offset + size, PAGE_SIZE);
3189 
3190 	moea64_qremove(base, atop(size));
3191 	kva_free(base, size);
3192 }
3193 
3194 void
3195 moea64_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
3196 {
3197 	struct pvo_entry *pvo;
3198 	vm_offset_t lim;
3199 	vm_paddr_t pa;
3200 	vm_size_t len;
3201 
3202 	if (__predict_false(pm == NULL))
3203 		pm = &curthread->td_proc->p_vmspace->vm_pmap;
3204 
3205 	PMAP_LOCK(pm);
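	/*
	 * Walk the range one page at a time, since each page may be backed
	 * by a different mapping (or none at all).
	 */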
3206 	while (sz > 0) {
3207 		lim = round_page(va+1);
3208 		len = MIN(lim - va, sz);
3209 		pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
3210 		if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) {
3211 			pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
3212 			moea64_syncicache(pm, va, pa, len);
3213 		}
3214 		va += len;
3215 		sz -= len;
3216 	}
3217 	PMAP_UNLOCK(pm);
3218 }
3219 
3220 void
3221 moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va)
3222 {
3223 
3224 	*va = (void *)(uintptr_t)pa;
3225 }
3226 
3227 extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];
3228 
3229 void
3230 moea64_scan_init(void)
3231 {
3232 	struct pvo_entry *pvo;
3233 	vm_offset_t va;
3234 	int i;
3235 
3236 	if (!do_minidump) {
3237 		/* Initialize phys. segments for dumpsys(). */
3238 		memset(&dump_map, 0, sizeof(dump_map));
3239 		mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
3240 		for (i = 0; i < pregions_sz; i++) {
3241 			dump_map[i].pa_start = pregions[i].mr_start;
3242 			dump_map[i].pa_size = pregions[i].mr_size;
3243 		}
3244 		return;
3245 	}
3246 
3247 	/* Virtual segments for minidumps: */
3248 	memset(&dump_map, 0, sizeof(dump_map));
3249 
3250 	/* 1st: kernel .data and .bss. */
3251 	dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
3252 	dump_map[0].pa_size = round_page((uintptr_t)_end) -
3253 	    dump_map[0].pa_start;
3254 
3255 	/* 2nd: msgbuf and tables (see pmap_bootstrap()). */
3256 	dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr;
3257 	dump_map[1].pa_size = round_page(msgbufp->msg_size);
3258 
3259 	/* 3rd: kernel VM. */
3260 	va = dump_map[1].pa_start + dump_map[1].pa_size;
3261 	/* Find start of next chunk (from va). */
3262 	while (va < virtual_end) {
3263 		/* Don't dump the buffer cache. */
3264 		if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
3265 			va = kmi.buffer_eva;
3266 			continue;
3267 		}
3268 		pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
3269 		if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD))
3270 			break;
3271 		va += PAGE_SIZE;
3272 	}
3273 	if (va < virtual_end) {
3274 		dump_map[2].pa_start = va;
3275 		va += PAGE_SIZE;
3276 		/* Find last page in chunk. */
3277 		while (va < virtual_end) {
3278 			/* Don't run into the buffer cache. */
3279 			if (va == kmi.buffer_sva)
3280 				break;
3281 			pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
3282 			if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD))
3283 				break;
3284 			va += PAGE_SIZE;
3285 		}
3286 		dump_map[2].pa_size = va - dump_map[2].pa_start;
3287 	}
3288 }
3289 
3290 #ifdef __powerpc64__
3291 
3292 static size_t
3293 moea64_scan_pmap(struct bitset *dump_bitset)
3294 {
3295 	struct pvo_entry *pvo;
3296 	vm_paddr_t pa, pa_end;
3297 	vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp;
3298 	uint64_t lpsize;
3299 
3300 	lpsize = moea64_large_page_size;
3301 	kstart = trunc_page((vm_offset_t)_etext);
3302 	kend = round_page((vm_offset_t)_end);
3303 	kstart_lp = kstart & ~moea64_large_page_mask;
3304 	kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask;
3305 
3306 	CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, "
3307 	    "kstart_lp=0x%016lx, kend_lp=0x%016lx",
3308 	    kstart, kend, kstart_lp, kend_lp);
3309 
3310 	PMAP_LOCK(kernel_pmap);
3311 	RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) {
3312 		va = pvo->pvo_vaddr;
3313 
3314 		if (va & PVO_DEAD)
3315 			continue;
3316 
3317 		/* Skip DMAP (except kernel area) */
3318 		if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) {
3319 			if (va & PVO_LARGE) {
3320 				pgva = va & ~moea64_large_page_mask;
3321 				if (pgva < kstart_lp || pgva >= kend_lp)
3322 					continue;
3323 			} else {
3324 				pgva = trunc_page(va);
3325 				if (pgva < kstart || pgva >= kend)
3326 					continue;
3327 			}
3328 		}
3329 
3330 		pa = PVO_PADDR(pvo);
3331 
3332 		if (va & PVO_LARGE) {
3333 			pa_end = pa + lpsize;
3334 			for (; pa < pa_end; pa += PAGE_SIZE) {
3335 				if (vm_phys_is_dumpable(pa))
3336 					vm_page_dump_add(dump_bitset, pa);
3337 			}
3338 		} else {
3339 			if (vm_phys_is_dumpable(pa))
3340 				vm_page_dump_add(dump_bitset, pa);
3341 		}
3342 	}
3343 	PMAP_UNLOCK(kernel_pmap);
3344 
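	/* Report the size of the hashed page table itself (8 PTEs per PTEG). */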
3345 	return (sizeof(struct lpte) * moea64_pteg_count * 8);
3346 }
3347 
3348 static struct dump_context dump_ctx;
3349 
3350 static void *
3351 moea64_dump_pmap_init(unsigned blkpgs)
3352 {
3353 	dump_ctx.ptex = 0;
3354 	dump_ctx.ptex_end = moea64_pteg_count * 8;
3355 	dump_ctx.blksz = blkpgs * PAGE_SIZE;
3356 	return (&dump_ctx);
3357 }
3358 
3359 #else
3360 
3361 static size_t
3362 moea64_scan_pmap(struct bitset *dump_bitset __unused)
3363 {
3364 	return (0);
3365 }
3366 
3367 static void *
3368 moea64_dump_pmap_init(unsigned blkpgs)
3369 {
3370 	return (NULL);
3371 }
3372 
3373 #endif
3374 
3375 #ifdef __powerpc64__
3376 static void
3377 moea64_map_range(vm_offset_t va, vm_paddr_t pa, vm_size_t npages)
3378 {
3379 
3380 	for (; npages > 0; --npages) {
3381 		if (moea64_large_page_size != 0 &&
3382 		    (pa & moea64_large_page_mask) == 0 &&
3383 		    (va & moea64_large_page_mask) == 0 &&
3384 		    npages >= (moea64_large_page_size >> PAGE_SHIFT)) {
3385 			PMAP_LOCK(kernel_pmap);
3386 			moea64_kenter_large(va, pa, 0, 0);
3387 			PMAP_UNLOCK(kernel_pmap);
3388 			pa += moea64_large_page_size;
3389 			va += moea64_large_page_size;
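			/*
			 * Advance by one less page than the large page spans,
			 * since the loop decrements npages once more itself.
			 */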
3390 			npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1;
3391 		} else {
3392 			moea64_kenter(va, pa);
3393 			pa += PAGE_SIZE;
3394 			va += PAGE_SIZE;
3395 		}
3396 	}
3397 }
3398 
3399 static void
3400 moea64_page_array_startup(long pages)
3401 {
3402 	long dom_pages[MAXMEMDOM];
3403 	vm_paddr_t pa;
3404 	vm_offset_t va, vm_page_base;
3405 	vm_size_t needed, size;
3406 	int domain;
3407 	int i;
3408 
3409 	vm_page_base = 0xd000000000000000ULL;
3410 
3411 	/* Short-circuit single-domain systems. */
3412 	if (vm_ndomains == 1) {
3413 		size = round_page(pages * sizeof(struct vm_page));
3414 		pa = vm_phys_early_alloc(0, size);
3415 		vm_page_base = moea64_map(&vm_page_base,
3416 		    pa, pa + size, VM_PROT_READ | VM_PROT_WRITE);
3417 		vm_page_array_size = pages;
3418 		vm_page_array = (vm_page_t)vm_page_base;
3419 		return;
3420 	}
3421 
3422 	for (i = 0; i < MAXMEMDOM; i++)
3423 		dom_pages[i] = 0;
3424 
3425 	/* Now get the number of pages required per domain. */
3426 	for (i = 0; i < vm_phys_nsegs; i++) {
3427 		domain = vm_phys_segs[i].domain;
3428 		KASSERT(domain < MAXMEMDOM,
3429 		    ("Invalid vm_phys_segs NUMA domain %d!\n", domain));
3430 		/* Get size of vm_page_array needed for this segment. */
3431 		size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start);
3432 		dom_pages[domain] += size;
3433 	}
3434 
3435 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
3436 		domain = vm_phys_domain(phys_avail[i]);
3437 		KASSERT(domain < MAXMEMDOM,
3438 		    ("Invalid phys_avail NUMA domain %d!\n", domain));
3439 		size = btoc(phys_avail[i + 1] - phys_avail[i]);
3440 		dom_pages[domain] += size;
3441 	}
3442 
3443 	/*
3444 	 * Map in chunks that can get us all 16MB pages.  There will be some
3445 	 * overlap between domains, but that's acceptable for now.
3446 	 */
3447 	vm_page_array_size = 0;
3448 	va = vm_page_base;
3449 	for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) {
3450 		if (dom_pages[i] == 0)
3451 			continue;
3452 		size = ulmin(pages - vm_page_array_size, dom_pages[i]);
3453 		size = round_page(size * sizeof(struct vm_page));
3454 		needed = size;
3455 		size = roundup2(size, moea64_large_page_size);
3456 		pa = vm_phys_early_alloc(i, size);
3457 		vm_page_array_size += size / sizeof(struct vm_page);
3458 		moea64_map_range(va, pa, size >> PAGE_SHIFT);
3459 		/* Scoot up domain 0, to reduce the domain page overlap. */
3460 		if (i == 0)
3461 			vm_page_base += size - needed;
3462 		va += size;
3463 	}
3464 	vm_page_array = (vm_page_t)vm_page_base;
3465 	vm_page_array_size = pages;
3466 }
3467 #endif
3468 
3469 static int64_t
3470 moea64_null_method(void)
3471 {
3472 	return (0);
3473 }
3474 
3475 static int64_t moea64_pte_replace_default(struct pvo_entry *pvo, int flags)
3476 {
3477 	int64_t refchg;
3478 
3479 	refchg = moea64_pte_unset(pvo);
3480 	moea64_pte_insert(pvo);
3481 
3482 	return (refchg);
3483 }
3484 
3485 struct moea64_funcs *moea64_ops;
3486 
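/*
 * Each moea64_<func> wrapper is an ifunc: it resolves to the implementation
 * installed in moea64_ops by the MMU back end, or to 'def' when no override
 * is provided.
 */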
3487 #define DEFINE_OEA64_IFUNC(ret, func, args, def)		\
3488 	DEFINE_IFUNC(, ret, moea64_##func, args) {		\
3489 		moea64_##func##_t f;				\
3490 		if (moea64_ops == NULL)				\
3491 			return ((moea64_##func##_t)def);	\
3492 		f = moea64_ops->func;				\
3493 		return (f != NULL ? f : (moea64_##func##_t)def);\
3494 	}
3495 
3496 void
3497 moea64_install(void)
3498 {
3499 #ifdef __powerpc64__
3500 	if (hw_direct_map == -1) {
3501 		moea64_probe_large_page();
3502 
3503 		/* Use a direct map if we have large page support */
3504 		if (moea64_large_page_size > 0)
3505 			hw_direct_map = 1;
3506 		else
3507 			hw_direct_map = 0;
3508 	}
3509 #endif
3510 
3511 	/*
3512 	 * Default to non-DMAP, and switch over to DMAP functions once we know
3513 	 * we have DMAP.
3514 	 */
3515 	if (hw_direct_map) {
3516 		moea64_methods.quick_enter_page = moea64_quick_enter_page_dmap;
3517 		moea64_methods.quick_remove_page = NULL;
3518 		moea64_methods.copy_page = moea64_copy_page_dmap;
3519 		moea64_methods.zero_page = moea64_zero_page_dmap;
3520 		moea64_methods.copy_pages = moea64_copy_pages_dmap;
3521 	}
3522 }
3523 
3524 DEFINE_OEA64_IFUNC(int64_t, pte_replace, (struct pvo_entry *, int),
3525     moea64_pte_replace_default)
3526 DEFINE_OEA64_IFUNC(int64_t, pte_insert, (struct pvo_entry *), moea64_null_method)
3527 DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method)
3528 DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t),
3529     moea64_null_method)
3530 DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method)
3531 DEFINE_OEA64_IFUNC(int64_t, pte_insert_sp, (struct pvo_entry *), moea64_null_method)
3532 DEFINE_OEA64_IFUNC(int64_t, pte_unset_sp, (struct pvo_entry *), moea64_null_method)
3533 DEFINE_OEA64_IFUNC(int64_t, pte_replace_sp, (struct pvo_entry *), moea64_null_method)
3534 
3535 /* Superpage functions */
3536 
3537 /* MMU interface */
3538 
3539 static bool
3540 moea64_ps_enabled(pmap_t pmap)
3541 {
3542 	return (superpages_enabled);
3543 }
3544 
3545 static void
3546 moea64_align_superpage(vm_object_t object, vm_ooffset_t offset,
3547     vm_offset_t *addr, vm_size_t size)
3548 {
3549 	vm_offset_t sp_offset;
3550 
3551 	if (size < HPT_SP_SIZE)
3552 		return;
3553 
3554 	CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx",
3555 	    __func__, (uintmax_t)offset, addr, (uintmax_t)size);
3556 
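	/*
	 * Nudge *addr so that it shares its offset within an HPT_SP_SIZE
	 * superpage with the object offset, keeping superpage promotion
	 * possible for the resulting mapping.
	 */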
3557 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
3558 		offset += ptoa(object->pg_color);
3559 	sp_offset = offset & HPT_SP_MASK;
3560 	if (size - ((HPT_SP_SIZE - sp_offset) & HPT_SP_MASK) < HPT_SP_SIZE ||
3561 	    (*addr & HPT_SP_MASK) == sp_offset)
3562 		return;
3563 	if ((*addr & HPT_SP_MASK) < sp_offset)
3564 		*addr = (*addr & ~HPT_SP_MASK) + sp_offset;
3565 	else
3566 		*addr = ((*addr + HPT_SP_MASK) & ~HPT_SP_MASK) + sp_offset;
3567 }
3568 
3569 /* Helpers */
3570 
3571 static __inline void
3572 moea64_pvo_cleanup(struct pvo_dlist *tofree)
3573 {
3574 	struct pvo_entry *pvo;
3575 
3576 	/* clean up */
3577 	while (!SLIST_EMPTY(tofree)) {
3578 		pvo = SLIST_FIRST(tofree);
3579 		SLIST_REMOVE_HEAD(tofree, pvo_dlink);
3580 		if (pvo->pvo_vaddr & PVO_DEAD)
3581 			moea64_pvo_remove_from_page(pvo);
3582 		free_pvo_entry(pvo);
3583 	}
3584 }
3585 
3586 static __inline uint16_t
3587 pvo_to_vmpage_flags(struct pvo_entry *pvo)
3588 {
3589 	uint16_t flags;
3590 
3591 	flags = 0;
3592 	if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0)
3593 		flags |= PGA_WRITEABLE;
3594 	if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0)
3595 		flags |= PGA_EXECUTABLE;
3596 
3597 	return (flags);
3598 }
3599 
3600 /*
3601  * Check if the given pvo and its entire superpage lie within the sva-eva range.
3602  */
3603 static __inline bool
3604 moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva)
3605 {
3606 	vm_offset_t spva;
3607 
3608 	spva = PVO_VADDR(pvo) & ~HPT_SP_MASK;
3609 	if (spva >= sva && spva + HPT_SP_SIZE <= eva) {
3610 		/*
3611 		 * Because this function is intended to be called from loops
3612 		 * that iterate over ordered pvo entries, if the condition
3613 		 * above is true then the pvo must be the first of its
3614 		 * superpage.
3615 		 */
3616 		KASSERT(PVO_VADDR(pvo) == spva,
3617 		    ("%s: unexpected unaligned superpage pvo", __func__));
3618 		return (true);
3619 	}
3620 	return (false);
3621 }
3622 
3623 /*
3624  * Update vm about the REF/CHG bits if the superpage is managed and
3625  * has (or had) write access.
3626  */
3627 static void
3628 moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m,
3629     int64_t sp_refchg, vm_prot_t prot)
3630 {
3631 	vm_page_t m_end;
3632 	int64_t refchg;
3633 
3634 	if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) {
3635 		for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) {
3636 			refchg = sp_refchg |
3637 			    atomic_readandclear_32(&m->md.mdpg_attrs);
3638 			if (refchg & LPTE_CHG)
3639 				vm_page_dirty(m);
3640 			if (refchg & LPTE_REF)
3641 				vm_page_aflag_set(m, PGA_REFERENCED);
3642 		}
3643 	}
3644 }
3645 
3646 /* Superpage ops */
3647 
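/*
 * Enter a superpage (psind == 1) mapping for a user pmap: allocate one PVO
 * per 4KB base page up front, remove any existing mappings in the range
 * under the PV and pmap locks, insert the new large-page PVOs, then set the
 * vm_page aflags and synchronize the instruction cache as needed.
 */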
3648 static int
3649 moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
3650     vm_prot_t prot, u_int flags, int8_t psind)
3651 {
3652 	struct pvo_entry *pvo, **pvos;
3653 	struct pvo_head *pvo_head;
3654 	vm_offset_t sva;
3655 	vm_page_t sm;
3656 	vm_paddr_t pa, spa;
3657 	bool sync;
3658 	struct pvo_dlist tofree;
3659 	int error __diagused, i;
3660 	uint16_t aflags;
3661 
3662 	KASSERT((va & HPT_SP_MASK) == 0, ("%s: va %#jx unaligned",
3663 	    __func__, (uintmax_t)va));
3664 	KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind));
3665 	KASSERT(m->psind == 1, ("%s: invalid m->psind: %d",
3666 	    __func__, m->psind));
3667 	KASSERT(pmap != kernel_pmap,
3668 	    ("%s: function called with kernel pmap", __func__));
3669 
3670 	CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1",
3671 	    __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m),
3672 	    prot, flags);
3673 
3674 	SLIST_INIT(&tofree);
3675 
3676 	sva = va;
3677 	sm = m;
3678 	spa = pa = VM_PAGE_TO_PHYS(sm);
3679 
3680 	/* Try to allocate all PVOs first, to make failure handling easier. */
3681 	pvos = malloc(HPT_SP_PAGES * sizeof(struct pvo_entry *), M_TEMP,
3682 	    M_NOWAIT);
3683 	if (pvos == NULL) {
3684 		CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__);
3685 		return (KERN_RESOURCE_SHORTAGE);
3686 	}
3687 
3688 	for (i = 0; i < HPT_SP_PAGES; i++) {
3689 		pvos[i] = alloc_pvo_entry(0);
3690 		if (pvos[i] == NULL) {
3691 			CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__);
3692 			for (i = i - 1; i >= 0; i--)
3693 				free_pvo_entry(pvos[i]);
3694 			free(pvos, M_TEMP);
3695 			return (KERN_RESOURCE_SHORTAGE);
3696 		}
3697 	}
3698 
3699 	PV_WR_LOCK(spa);
3700 	PMAP_LOCK(pmap);
3701 
3702 	/* Note: moea64_remove_locked() also clears cached REF/CHG bits. */
3703 	moea64_remove_locked(pmap, va, va + HPT_SP_SIZE, &tofree);
3704 
3705 	/* Enter pages */
3706 	for (i = 0; i < HPT_SP_PAGES;
3707 	    i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) {
3708 		pvo = pvos[i];
3709 
3710 		pvo->pvo_pte.prot = prot;
3711 		pvo->pvo_pte.pa = (pa & ~HPT_SP_MASK) | LPTE_LP_4K_16M |
3712 		    moea64_calc_wimg(pa, pmap_page_get_memattr(m));
3713 
3714 		if ((flags & PMAP_ENTER_WIRED) != 0)
3715 			pvo->pvo_vaddr |= PVO_WIRED;
3716 		pvo->pvo_vaddr |= PVO_LARGE;
3717 
3718 		if ((m->oflags & VPO_UNMANAGED) != 0)
3719 			pvo_head = NULL;
3720 		else {
3721 			pvo_head = &m->md.mdpg_pvoh;
3722 			pvo->pvo_vaddr |= PVO_MANAGED;
3723 		}
3724 
3725 		init_pvo_entry(pvo, pmap, va);
3726 
3727 		error = moea64_pvo_enter(pvo, pvo_head, NULL);
3728 		/*
3729 		 * All superpage PVOs were previously removed, so no errors
3730 		 * should occur while inserting the new ones.
3731 		 */
3732 		KASSERT(error == 0, ("%s: unexpected error "
3733 			    "when inserting superpage PVO: %d",
3734 			    __func__, error));
3735 	}
3736 
3737 	PMAP_UNLOCK(pmap);
3738 	PV_UNLOCK(spa);
3739 
3740 	sync = (sm->a.flags & PGA_EXECUTABLE) == 0;
3741 	/* Note: moea64_pvo_cleanup() also clears page prot. flags. */
3742 	moea64_pvo_cleanup(&tofree);
3743 	pvo = pvos[0];
3744 
3745 	/* Set vm page flags */
3746 	aflags = pvo_to_vmpage_flags(pvo);
3747 	if (aflags != 0)
3748 		for (m = sm; m < &sm[HPT_SP_PAGES]; m++)
3749 			vm_page_aflag_set(m, aflags);
3750 
3751 	/*
3752 	 * Flush the page from the instruction cache if this page is
3753 	 * mapped executable and cacheable.
3754 	 */
3755 	if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0)
3756 		moea64_syncicache(pmap, sva, spa, HPT_SP_SIZE);
3757 
3758 	atomic_add_long(&sp_mappings, 1);
3759 	CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p",
3760 	    __func__, (uintmax_t)sva, pmap);
3761 
3762 	free(pvos, M_TEMP);
3763 	return (KERN_SUCCESS);
3764 }
3765 
3766 #if VM_NRESERVLEVEL > 0
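/*
 * Try to promote the 4KB mappings covering the superpage that contains
 * "va" into a single superpage mapping.  Promotion is skipped unless every
 * base-page PVO is present and alive, maps the expected physical address,
 * and agrees on protection, WIMG attributes and promotion flags.
 */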
3767 static void
3768 moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m)
3769 {
3770 	struct pvo_entry *first, *pvo;
3771 	vm_paddr_t pa, pa_end;
3772 	vm_offset_t sva, va_end;
3773 	int64_t sp_refchg;
3774 
3775 	/* This CTR may generate a lot of output. */
3776 	/* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */
3777 
3778 	va &= ~HPT_SP_MASK;
3779 	sva = va;
3780 	/* Get superpage */
3781 	pa = VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK;
3782 	m = PHYS_TO_VM_PAGE(pa);
3783 
3784 	PMAP_LOCK(pmap);
3785 
3786 	/*
3787 	 * Check if all pages meet promotion criteria.
3788 	 *
3789 	 * XXX In some cases the loop below may be executed for each or most
3790 	 * of the entered pages of a superpage, which can be expensive
3791 	 * (although it has not been profiled) and may need some optimization.
3792 	 *
3793 	 * Some cases where this seems to happen are:
3794 	 * - When a superpage is first entered read-only and later becomes
3795 	 *   read-write.
3796 	 * - When some of the superpage's virtual addresses map to previously
3797 	 *   wired/cached pages while others map to pages allocated from a
3798 	 *   different physical address range. A common scenario where this
3799 	 *   happens is when mmap'ing a file that is already present in FS
3800 	 *   block cache and doesn't fill a superpage.
3801 	 */
3802 	first = pvo = moea64_pvo_find_va(pmap, sva);
3803 	for (pa_end = pa + HPT_SP_SIZE;
3804 	    pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) {
3805 		if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
3806 			CTR3(KTR_PMAP,
3807 			    "%s: NULL or dead PVO: pmap=%p, va=%#jx",
3808 			    __func__, pmap, (uintmax_t)va);
3809 			goto error;
3810 		}
3811 		if (PVO_PADDR(pvo) != pa) {
3812 			CTR5(KTR_PMAP, "%s: PAs don't match: "
3813 			    "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx",
3814 			    __func__, pmap, (uintmax_t)va,
3815 			    (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa);
3816 			atomic_add_long(&sp_p_fail_pa, 1);
3817 			goto error;
3818 		}
3819 		if ((first->pvo_vaddr & PVO_FLAGS_PROMOTE) !=
3820 		    (pvo->pvo_vaddr & PVO_FLAGS_PROMOTE)) {
3821 			CTR5(KTR_PMAP, "%s: PVO flags don't match: "
3822 			    "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx",
3823 			    __func__, pmap, (uintmax_t)va,
3824 			    (uintmax_t)(pvo->pvo_vaddr & PVO_FLAGS_PROMOTE),
3825 			    (uintmax_t)(first->pvo_vaddr & PVO_FLAGS_PROMOTE));
3826 			atomic_add_long(&sp_p_fail_flags, 1);
3827 			goto error;
3828 		}
3829 		if (first->pvo_pte.prot != pvo->pvo_pte.prot) {
3830 			CTR5(KTR_PMAP, "%s: PVO protections don't match: "
3831 			    "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x",
3832 			    __func__, pmap, (uintmax_t)va,
3833 			    pvo->pvo_pte.prot, first->pvo_pte.prot);
3834 			atomic_add_long(&sp_p_fail_prot, 1);
3835 			goto error;
3836 		}
3837 		if ((first->pvo_pte.pa & LPTE_WIMG) !=
3838 		    (pvo->pvo_pte.pa & LPTE_WIMG)) {
3839 			CTR5(KTR_PMAP, "%s: WIMG bits don't match: "
3840 			    "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx",
3841 			    __func__, pmap, (uintmax_t)va,
3842 			    (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG),
3843 			    (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG));
3844 			atomic_add_long(&sp_p_fail_wimg, 1);
3845 			goto error;
3846 		}
3847 
3848 		pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo);
3849 	}
3850 
3851 	/* All OK, promote. */
3852 
3853 	/*
3854 	 * Handle superpage REF/CHG bits. If REF or CHG is set in
3855 	 * any page, then it must be set in the superpage.
3856 	 *
3857 	 * Instead of querying each page, we take advantage of two facts:
3858 	 * 1- If a page is being promoted, it was referenced.
3859 	 * 2- If promoted pages are writable, they were modified.
3860 	 */
3861 	sp_refchg = LPTE_REF |
3862 	    ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0);
3863 
3864 	/* Promote pages */
3865 
3866 	for (pvo = first, va_end = PVO_VADDR(pvo) + HPT_SP_SIZE;
3867 	    pvo != NULL && PVO_VADDR(pvo) < va_end;
3868 	    pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
3869 		pvo->pvo_pte.pa &= ADDR_POFF | ~HPT_SP_MASK;
3870 		pvo->pvo_pte.pa |= LPTE_LP_4K_16M;
3871 		pvo->pvo_vaddr |= PVO_LARGE;
3872 	}
3873 	moea64_pte_replace_sp(first);
3874 
3875 	/* Send REF/CHG bits to VM */
3876 	moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot);
3877 
3878 	/* Use first page to cache REF/CHG bits */
3879 	atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP);
3880 
3881 	PMAP_UNLOCK(pmap);
3882 
3883 	atomic_add_long(&sp_mappings, 1);
3884 	atomic_add_long(&sp_promotions, 1);
3885 	CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
3886 	    __func__, (uintmax_t)sva, pmap);
3887 	return;
3888 
3889 error:
3890 	atomic_add_long(&sp_p_failures, 1);
3891 	PMAP_UNLOCK(pmap);
3892 }
3893 #endif
3894 
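/*
 * Demote a superpage mapping back into 4KB mappings.  "sp" must be the
 * first (superpage-aligned) PVO of the superpage; moea64_sp_demote() below
 * handles unaligned callers.
 */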
3895 static void
3896 moea64_sp_demote_aligned(struct pvo_entry *sp)
3897 {
3898 	struct pvo_entry *pvo;
3899 	vm_offset_t va, va_end;
3900 	vm_paddr_t pa;
3901 	vm_page_t m;
3902 	pmap_t pmap __diagused;
3903 	int64_t refchg;
3904 
3905 	CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
3906 
3907 	pmap = sp->pvo_pmap;
3908 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3909 
3910 	pvo = sp;
3911 
3912 	/* Demote pages */
3913 
3914 	va = PVO_VADDR(pvo);
3915 	pa = PVO_PADDR(pvo);
3916 	m = PHYS_TO_VM_PAGE(pa);
3917 
3918 	for (pvo = sp, va_end = va + HPT_SP_SIZE;
3919 	    pvo != NULL && PVO_VADDR(pvo) < va_end;
3920 	    pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo),
3921 	    va += PAGE_SIZE, pa += PAGE_SIZE) {
3922 		KASSERT(pvo && PVO_VADDR(pvo) == va,
3923 		    ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va));
3924 
3925 		pvo->pvo_vaddr &= ~PVO_LARGE;
3926 		pvo->pvo_pte.pa &= ~LPTE_RPGN;
3927 		pvo->pvo_pte.pa |= pa;
3928 
3929 	}
3930 	refchg = moea64_pte_replace_sp(sp);
3931 
3932 	/*
3933 	 * Clear SP flag
3934 	 *
3935 	 * XXX It is possible that another pmap has this page mapped as
3936 	 *     part of a superpage, but since the SP flag is used only to
3937 	 *     cache SP REF/CHG bits, which are queried from the page table
3938 	 *     when not cached, it should be OK to clear it here.
3939 	 */
3940 	atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP);
3941 
3942 	/*
3943 	 * Handle superpage REF/CHG bits. A bit set in the superpage
3944 	 * means all pages should consider it set.
3945 	 */
3946 	moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot);
3947 
3948 	atomic_add_long(&sp_demotions, 1);
3949 	CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
3950 	    __func__, (uintmax_t)PVO_VADDR(sp), pmap);
3951 }
3952 
3953 static void
3954 moea64_sp_demote(struct pvo_entry *pvo)
3955 {
3956 	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
3957 
3958 	if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
3959 		pvo = moea64_pvo_find_va(pvo->pvo_pmap,
3960 		    PVO_VADDR(pvo) & ~HPT_SP_MASK);
3961 		KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx",
3962 		     __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
3963 	}
3964 	moea64_sp_demote_aligned(pvo);
3965 }
3966 
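/*
 * Clear PVO_WIRED on every base page of a superpage and update the page
 * table without invalidation, accumulating REF/CHG bits for the VM layer.
 * Returns the last PVO processed so the caller can continue its walk of
 * the pmap's PVO tree.
 */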
3967 static struct pvo_entry *
3968 moea64_sp_unwire(struct pvo_entry *sp)
3969 {
3970 	struct pvo_entry *pvo, *prev;
3971 	vm_offset_t eva;
3972 	pmap_t pm;
3973 	int64_t ret, refchg;
3974 
3975 	CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
3976 
3977 	pm = sp->pvo_pmap;
3978 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
3979 
3980 	eva = PVO_VADDR(sp) + HPT_SP_SIZE;
3981 	refchg = 0;
3982 	for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
3983 	    prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
3984 		if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
3985 			panic("%s: pvo %p is missing PVO_WIRED",
3986 			    __func__, pvo);
3987 		pvo->pvo_vaddr &= ~PVO_WIRED;
3988 
3989 		ret = moea64_pte_replace(pvo, 0 /* No invalidation */);
3990 		if (ret < 0)
3991 			refchg |= LPTE_CHG;
3992 		else
3993 			refchg |= ret;
3994 
3995 		pm->pm_stats.wired_count--;
3996 	}
3997 
3998 	/* Send REF/CHG bits to VM */
3999 	moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)),
4000 	    refchg, sp->pvo_pte.prot);
4001 
4002 	return (prev);
4003 }
4004 
4005 static struct pvo_entry *
4006 moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot)
4007 {
4008 	struct pvo_entry *pvo, *prev;
4009 	vm_offset_t eva;
4010 	pmap_t pm;
4011 	vm_page_t m, m_end;
4012 	int64_t ret, refchg;
4013 	vm_prot_t oldprot;
4014 
4015 	CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x",
4016 	    __func__, (uintmax_t)PVO_VADDR(sp), prot);
4017 
4018 	pm = sp->pvo_pmap;
4019 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
4020 
4021 	oldprot = sp->pvo_pte.prot;
4022 	m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
4023 	KASSERT(m != NULL, ("%s: missing vm page for pa %#jx",
4024 	    __func__, (uintmax_t)PVO_PADDR(sp)));
4025 	eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4026 	refchg = 0;
4027 
4028 	for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
4029 	    prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
4030 		pvo->pvo_pte.prot = prot;
4031 		/*
4032 		 * If the PVO is in the page table, update mapping
4033 		 */
4034 		ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
4035 		if (ret < 0)
4036 			refchg |= LPTE_CHG;
4037 		else
4038 			refchg |= ret;
4039 	}
4040 
4041 	/* Send REF/CHG bits to VM */
4042 	moea64_sp_refchg_process(sp, m, refchg, oldprot);
4043 
4044 	/* Handle pages that became executable */
4045 	if ((m->a.flags & PGA_EXECUTABLE) == 0 &&
4046 	    (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
4047 		if ((m->oflags & VPO_UNMANAGED) == 0)
4048 			for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++)
4049 				vm_page_aflag_set(m, PGA_EXECUTABLE);
4050 		moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp),
4051 		    HPT_SP_SIZE);
4052 	}
4053 
4054 	return (prev);
4055 }
4056 
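/*
 * Change the protection of an entire superpage.  Accumulated REF/CHG bits
 * are forwarded to the VM layer, and the instruction cache is synchronized
 * if the mapping became executable and cacheable.  Returns the last PVO
 * processed.
 */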
4057 static struct pvo_entry *
4058 moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree)
4059 {
4060 	struct pvo_entry *pvo, *tpvo;
4061 	vm_offset_t eva;
4062 	pmap_t pm __diagused;
4063 
4064 	CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
4065 
4066 	pm = sp->pvo_pmap;
4067 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
4068 
4069 	eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4070 	for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
4071 		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
4072 
4073 		/*
4074 		 * For locking reasons, remove this from the page table and
4075 		 * pmap, but save delinking from the vm_page for a second
4076 		 * pass
4077 		 */
4078 		moea64_pvo_remove_from_pmap(pvo);
4079 		SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
4080 	}
4081 
4082 	/*
4083 	 * Clear SP bit
4084 	 *
4085 	 * XXX See comment in moea64_sp_demote_aligned() for why it's
4086 	 *     ok to always clear the SP bit on remove/demote.
4087 	 */
4088 	atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs,
4089 	    MDPG_ATTR_SP);
4090 
4091 	return (tpvo);
4092 }
4093 
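/*
 * With the pmap and PV locks held, gather REF/CHG bits from the PTEs of
 * the superpage containing "pvo", stopping early once the requested bit is
 * found, and cache the result in the first page's mdpg_attrs.
 */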
4094 static int64_t
4095 moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit)
4096 {
4097 	int64_t refchg, ret;
4098 	vm_offset_t eva;
4099 	vm_page_t m;
4100 	pmap_t pmap;
4101 	struct pvo_entry *sp;
4102 
4103 	PV_LOCKASSERT(PVO_PADDR(pvo));
4104 
4105 	pmap = pvo->pvo_pmap;
4106 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4107 
4108 	/* Get first SP PVO */
4109 	if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
4110 		sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
4111 		KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
4112 		     __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
4113 	} else
4114 		sp = pvo;
4115 	eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4116 
4117 	refchg = 0;
4118 	for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
4119 	    pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
4120 		ret = moea64_pte_synch(pvo);
4121 		if (ret > 0) {
4122 			refchg |= ret & (LPTE_CHG | LPTE_REF);
4123 			if ((refchg & ptebit) != 0)
4124 				break;
4125 		}
4126 	}
4127 
4128 	/* Save results */
4129 	if (refchg != 0) {
4130 		m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
4131 		atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP);
4132 	}
4133 
4134 	return (refchg);
4135 }
4136 
4137 /*
4138  * Note: this assumes the vm_page represented by the given pvo
4139  * is at least read locked.
4140  */
4141 static int64_t
4142 moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit)
4143 {
4144 	int64_t refchg;
4145 	pmap_t pmap;
4146 
4147 	PV_LOCKASSERT(PVO_PADDR(pvo));
4148 
4149 	pmap = pvo->pvo_pmap;
4150 	PMAP_LOCK(pmap);
4151 
4152 	/*
4153 	 * Check if SP was demoted/removed before pmap lock was acquired.
4154 	 */
4155 	if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
4156 		CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
4157 		    __func__, (uintmax_t)PVO_PADDR(pvo));
4158 		PMAP_UNLOCK(pmap);
4159 		return (-1);
4160 	}
4161 
4162 	refchg = moea64_sp_query_locked(pvo, ptebit);
4163 	PMAP_UNLOCK(pmap);
4164 
4165 	CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
4166 	    __func__, (uintmax_t)PVO_VADDR(pvo),
4167 	    (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg);
4168 
4169 	return (refchg);
4170 }
4171 
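/*
 * Clear "ptebit" in every PTE of the superpage containing "pvo" and in the
 * cached attributes of its first page.  Returns the accumulated REF/CHG
 * bits, or -1 if the superpage was demoted or removed before the pmap lock
 * could be taken.
 */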
4172 static int64_t
4173 moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit)
4174 {
4175 	int64_t refchg, ret;
4176 	pmap_t pmap;
4177 	struct pvo_entry *sp;
4178 	vm_offset_t eva;
4179 	vm_page_t m;
4180 
4181 	pmap = pvo->pvo_pmap;
4182 	PMAP_LOCK(pmap);
4183 
4184 	/*
4185 	 * Check if SP was demoted/removed before pmap lock was acquired.
4186 	 */
4187 	if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
4188 		CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
4189 		    __func__, (uintmax_t)PVO_PADDR(pvo));
4190 		PMAP_UNLOCK(pmap);
4191 		return (-1);
4192 	}
4193 
4194 	/* Get first SP PVO */
4195 	if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
4196 		sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
4197 		KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
4198 		     __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
4199 	} else
4200 		sp = pvo;
4201 	eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4202 
4203 	refchg = 0;
4204 	for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
4205 	    pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
4206 		ret = moea64_pte_clear(pvo, ptebit);
4207 		if (ret > 0)
4208 			refchg |= ret & (LPTE_CHG | LPTE_REF);
4209 	}
4210 
4211 	m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
4212 	atomic_clear_32(&m->md.mdpg_attrs, ptebit);
4213 	PMAP_UNLOCK(pmap);
4214 
4215 	CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
4216 	    __func__, (uintmax_t)PVO_VADDR(sp),
4217 	    (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg);
4218 
4219 	return (refchg);
4220 }
4221 
4222 static int64_t
4223 moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit)
4224 {
4225 	int64_t count, ret;
4226 	pmap_t pmap;
4227 
4228 	count = 0;
4229 	pmap = pvo->pvo_pmap;
4230 
4231 	/*
4232 	 * Since this reference bit is shared by 4096 4KB pages, it
4233 	 * should not be cleared every time it is tested. Apply a
4234 	 * simple "hash" function on the physical page number, the
4235 	 * virtual superpage number, and the pmap address to select
4236 	 * one 4KB page out of the 4096 on which testing the
4237 	 * reference bit will result in clearing that reference bit.
4238 	 * This function is designed to avoid the selection of the
4239 	 * same 4KB page for every 16MB page mapping.
4240 	 *
4241 	 * Always leave the reference bit of a wired mapping set, as
4242 	 * the current state of its reference bit won't affect page
4243 	 * replacement.
4244 	 */
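	/*
	 * For example, assuming 16MB superpages (HPT_SP_PAGES == 4096) as
	 * described above: the backing physical pages are contiguous and
	 * superpage-aligned, so the low 12 bits of the page frame number
	 * take each value exactly once across the mapping; XORing in the
	 * superpage VA number and the pmap address only changes which
	 * single 4KB page hashes to zero and is selected for clearing.
	 */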
4245 	if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^
4246 	    (PVO_VADDR(pvo) >> HPT_SP_SHIFT) ^ (uintptr_t)pmap) &
4247 	    (HPT_SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) {
4248 		if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1)
4249 			return (-1);
4250 
4251 		if ((ret & ptebit) != 0)
4252 			count++;
4253 
4254 	/*
4255 	 * If this page was not selected by the hash function, then assume
4256 	 * its REF bit was set.
4257 	 */
4258 	} else if (ptebit == LPTE_REF) {
4259 		count++;
4260 
4261 	/*
4262 	 * To clear the CHG bit of a single SP page, the superpage must first
4263 	 * be demoted. But if no CHG bit is set, there is nothing to clear and
4264 	 * thus no SP demotion is needed.
4265 	 */
4266 	} else {
4267 		CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx",
4268 		    __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo),
4269 		    (uintmax_t)PVO_PADDR(pvo));
4270 
4271 		PMAP_LOCK(pmap);
4272 
4273 		/*
4274 		 * Make sure SP wasn't demoted/removed before pmap lock
4275 		 * was acquired.
4276 		 */
4277 		if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
4278 			CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
4279 			    __func__, (uintmax_t)PVO_PADDR(pvo));
4280 			PMAP_UNLOCK(pmap);
4281 			return (-1);
4282 		}
4283 
4284 		ret = moea64_sp_query_locked(pvo, ptebit);
4285 		if ((ret & ptebit) != 0)
4286 			count++;
4287 		else {
4288 			PMAP_UNLOCK(pmap);
4289 			return (0);
4290 		}
4291 
4292 		moea64_sp_demote(pvo);
4293 		moea64_pte_clear(pvo, ptebit);
4294 
4295 		/*
4296 		 * Write protect the mapping to a single page so that a
4297 		 * subsequent write access may repromote.
4298 		 */
4299 		if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
4300 			moea64_pvo_protect(pmap, pvo,
4301 			    pvo->pvo_pte.prot & ~VM_PROT_WRITE);
4302 
4303 		PMAP_UNLOCK(pmap);
4304 	}
4305 
4306 	return (count);
4307 }
4308