xref: /src/sys/amd64/amd64/machdep.c (revision 89589b6d3fbac43eb7c6b3cdbdd6f077888b2142)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #include "opt_atpic.h"
42 #include "opt_cpu.h"
43 #include "opt_ddb.h"
44 #include "opt_inet.h"
45 #include "opt_isa.h"
46 #include "opt_kstack_pages.h"
47 #include "opt_maxmem.h"
48 #include "opt_pci.h"
49 #include "opt_platform.h"
50 #include "opt_sched.h"
51 
52 #include <sys/param.h>
53 #include <sys/proc.h>
54 #include <sys/systm.h>
55 #include <sys/asan.h>
56 #include <sys/bio.h>
57 #include <sys/buf.h>
58 #include <sys/bus.h>
59 #include <sys/callout.h>
60 #include <sys/cons.h>
61 #include <sys/cpu.h>
62 #include <sys/csan.h>
63 #include <sys/efi.h>
64 #include <sys/eventhandler.h>
65 #include <sys/exec.h>
66 #include <sys/imgact.h>
67 #include <sys/kdb.h>
68 #include <sys/kernel.h>
69 #include <sys/ktr.h>
70 #include <sys/linker.h>
71 #include <sys/lock.h>
72 #include <sys/malloc.h>
73 #include <sys/memrange.h>
74 #include <sys/msan.h>
75 #include <sys/msgbuf.h>
76 #include <sys/mutex.h>
77 #include <sys/pcpu.h>
78 #include <sys/ptrace.h>
79 #include <sys/reboot.h>
80 #include <sys/reg.h>
81 #include <sys/rwlock.h>
82 #include <sys/sched.h>
83 #include <sys/signalvar.h>
84 #include <sys/smp.h>
85 #include <sys/syscallsubr.h>
86 #include <sys/sysctl.h>
87 #include <sys/sysent.h>
88 #include <sys/sysproto.h>
89 #include <sys/ucontext.h>
90 #include <sys/vmmeter.h>
91 
92 #include <vm/vm.h>
93 #include <vm/vm_param.h>
94 #include <vm/vm_extern.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_pager.h>
100 #include <vm/vm_phys.h>
101 #include <vm/vm_dumpset.h>
102 
103 #ifdef DDB
104 #ifndef KDB
105 #error KDB must be enabled in order for DDB to work!
106 #endif
107 #include <ddb/ddb.h>
108 #include <ddb/db_sym.h>
109 #endif
110 
111 #include <net/netisr.h>
112 
113 #include <dev/smbios/smbios.h>
114 
115 #include <machine/clock.h>
116 #include <machine/cpu.h>
117 #include <machine/cputypes.h>
118 #include <machine/frame.h>
119 #include <machine/intr_machdep.h>
120 #include <x86/mca.h>
121 #include <machine/md_var.h>
122 #include <machine/metadata.h>
123 #include <machine/pc/bios.h>
124 #include <machine/pcb.h>
125 #include <machine/proc.h>
126 #include <machine/sigframe.h>
127 #include <machine/specialreg.h>
128 #include <machine/trap.h>
129 #include <machine/tss.h>
130 #include <x86/ucode.h>
131 #include <x86/ifunc.h>
132 #include <machine/smp.h>
133 #ifdef FDT
134 #include <x86/fdt.h>
135 #endif
136 
137 #ifdef DEV_ATPIC
138 #include <x86/isa/icu.h>
139 #else
140 #include <x86/apicvar.h>
141 #endif
142 
143 #include <isa/isareg.h>
144 #include <isa/rtc.h>
145 #include <x86/init.h>
146 
147 #ifndef SMP
148 #error amd64 requires options SMP
149 #endif
150 
151 /* Sanity check for __curthread() */
152 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
153 
154 /*
155  * The PTI trampoline stack needs enough space for a hardware trapframe and a
156  * couple of scratch registers, as well as the trapframe left behind after an
157  * iret fault.
158  */
159 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
160     offsetof(struct pti_frame, pti_rip));
161 
162 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
163 
164 static void cpu_startup(void *);
165 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
166 
167 /* Probe 8254 PIT and TSC. */
168 static void native_clock_source_init(void);
169 
170 /* Preload data parse function */
171 static void native_parse_preload_data(u_int64_t);
172 
173 /* Native function to fetch and parse the e820 map */
174 static void native_parse_memmap(vm_paddr_t *, int *);
175 
176 /* Default init_ops implementation. */
177 struct init_ops init_ops = {
178 	.parse_preload_data =		native_parse_preload_data,
179 	.early_clock_source_init =	native_clock_source_init,
180 	.early_delay =			i8254_delay,
181 	.parse_memmap =			native_parse_memmap,
182 };
183 
184 /*
185  * Physical address of the EFI System Table. Stashed from the metadata hints
186  * passed into the kernel and used by the EFI code to call runtime services.
187  */
188 vm_paddr_t efi_systbl_phys;
189 
190 /*
191  * Bitmap of extra EFI memory region types that should be preserved and mapped
192  * during runtime services calls.
193  */
194 uint32_t efi_map_regs;
195 
/* Intel ICH registers */
#define ICH_PMBASE	0x400
/*
 * Parenthesized so the macro expands safely inside a larger expression
 * (arithmetic, comparison, further address math); value is unchanged.
 */
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
199 
200 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
201 
202 int cold = 1;
203 
204 long Maxmem = 0;
205 long realmem = 0;
206 int late_console = 1;
207 int lass_enabled = 0;
208 
209 struct kva_md_info kmi;
210 
211 struct region_descriptor r_idt;
212 
213 struct pcpu *__pcpu;
214 struct pcpu temp_bsp_pcpu;
215 
216 struct mtx icu_lock;
217 
218 struct mem_range_softc mem_range_softc;
219 
220 struct mtx dt_lock;	/* lock for GDT and LDT */
221 
222 void (*vmm_suspend_p)(void);
223 void (*vmm_resume_p)(void);
224 
225 bool efi_boot;
226 
/*
 * First-stage CPU startup, run once on the BSP at SI_SUB_CPU /
 * SI_ORDER_FIRST (see the SYSINIT above): apply the Intel-Mac legacy
 * USB SMI quirk, start the RTC and print CPU info, report physical
 * memory, initialize the kernel submaps and the buffer cache, and
 * finally set mandatory control-register state.
 */
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			/* Clear bit 3 (LEGACY_USB_EN) in SMI_EN. */
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		/* SMBIOS reports KB; shift by 10 to get bytes. */
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	/* Fall back to Maxmem when the SMBIOS value is implausibly small. */
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	/* realmem is kept in pages, not bytes. */
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}
318 
/*
 * Perform the late pass of ELF ifunc relocation for the kernel image
 * (link_elf_late_ireloc()).  Scheduled at SI_SUB_CPU / SI_ORDER_ANY so
 * it runs after cpu_startup() within the same SYSINIT subsystem.
 */
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
325 
/*
 * Set mandatory CR0 bits for this CPU: CR0_MP/CR0_NE (monitor
 * coprocessor, native FPU error reporting), CR0_TS (FPU context marked
 * not loaded), CR0_WP (write protection enforced for supervisor-mode
 * accesses) and CR0_AM (alignment-check support).  TSENTER/TSEXIT
 * bracket the work for boot-time timestamping.
 */
void
cpu_setregs(void)
{
	register_t cr0;

	TSENTER();
	cr0 = rcr0();
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	TSENTER2("load_cr0");
	load_cr0(cr0);
	TSEXIT2("load_cr0");
	TSEXIT();
}
339 
340 /*
341  * Initialize amd64 and configure to run kernel
342  */
343 
344 /*
345  * Initialize segments & interrupt table
346  */
347 static struct gate_descriptor idt0[NIDT];
348 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
349 
350 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
351 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
352 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
353 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
354 CTASSERT(sizeof(struct nmi_pcpu) == 16);
355 
/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
[GNULL_SEL] = { /* 0 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GNULL2_SEL] = { /*	1 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GCODE_SEL] = { /* 4 Code Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GDATA_SEL] = { /* 5 Data Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* System descriptors are 16 bytes in long mode: the TSS spans two slots. */
[GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL] = { /* 11 LDT Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Likewise, the LDT descriptor occupies two GDT slots in long mode. */
[GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
482 
483 void
setidt(int idx,inthand_t * func,int typ,int dpl,int ist)484 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
485 {
486 	struct gate_descriptor *ip;
487 
488 	ip = idt + idx;
489 	ip->gd_looffset = (uintptr_t)func;
490 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
491 	ip->gd_ist = ist;
492 	ip->gd_xx = 0;
493 	ip->gd_type = typ;
494 	ip->gd_dpl = dpl;
495 	ip->gd_p = 1;
496 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
497 }
498 
499 extern inthand_t
500 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
501 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
502 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
503 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
504 	IDTVEC(xmm), IDTVEC(dblfault),
505 	IDTVEC(div_pti), IDTVEC(bpt_pti),
506 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
507 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
508 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
509 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
510 	IDTVEC(xmm_pti),
511 #ifdef KDTRACE_HOOKS
512 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
513 #endif
514 #ifdef XENHVM
515 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
516 #endif
517 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
518 	IDTVEC(fast_syscall_pti);
519 
520 #ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the handler address from the split gate fields. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}
542 
/* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	/* sidt/sgdt store a packed 10-byte limit:base pseudo-descriptor. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* XCR0 is only architecturally defined when CR4.OSXSAVE is set. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* FEATURE_CONTROL MSR exists only with VMX/SMX support. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
576 
/* Show the x86 debug registers (dr4/dr5 are aliases and not shown). */
DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
587 #endif
588 
589 void
sdtossd(struct user_segment_descriptor * sd,struct soft_segment_descriptor * ssd)590 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
591 {
592 
593 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
594 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
595 	ssd->ssd_type  = sd->sd_type;
596 	ssd->ssd_dpl   = sd->sd_dpl;
597 	ssd->ssd_p     = sd->sd_p;
598 	ssd->ssd_long  = sd->sd_long;
599 	ssd->ssd_def32 = sd->sd_def32;
600 	ssd->ssd_gran  = sd->sd_gran;
601 }
602 
603 void
ssdtosd(struct soft_segment_descriptor * ssd,struct user_segment_descriptor * sd)604 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
605 {
606 
607 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
608 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
609 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
610 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
611 	sd->sd_type  = ssd->ssd_type;
612 	sd->sd_dpl   = ssd->ssd_dpl;
613 	sd->sd_p     = ssd->ssd_p;
614 	sd->sd_long  = ssd->ssd_long;
615 	sd->sd_def32 = ssd->ssd_def32;
616 	sd->sd_gran  = ssd->ssd_gran;
617 }
618 
619 void
ssdtosyssd(struct soft_segment_descriptor * ssd,struct system_segment_descriptor * sd)620 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
621 {
622 
623 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
624 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
625 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
626 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
627 	sd->sd_type  = ssd->ssd_type;
628 	sd->sd_dpl   = ssd->ssd_dpl;
629 	sd->sd_p     = ssd->ssd_p;
630 	sd->sd_gran  = ssd->ssd_gran;
631 }
632 
633 u_int basemem;
634 
/*
 * Record the physical region [base, base + length) in the physmap
 * array, which holds base/end pairs sorted by base address (an
 * invariant this function itself maintains).  Adjacent regions are
 * coalesced when possible.
 *
 * Returns 1 when the entry was recorded or deliberately ignored (zero
 * length, or overlap with an already-recorded region); returns 0 only
 * when the table is full, signalling the caller to stop iterating.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i < physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			/* Entirely before entry i: insert here. */
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			/* Otherwise it overlaps entry i: drop it. */
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.  Pairs are shifted two slots at a time.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
701 
702 void
bios_add_smap_entries(struct bios_smap * smapbase,u_int32_t smapsize,vm_paddr_t * physmap,int * physmap_idx)703 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
704                       vm_paddr_t *physmap, int *physmap_idx)
705 {
706 	struct bios_smap *smap, *smapend;
707 
708 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
709 
710 	for (smap = smapbase; smap < smapend; smap++) {
711 		if (boothowto & RB_VERBOSE)
712 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
713 			    smap->type, smap->base, smap->length);
714 
715 		if (smap->type != SMAP_TYPE_MEMORY)
716 			continue;
717 
718 		if (!add_physmap_entry(smap->base, smap->length, physmap,
719 		    physmap_idx))
720 			break;
721 	}
722 }
723 
/*
 * Walk the UEFI memory map passed in from the loader and add each
 * usable descriptor (loader/boot-services code+data and conventional
 * memory, subject to EFI_MAP_BOOTTYPE_ALLOWED) to the physmap array.
 * With RB_VERBOSE, every descriptor is printed with its type name and
 * attribute flags first.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.  The descriptor array starts after the
	 * header, rounded up to a 16-byte boundary.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard against a corrupt header before dividing. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * NOTE(review): efi_map_regs is uint32_t but fetched via
	 * TUNABLE_INT_FETCH -- relies on int being 32 bits here.
	 */
	TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
	/*
	 * Step by the firmware-reported descriptor size, which may be
	 * larger than sizeof(struct efi_md).
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
			if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
				continue;
			/* FALLTHROUGH */
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		/* Stop if the physmap table fills up. */
		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}
825 
826 static void
native_parse_memmap(vm_paddr_t * physmap,int * physmap_idx)827 native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
828 {
829 	struct bios_smap *smap;
830 	struct efi_map_header *efihdr;
831 
832 	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
833 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
834 	smap = (struct bios_smap *)preload_search_info(preload_kmdp,
835 	    MODINFO_METADATA | MODINFOMD_SMAP);
836 	if (efihdr == NULL && smap == NULL)
837 		panic("No BIOS smap or EFI map info from loader!");
838 
839 	if (efihdr != NULL) {
840 		add_efi_map_entries(efihdr, physmap, physmap_idx);
841 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
842 	} else {
843 		/*
844 		 * Memory map from INT 15:E820.
845 		 *
846 		 * subr_module.c says:
847 		 * "Consumer may safely assume that size value precedes data."
848 		 * ie: an int32_t immediately precedes smap.
849 		 */
850 		u_int32_t size = *((u_int32_t *)smap - 1);
851 
852 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
853 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
854 	}
855 }
856 
857 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
858 
859 /*
860  * Populate the (physmap) array with base/bound pairs describing the
861  * available physical memory in the system, then test this memory and
862  * build the phys_avail array describing the actually-available memory.
863  *
864  * Total memory size may be set by the kernel environment variable
865  * hw.physmem or the compile-time define MAXMEM.
866  *
867  * XXX first should be vm_paddr_t.
868  */
869 static void
getmemsize(u_int64_t first)870 getmemsize(u_int64_t first)
871 {
872 	int i, physmap_idx, pa_indx, da_indx;
873 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
874 	u_long physmem_start, physmem_tunable, memtest;
875 	pt_entry_t *pte;
876 	quad_t dcons_addr, dcons_size;
877 	int page_counter;
878 
879 	TSENTER();
880 	/*
881 	 * Tell the physical memory allocator about pages used to store
882 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
883 	 */
884 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
885 
886 	bzero(physmap, sizeof(physmap));
887 	physmap_idx = 0;
888 
889 	init_ops.parse_memmap(physmap, &physmap_idx);
890 	physmap_idx -= 2;
891 
892 	/*
893 	 * Find the 'base memory' segment for SMP
894 	 */
895 	basemem = 0;
896 	for (i = 0; i <= physmap_idx; i += 2) {
897 		if (physmap[i] <= 0xA0000) {
898 			basemem = physmap[i + 1] / 1024;
899 			break;
900 		}
901 	}
902 	if (basemem == 0 || basemem > 640) {
903 		if (bootverbose)
904 			printf(
905 		"Memory map doesn't contain a basemem segment, faking it");
906 		basemem = 640;
907 	}
908 
909 	/*
910 	 * Maxmem isn't the "maximum memory", it's one larger than the
911 	 * highest page of the physical address space.  It should be
912 	 * called something like "Maxphyspage".  We may adjust this
913 	 * based on ``hw.physmem'' and the results of the memory test.
914 	 */
915 	Maxmem = atop(physmap[physmap_idx + 1]);
916 
917 #ifdef MAXMEM
918 	Maxmem = MAXMEM / 4;
919 #endif
920 
921 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
922 		Maxmem = atop(physmem_tunable);
923 
924 	/*
925 	 * The boot memory test is disabled by default, as it takes a
926 	 * significant amount of time on large-memory systems, and is
927 	 * unfriendly to virtual machines as it unnecessarily touches all
928 	 * pages.
929 	 *
930 	 * A general name is used as the code may be extended to support
931 	 * additional tests beyond the current "page present" test.
932 	 */
933 	memtest = 0;
934 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
935 
936 	/*
937 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
938 	 * in the system.
939 	 */
940 	if (Maxmem > atop(physmap[physmap_idx + 1]))
941 		Maxmem = atop(physmap[physmap_idx + 1]);
942 
943 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
944 	    (boothowto & RB_VERBOSE))
945 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
946 
947 	/* call pmap initialization to make new kernel address space */
948 	pmap_bootstrap(&first);
949 
950 	/*
951 	 * Size up each available chunk of physical memory.
952 	 *
953 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
954 	 * By default, mask off the first 16 pages unless we appear to be
955 	 * running in a VM.
956 	 */
957 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
958 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
959 	if (physmap[0] < physmem_start) {
960 		if (physmem_start < PAGE_SIZE)
961 			physmap[0] = PAGE_SIZE;
962 		else if (physmem_start >= physmap[1])
963 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
964 		else
965 			physmap[0] = round_page(physmem_start);
966 	}
967 	pa_indx = 0;
968 	da_indx = 1;
969 	phys_avail[pa_indx++] = physmap[0];
970 	phys_avail[pa_indx] = physmap[0];
971 	dump_avail[da_indx] = physmap[0];
972 	pte = CMAP1;
973 
974 	/*
975 	 * Get dcons buffer address
976 	 */
977 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
978 	    getenv_quad("dcons.size", &dcons_size) == 0)
979 		dcons_addr = 0;
980 
981 	/*
982 	 * physmap is in bytes, so when converting to page boundaries,
983 	 * round up the start address and round down the end address.
984 	 */
985 	page_counter = 0;
986 	if (memtest != 0)
987 		printf("Testing system memory");
988 	for (i = 0; i <= physmap_idx; i += 2) {
989 		vm_paddr_t end;
990 
991 		end = ptoa((vm_paddr_t)Maxmem);
992 		if (physmap[i + 1] < end)
993 			end = trunc_page(physmap[i + 1]);
994 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
995 			int *ptr = (int *)CADDR1;
996 			int tmp;
997 			bool full, page_bad;
998 
999 			full = false;
1000 			/*
1001 			 * block out kernel memory as not available.
1002 			 */
1003 			if (pa >= (vm_paddr_t)kernphys && pa < first)
1004 				goto do_dump_avail;
1005 
1006 			/*
1007 			 * block out dcons buffer
1008 			 */
1009 			if (dcons_addr > 0
1010 			    && pa >= trunc_page(dcons_addr)
1011 			    && pa < dcons_addr + dcons_size)
1012 				goto do_dump_avail;
1013 
1014 			page_bad = false;
1015 			if (memtest == 0)
1016 				goto skip_memtest;
1017 
1018 			/*
1019 			 * Print a "." every GB to show we're making
1020 			 * progress.
1021 			 */
1022 			page_counter++;
1023 			if ((page_counter % PAGES_PER_GB) == 0)
1024 				printf(".");
1025 
1026 			/*
1027 			 * map page into kernel: valid, read/write,non-cacheable
1028 			 */
1029 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1030 			invltlb();
1031 
1032 			tmp = *(int *)ptr;
1033 			/*
1034 			 * Test for alternating 1's and 0's
1035 			 */
1036 			*(volatile int *)ptr = 0xaaaaaaaa;
1037 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1038 				page_bad = true;
1039 			/*
1040 			 * Test for alternating 0's and 1's
1041 			 */
1042 			*(volatile int *)ptr = 0x55555555;
1043 			if (*(volatile int *)ptr != 0x55555555)
1044 				page_bad = true;
1045 			/*
1046 			 * Test for all 1's
1047 			 */
1048 			*(volatile int *)ptr = 0xffffffff;
1049 			if (*(volatile int *)ptr != 0xffffffff)
1050 				page_bad = true;
1051 			/*
1052 			 * Test for all 0's
1053 			 */
1054 			*(volatile int *)ptr = 0x0;
1055 			if (*(volatile int *)ptr != 0x0)
1056 				page_bad = true;
1057 			/*
1058 			 * Restore original value.
1059 			 */
1060 			*(int *)ptr = tmp;
1061 
1062 skip_memtest:
1063 			/*
1064 			 * Adjust array of valid/good pages.
1065 			 */
1066 			if (page_bad == true)
1067 				continue;
1068 			/*
1069 			 * If this good page is a continuation of the
1070 			 * previous set of good pages, then just increase
1071 			 * the end pointer. Otherwise start a new chunk.
1072 			 * Note that "end" points one higher than end,
1073 			 * making the range >= start and < end.
1074 			 * If we're also doing a speculative memory
1075 			 * test and we at or past the end, bump up Maxmem
1076 			 * so that we keep going. The first bad page
1077 			 * will terminate the loop.
1078 			 */
1079 			if (phys_avail[pa_indx] == pa) {
1080 				phys_avail[pa_indx] += PAGE_SIZE;
1081 			} else {
1082 				pa_indx++;
1083 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1084 					printf(
1085 		"Too many holes in the physical address space, giving up\n");
1086 					pa_indx--;
1087 					full = true;
1088 					goto do_dump_avail;
1089 				}
1090 				phys_avail[pa_indx++] = pa;	/* start */
1091 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1092 			}
1093 			physmem++;
1094 do_dump_avail:
1095 			if (dump_avail[da_indx] == pa) {
1096 				dump_avail[da_indx] += PAGE_SIZE;
1097 			} else {
1098 				da_indx++;
1099 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1100 					da_indx--;
1101 					goto do_next;
1102 				}
1103 				dump_avail[da_indx++] = pa; /* start */
1104 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1105 			}
1106 do_next:
1107 			if (full)
1108 				break;
1109 		}
1110 	}
1111 	*pte = 0;
1112 	invltlb();
1113 	if (memtest != 0)
1114 		printf("\n");
1115 
1116 	/*
1117 	 * XXX
1118 	 * The last chunk must contain at least one page plus the message
1119 	 * buffer to avoid complicating other code (message buffer address
1120 	 * calculation, etc.).
1121 	 */
1122 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1123 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1124 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1125 		phys_avail[pa_indx--] = 0;
1126 		phys_avail[pa_indx--] = 0;
1127 	}
1128 
1129 	Maxmem = atop(phys_avail[pa_indx]);
1130 
1131 	/* Trim off space for the message buffer. */
1132 	phys_avail[pa_indx] -= round_page(msgbufsize);
1133 
1134 	/* Map the message buffer. */
1135 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1136 	TSEXIT();
1137 }
1138 
/*
 * Parse the module metadata handed over by the native (non-EFI-aware
 * path) boot loader.  Relocates the metadata pointers into KVA,
 * initializes preload_kmdp, and extracts boothowto, the static kernel
 * environment, the debugger symbol table bounds (DDB only), and the
 * firmware (EFI system table) handle.
 *
 * modulep: physical address of the preloaded module metadata.
 */
static void
native_parse_preload_data(u_int64_t modulep)
{
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	/* Loader hands us physical addresses; bias them into KVA. */
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	preload_initkmdp(true);
	boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
	    vm_paddr_t);
}
1164 
/*
 * Default early clock initialization: program the i8254 PIT so that
 * DELAY() works before the timecounter subsystem comes up.
 */
static void
native_clock_source_init(void)
{
	i8254_init();
}
1170 
/*
 * Initialize the kernel debugger backends; if the loader requested the
 * debugger via boot flags (RB_KDB, "boot -d"), enter it immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
1180 
/*
 * Set up the fast syscall stuff: enable SYSCALL/SYSRET and program the
 * entry points, segment selectors and rflags mask used on kernel entry.
 */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Turn on the SYSCALL/SYSRET instructions. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/* 64-bit syscall entry; use the PTI trampoline variant if enabled. */
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* 32-bit compat syscall entry. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/* Kernel and 32-bit user selectors loaded by syscall/sysret. */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* rflags bits cleared by the CPU on syscall entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}
1197 
/*
 * First stage of BSP per-CPU data setup, run once this CPU's GDT is in
 * place: wire the pcpu area to thread0 and point the TSS/LDT and the
 * 32-bit %fs/%gs descriptor slots at this CPU's GDT entries.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	/* Start with no PTI restriction on the user page-table mask. */
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}
1214 
/*
 * Second stage of BSP per-CPU setup, once thread0's kernel stack and
 * PCB exist: record the ring-0 stack pointer, the PTI trampoline stack
 * pointer, and the current PCB.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	/* PTI entry stack: top of pc_pti_stack, aligned to 16 bytes. */
	PCPU_SET(pti_rsp0, STACKALIGN((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)));
	PCPU_SET(curpcb, thread0.td_pcb);
}
1224 
1225 void
amd64_bsp_ist_init(struct pcpu * pc)1226 amd64_bsp_ist_init(struct pcpu *pc)
1227 {
1228 	struct nmi_pcpu *np;
1229 	struct amd64tss *tssp;
1230 
1231 	tssp = &pc->pc_common_tss;
1232 
1233 	/* doublefault stack space, runs on ist1 */
1234 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1235 	np->np_pcpu = (register_t)pc;
1236 	tssp->tss_ist1 = (long)np;
1237 
1238 	/*
1239 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1240 	 * above the start of the ist2 stack.
1241 	 */
1242 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1243 	np->np_pcpu = (register_t)pc;
1244 	tssp->tss_ist2 = (long)np;
1245 
1246 	/*
1247 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1248 	 * above the start of the ist3 stack.
1249 	 */
1250 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1251 	np->np_pcpu = (register_t)pc;
1252 	tssp->tss_ist3 = (long)np;
1253 
1254 	/*
1255 	 * DB# stack, runs on ist4.
1256 	 */
1257 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1258 	np->np_pcpu = (register_t)pc;
1259 	tssp->tss_ist4 = (long)np;
1260 }
1261 
1262 /*
1263  * Calculate the kernel load address by inspecting page table created by loader.
1264  * The assumptions:
1265  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1266  *   aligned at 2M, below 4G (the latter is important for AP startup)
1267  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1268  * - kernel is mapped with 2M superpages
1269  * - all participating memory, i.e. kernel, modules, metadata,
1270  *   page table is accessible by pre-created 1:1 mapping
1271  *   (right now loader creates 1:1 mapping for lower 4G, and all
1272  *   memory is from there)
1273  * - there is a usable memory block right after the end of the
1274  *   mapped kernel and all modules/metadata, pointed to by
1275  *   physfree, for early allocations
1276  */
vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	uint64_t cr3;

	/*
	 * Walk the loader-built page table by hand.  %cr3 and the
	 * intermediate entries hold physical addresses, which are
	 * dereferenceable here through the loader's 1:1 mapping of low
	 * memory (see the assumptions above).
	 */
	cr3 = rcr3();
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
	/* KERNSTART is mapped by a 2M superpage, so its PDE frame is the load address. */
	return (*pde & PG_FRAME);
}
1291 
/*
 * hammer_time() is the machine-dependent early bootstrap entry for
 * amd64, called from locore.  modulep is the physical address of the
 * loader's module metadata; physfree is the first free address after
 * the kernel, relative to the kernel load address on entry.  It brings
 * the BSP up far enough for the MI startup code: CPU identification,
 * GDT/IDT/TSS, per-CPU data, console/debugger, and memory sizing.
 * Returns the address of thread0's kernel stack for locore to switch
 * onto.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kernphys = amd64_loadaddr();

	/* Convert the kernel-relative physfree into a physical address. */
	physfree += kernphys;

	/* Initializes preload_kmdp */
	init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	/* CPU microcode update may consume early physical memory. */
	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_hypervisor_smbios();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) == 0)
		pmap_pcid_enabled = 0;
	invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0;

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
		use_xsave = 1;
		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
	}

	sched_instance_select();

	link_elf_ireloc();

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of early physical memory. */
	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments; the system (TSS/LDT) descriptors
	 * are skipped here and filled in separately below.
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/* Speculative execution mitigation knobs (old and new names). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
	    &x86_rngds_mitg_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
	    &zenbleed_enable);
	zenbleed_sanitize_enable();

	finishidentcpu();	/* Final stage of CPU initialization */

	invlpgb_works = (amd_extended_feature_extensions &
	    AMDFEID_INVLPGB) != 0;
	TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
	if (invlpgb_works)
		invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 = STACKALIGN(rsp0);
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}
1626 
/*
 * MD hook called from pcpu_init(): mark the ACPI CPU id as unknown
 * (0xffffffff) until it is discovered later in boot.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
1633 
1634 static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)1635 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1636 {
1637 	struct bios_smap *smapbase;
1638 	struct bios_smap_xattr smap;
1639 	uint32_t *smapattr;
1640 	int count, error, i;
1641 
1642 	/* Retrieve the system memory map from the loader. */
1643 	smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
1644 	    MODINFO_METADATA | MODINFOMD_SMAP);
1645 	if (smapbase == NULL)
1646 		return (0);
1647 	smapattr = (uint32_t *)preload_search_info(preload_kmdp,
1648 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1649 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1650 	error = 0;
1651 	for (i = 0; i < count; i++) {
1652 		smap.base = smapbase[i].base;
1653 		smap.length = smapbase[i].length;
1654 		smap.type = smapbase[i].type;
1655 		if (smapattr != NULL)
1656 			smap.xattr = smapattr[i];
1657 		else
1658 			smap.xattr = 0;
1659 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1660 	}
1661 	return (error);
1662 }
1663 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1664     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1665     smap_sysctl_handler, "S,bios_smap_xattr",
1666     "Raw BIOS SMAP data");
1667 
1668 static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)1669 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1670 {
1671 	struct efi_map_header *efihdr;
1672 	uint32_t efisize;
1673 
1674 	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
1675 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1676 	if (efihdr == NULL)
1677 		return (0);
1678 	efisize = *((uint32_t *)efihdr - 1);
1679 	return (SYSCTL_OUT(req, efihdr, efisize));
1680 }
1681 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1682     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1683     efi_map_sysctl_handler, "S,efi_map_header",
1684     "Raw EFI Memory Map");
1685 
1686 static int
efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)1687 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1688 {
1689 	char *arch;
1690 
1691 	arch = (char *)preload_search_info(preload_kmdp,
1692 	    MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1693 	if (arch == NULL)
1694 		return (0);
1695 
1696 	return (SYSCTL_OUT_STR(req, arch));
1697 }
1698 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1699     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1700     efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1701 
/*
 * Enter a spinlock section on this CPU.  On the outermost acquisition,
 * disable interrupts (saving the previous state for spinlock_exit())
 * and enter a critical section; nested acquisitions just bump the
 * per-thread counter.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		/* Disable interrupts before publishing the nonzero count. */
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}
1717 
/*
 * Leave a spinlock section.  On the outermost release, exit the
 * critical section and restore the interrupt state saved by the
 * matching spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	/* Fetch the saved flags before the count can reach zero. */
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}
1732 
1733 /*
1734  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1735  * we want to start a backtrace from the function that caused us to enter
1736  * the debugger. We have the context in the trapframe, but base the trace
1737  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1738  * enough for a backtrace.
1739  */
1740 void
makectx(struct trapframe * tf,struct pcb * pcb)1741 makectx(struct trapframe *tf, struct pcb *pcb)
1742 {
1743 
1744 	pcb->pcb_r12 = tf->tf_r12;
1745 	pcb->pcb_r13 = tf->tf_r13;
1746 	pcb->pcb_r14 = tf->tf_r14;
1747 	pcb->pcb_r15 = tf->tf_r15;
1748 	pcb->pcb_rbp = tf->tf_rbp;
1749 	pcb->pcb_rbx = tf->tf_rbx;
1750 	pcb->pcb_rip = tf->tf_rip;
1751 	pcb->pcb_rsp = tf->tf_rsp;
1752 }
1753 
1754 /*
1755  * The pcb_flags is only modified by current thread, or by other threads
1756  * when current thread is stopped.  However, current thread may change it
1757  * from the interrupt context in cpu_switch(), or in the trap handler.
1758  * When we read-modify-write pcb_flags from C sources, compiler may generate
1759  * code that is not atomic regarding the interrupt handler.  If a trap or
1760  * interrupt happens and any flag is modified from the handler, it can be
1761  * clobbered with the cached value later.  Therefore, we implement setting
1762  * and clearing flags with single-instruction functions, which do not race
1763  * with possible modification of the flags from the trap or interrupt context,
1764  * because traps and interrupts are executed only on instruction boundary.
1765  */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	/*
	 * Single-instruction read-modify-write of pcb_flags; cannot be
	 * torn by a trap or interrupt (see comment above).
	 */
	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}
1775 
1776 /*
1777  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1778  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
1779  * pcb if user space modified the bases.  We must save on the context
1780  * switch or if the return to usermode happens through the doreti.
1781  *
1782  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1783  * which have a consequence that the base MSRs must be saved each time
1784  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1785  * context switches.
1786  */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	/*
	 * On the first transition of the current pcb to PCB_FULL_IRET,
	 * snapshot the fs/gs base MSRs into the pcb before the flag is
	 * set.  Interrupts are disabled across the re-check and the MSR
	 * reads so a context switch cannot intervene.
	 */
	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			pcb->pcb_fsbase = rdfsbase();
			pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}
1806 
/*
 * Resolve set_pcb_flags at boot: when the CPU supports FSGSBASE, user
 * space can alter the segment bases directly, so the variant that
 * snapshots them on PCB_FULL_IRET must be used.
 */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}
1813 
/*
 * Clear bits in pcb_flags with a single instruction, for the same
 * trap/interrupt atomicity reasons as set_pcb_flags_raw() above.
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
1822 
1823 extern const char wrmsr_early_safe_gp_handler[];
1824 static struct region_descriptor wrmsr_early_safe_orig_efi_idt;
1825 
1826 void
wrmsr_early_safe_start(void)1827 wrmsr_early_safe_start(void)
1828 {
1829 	struct region_descriptor efi_idt;
1830 	struct gate_descriptor *gpf_descr;
1831 
1832 	sidt(&wrmsr_early_safe_orig_efi_idt);
1833 	efi_idt.rd_limit = 32 * sizeof(idt0[0]);
1834 	efi_idt.rd_base = (uintptr_t)idt0;
1835 	lidt(&efi_idt);
1836 
1837 	gpf_descr = &idt0[IDT_GP];
1838 	gpf_descr->gd_looffset = (uintptr_t)wrmsr_early_safe_gp_handler;
1839 	gpf_descr->gd_hioffset = (uintptr_t)wrmsr_early_safe_gp_handler >> 16;
1840 	gpf_descr->gd_selector = rcs();
1841 	gpf_descr->gd_type = SDT_SYSTGT;
1842 	gpf_descr->gd_p = 1;
1843 }
1844 
/*
 * Undo wrmsr_early_safe_start(): reload the saved IDT and scrub the
 * temporary #GP gate out of idt0 so normal trap setup starts clean.
 */
void
wrmsr_early_safe_end(void)
{
	struct gate_descriptor *gpf_descr;

	lidt(&wrmsr_early_safe_orig_efi_idt);

	gpf_descr = &idt0[IDT_GP];
	memset(gpf_descr, 0, sizeof(*gpf_descr));
}
1855 
1856 #ifdef KDB
1857 
1858 /*
1859  * Provide inb() and outb() as functions.  They are normally only available as
1860  * inline functions, thus cannot be called from the debugger.
1861  */
1862 
1863 /* silence compiler warnings */
1864 u_char inb_(u_short);
1865 void outb_(u_short, u_char);
1866 
1867 u_char
inb_(u_short port)1868 inb_(u_short port)
1869 {
1870 	return inb(port);
1871 }
1872 
void
outb_(u_short port, u_char data)
{
	/* Out-of-line wrapper so the debugger can invoke port output. */
	outb(port, data);
}
1878 
1879 #endif /* KDB */
1880 
1881 #undef memset
1882 #undef memmove
1883 #undef memcpy
1884 
1885 void	*memset_std(void *buf, int c, size_t len);
1886 void	*memset_erms(void *buf, int c, size_t len);
1887 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1888 	    size_t len);
1889 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1890 	    size_t len);
1891 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1892 	    size_t len);
1893 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1894 	    size_t len);
1895 
#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN, so fall back to
 * plain functions that always call the _std implementations.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
/*
 * Boot-time ifunc resolution: prefer the ERMS ("rep movsb"/"rep
 * stosb") implementations when the CPU advertises the feature.
 */
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif
1943 
1944 void	pagezero_std(void *addr);
1945 void	pagezero_erms(void *addr);
/* Same ERMS-based selection for the page-zeroing helper. */
DEFINE_IFUNC(, void , pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}
1952